Navigate to directory that houses datasets. In terminal, paste the following lines to import the datasets:  
`mongoimport --db='influencers' --collection='instagram' --file=instagram_data_all-countries.csv  --drop --type=csv --headerline`  
`mongoimport --db='influencers' --collection='threads' --file=threads_data_all-countries.csv  --drop --type=csv --headerline`  
`mongoimport --db='influencers' --collection='tiktok' --file=tiktok_data_all-countries.csv  --drop --type=csv --headerline`  
`mongoimport --db='influencers' --collection='youtube' --file=youtube_data_all-countries.csv  --drop --type=csv --headerline`  

In [12]:
# import dependencies
from pymongo import MongoClient
import pandas as pd

In [13]:
# connect to Mongo
client = MongoClient(port=27017)  
db = client['influencers']

In [20]:
collections = ['instagram', 'threads', 'youtube', 'tiktok']

In [18]:
for collection_name in collections:
    
    collection = db[collection_name]
    
    
    cursor = collection.find()
    df = pd.DataFrame(list(cursor))

In [22]:
# create pandas dataframe from collections, add column social_platform
instagram = db['instagram'].find()
instagram_df = pd.DataFrame(list(instagram))
instagram_df['social_platform'] = 'Instagram'

threads = db['threads'].find()
threads_df = pd.DataFrame(list(threads))
threads_df['social_platform'] = 'Threads'

tiktok = db['tiktok'].find()
tiktok_df = pd.DataFrame(list(tiktok))
tiktok_df['social_platform'] = 'TikTok'

youtube = db['youtube'].find()
youtube_df = pd.DataFrame(list(youtube))
youtube_df['social_platform'] = 'YouTube'

# Check dataframe
instagram_df.head()

Unnamed: 0,_id,#,NAME,FOLLOWERS,ER,COUNTRY,TOPIC OF INFLUENCE,POTENTIAL REACH,social_platform
0,6615258d2104659b995ce9ff,1,Cristiano Ronaldo @cristiano,625.1M,0.01%,Portugal,Finance Soccer,187.5M,Instagram
1,6615258d2104659b995cea00,3,Selena Gomez @selenagomez,429.7M,1.09%,United States,Beauty and Self Care Entertainment and Music M...,128.9M,Instagram
2,6615258d2104659b995cea01,4,Kylie @kyliejenner,400.5M,-,United States,Beauty and Self Care Product Showcase Modeling...,120.1M,Instagram
3,6615258d2104659b995cea02,5,Dwayne Johnson @therock,397.8M,0.13%,United States,Entertainment and Music Actors Public Figure,119.3M,Instagram
4,6615258d2104659b995cea03,2,Leo Messi @leomessi,501.1M,0.01%,Argentina,Sports Soccer Activity General General Interest,150.3M,Instagram


In [27]:
# union dataframes
socials_df = pd.concat([instagram_df, threads_df, tiktok_df, youtube_df], ignore_index=True)

# check merge
socials_df.head()

Unnamed: 0,_id,#,NAME,FOLLOWERS,ER,COUNTRY,TOPIC OF INFLUENCE,POTENTIAL REACH,social_platform
395,661525a2ae9afb9f5986ba10,97,La Granja de Zenón @UCwpcLKMwiuPg4aqImpGk6Ew,32.3M,-,Argentina,,9.7M,YouTube
396,661525a2ae9afb9f5986ba11,95,Sony PAL @UCw7xjxzbMwgBSmbeYwqYRMg,32.6M,-,,,9.8M,YouTube
397,661525a2ae9afb9f5986ba12,98,Alfredo Larin @alfredolarin,32.3M,-,El Salvador,,9.7M,YouTube
398,661525a2ae9afb9f5986ba13,99,Mikecrack @UCqJ5zFEED1hWs0KNQCQuYdQ,32.2M,0.6%,Spain,,9.7M,YouTube
399,661525a2ae9afb9f5986ba14,100,Saregama Music @UC_A7K2dXFsTMAciGmnNxy-Q,32.1M,-,India,,9.6M,YouTube


In [None]:
# check merge
socials_df.tail()

In [33]:
# https://pandas.pydata.org/docs/reference/api/pandas.Series.str.replace.html
# regex_pat = r"(/[^0-9,.]+/g)"
# socials_df['FOLLOWERS'] = socials_df['FOLLOWERS'].str.replace(regex_pat, '', regex=True).astype(float)

# remove keep numbers only
# UP TO HERE - need to find correct regex pattern to get decimals
socials_df['FOLLOWERS'] = socials_df['FOLLOWERS'].str.extract('(\d+)').astype(float)
socials_df.head()

Unnamed: 0,_id,#,NAME,FOLLOWERS,ER,COUNTRY,TOPIC OF INFLUENCE,POTENTIAL REACH,social_platform
0,6615258d2104659b995ce9ff,1,Cristiano Ronaldo @cristiano,625.0,0.01%,Portugal,Finance Soccer,187.5M,Instagram
1,6615258d2104659b995cea00,3,Selena Gomez @selenagomez,429.0,1.09%,United States,Beauty and Self Care Entertainment and Music M...,128.9M,Instagram
2,6615258d2104659b995cea01,4,Kylie @kyliejenner,400.0,-,United States,Beauty and Self Care Product Showcase Modeling...,120.1M,Instagram
3,6615258d2104659b995cea02,5,Dwayne Johnson @therock,397.0,0.13%,United States,Entertainment and Music Actors Public Figure,119.3M,Instagram
4,6615258d2104659b995cea03,2,Leo Messi @leomessi,501.0,0.01%,Argentina,Sports Soccer Activity General General Interest,150.3M,Instagram


In [None]:
socials_df['POTENTIAL REACH'] = socials_df['POTENTIAL REACH'].str.replace('M', '').astype(float)
socials_df.head()