# File Consolidation

In [3]:
# Import libraries

import pandas as pd
import numpy as np
import os
import glob
import json

from tqdm import tqdm

from collections import defaultdict

In [10]:
# Navigate to file folder and retrieve relevant files
all_files = glob.glob(r"C:\Users\jawo19ad\Dropbox (CBS)\Master thesis data\Twitter Fetch\*.txt")

In [11]:
def load_full_json(file):
    
    # Create empty dictionary where values are stored
    json_complete = {"data":list(),"users":list()}
    
    # Open the json file
    with open(file, "r") as json_file:
        
        # Iterate through json file and store information at appropriate dict key
        for line in json_file:
            data = json.loads(line)
            json_complete["data"].extend(data["data"])
            json_complete["users"].extend(data["includes"]["users"])
    
    return json_complete

In [13]:
json_complete = {"data":list(),"users":list()}
tweets_complete = {"data":list()}
users_complete = {"users":list()}


for file in tqdm(all_files):
    json_file = load_full_json(file)
    #print(len(json_file["data"]), len(json_file["users"]))
    tweets_complete["data"].extend(json_file["data"])
    users_complete["users"].extend(json_file["users"])

100%|██████████| 19/19 [07:05<00:00, 22.38s/it]


In [15]:
len(tweets_complete["data"])

6813504

In [16]:
len(users_complete["users"])

6085289

In [17]:
hashtag_count = get_hashtag_counts(tweets_complete)
sort_hash_count = sorted(hashtag_count.items(), key=lambda x: x[1], reverse=True)

In [18]:
sort_hash_count[:20]

[('TigrayGenocide', 71317),
 ('Tigray', 60549),
 ('refugees', 45173),
 ('COVID19', 45094),
 ('S386', 35644),
 ('StopWarOnTigray', 33039),
 ('Refugees', 29517),
 ('migrants', 26884),
 ('Ethiopia', 25452),
 ('Iraqi_Refugees_Look_For_Homeland', 20851),
 ('immigrants', 20432),
 ('DV2021', 20023),
 ('DV2020', 18555),
 ('refugee', 18541),
 ('WorldRefugeeDay', 17272),
 ('UnblockS386', 17186),
 ('RefugeesWelcome', 13484),
 ('PassS386', 13478),
 ('Eritrea', 13187),
 ('Immigration', 13007)]

In [21]:
# Store tweets file
with open(r'C:\Users\jawo19ad\Dropbox (CBS)\Master thesis data\all_tweets_072020_022021.json', 'w') as f:
    json.dump(tweets_complete, f)

In [22]:
# Store users file
with open(r'C:\Users\jawo19ad\Dropbox (CBS)\Master thesis data\all_users_072020_022021.json', 'w') as f:
    json.dump(users_complete, f)

# Creation of df_user

In [6]:
# Load the json file containing the users
with open(r'C:\Users\jawo19ad\Dropbox (CBS)\Master thesis data\all_users_072020_022021.json', 'r') as f:
    data = json.load(f)

In [9]:
# Convert json file into pandas dataframe
df_users = pd.json_normalize(data["users"])
df_users.head()

Unnamed: 0,description,username,verified,name,created_at,id,url,public_metrics.followers_count,public_metrics.following_count,public_metrics.tweet_count,public_metrics.listed_count,entities.description.hashtags,location,entities.url.urls,entities.description.urls,entities.description.mentions,entities.description.cashtags,withheld.country_codes
0,PhD Candidate at Engineering,GaKahsay,False,Kahsay GA,2020-11-13T15:55:37.000Z,1327278886380515328,,817,1215,4427,0,,,,,,,
1,#STOPWARONTIGRAY,sayumek2020,False,Selam,2020-11-04T08:23:06.000Z,1323903491044188161,,43,156,3531,0,"[{'start': 0, 'end': 16, 'tag': 'STOPWARONTIGR...",,,,,,
2,ማርያም ፅዮን ምስ ወዳ ፅላል ትግራይ ትኩነልና \n#TigrayGenocid...,AbbyH06586436,False,Abby |ዓይነይ-ትግራይ እንዲኺ|,2020-11-04T23:24:06.000Z,1324130252008816640,,54,232,1129,1,"[{'start': 31, 'end': 46, 'tag': 'TigrayGenoci...",New Zealand,,,,,
3,,MahfuzHussien,False,Mahfuz Hussien,2013-01-22T22:05:12.000Z,1112761003,,151,788,1455,1,,,,,,,
4,Journalists Standing up for human rights acros...,jonewales,False,William_Sea,2020-01-02T19:56:07.000Z,1212824799107375105,,315,1913,2710,3,,,,,,,


In [10]:
df_users.shape

(6085289, 18)

In [12]:
# Remove duplicate users
df_users_no_dups = df_users.drop_duplicates(subset="id")

In [13]:
df_users_no_dups.shape

(1928500, 18)

In [35]:
# Drop irrelevant columns
df_users_no_dups.drop(['entities.description.hashtags','entities.url.urls', 'entities.description.urls',
                       'entities.description.mentions', 'entities.description.cashtags', 'withheld.country_codes'],
                       axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [42]:
# Store file in csv format
df_users_no_dups.to_csv(r"C:\Users\jawo19ad\Dropbox (CBS)\Master thesis data\df_users.csv")

# Creation of df_tweets

In [16]:
# Load the json file containing the tweets
with open(r'C:\Users\jawo19ad\Dropbox (CBS)\Master thesis data\all_tweets_072020_022021.json', 'r') as f:
    data = json.load(f)

In [53]:
# Convert json file into pandas dataframe
df_tweets = pd.json_normalize(data["data"])
df_tweets.head()

Unnamed: 0,source,text,lang,id,created_at,author_id,entities.mentions,entities.hashtags,entities.urls,public_metrics.retweet_count,...,public_metrics.quote_count,entities.annotations,in_reply_to_user_id,referenced_tweets,geo.place_id,geo.coordinates.type,geo.coordinates.coordinates,withheld.copyright,withheld.country_codes,entities.cashtags
0,Twitter for Android,There are too many reliable reports and first-...,en,1350390669043499013,2021-01-16T10:33:19.000Z,1327278886380515328,"[{'start': 178, 'end': 187, 'username': 'JoeBi...","[{'start': 164, 'end': 172, 'tag': 'Eritrea'},...","[{'start': 250, 'end': 273, 'url': 'https://t....",1,...,0,,,,,,,,,
1,Twitter Web App,"Despite repeated requests, @Refugees and other...",en,1350390657576300544,2021-01-16T10:33:16.000Z,1323903491044188161,"[{'start': 27, 'end': 36, 'username': 'Refugee...","[{'start': 166, 'end': 173, 'tag': 'Tigray'}, ...","[{'start': 252, 'end': 275, 'url': 'https://t....",0,...,0,"[{'start': 101, 'end': 108, 'probability': 0.3...",,,,,,,,
2,Twitter for Android,"“When the air bombing and the attacks began, I...",en,1350390643986599937,2021-01-16T10:33:13.000Z,1324130252008816640,"[{'start': 190, 'end': 199, 'username': 'JoeBi...","[{'start': 231, 'end': 247, 'tag': 'BidenTakeA...","[{'start': 248, 'end': 271, 'url': 'https://t....",0,...,0,,,,,,,,,
3,Twitter Web App,"""ongoing insecurity &amp; allegations of grave...",en,1350390618695020546,2021-01-16T10:33:07.000Z,1112761003,"[{'start': 144, 'end': 147, 'username': 'UN'},...","[{'start': 200, 'end': 215, 'tag': 'TigrayGeno...","[{'start': 246, 'end': 269, 'url': 'https://t....",0,...,0,"[{'start': 127, 'end': 133, 'probability': 0.8...",,,,,,,,
4,Twitter for iPhone,"MOTHER \n\nMother, I do not cry who cries, \nm...",en,1350390607928295424,2021-01-16T10:33:04.000Z,1212824799107375105,,,,0,...,0,,1.212824799107375e+18,"[{'type': 'replied_to', 'id': '135038931103271...",,,,,,


In [83]:
df_hashtag_mentions = pd.DataFrame(columns=["id","hashtags","mentions"])
for tweet in tqdm(data["data"]):
    
    tweet_id = tweet["id"]
    
    try:
        hashtags = [hashtag["tag"] for hashtag in tweet["entities"]["hashtags"]]
    except KeyError:
        hashtags = None
    
    try:
        mentions = [mention["username"] for mention in tweet["entities"]["mentions"]]
    except KeyError:
        mentions = None
    
    hash_men = pd.DataFrame([[tweet_id,hashtags,mentions]], columns=["id","hashtags","mentions"])
    df_hashtag_mentions = df_hashtag_mentions.append(hash_men, ignore_index=True)

  1%|          | 48652/6813504 [03:50<8:53:04, 211.50it/s] 


KeyboardInterrupt: 

In [81]:
df_hashtag_mentions

Unnamed: 0,id,hashtags,mentions
0,1350390669043499013,"[Eritrea, BidenTakeAction, StopWarOnTigray, Ti...",[JoeBiden]


In [71]:
data["data"][10]["entities"]

{'annotations': [{'start': 26,
   'end': 35,
   'probability': 0.9377,
   'type': 'Place',
   'normalized_text': 'West Papua'}],
 'urls': [{'start': 125,
   'end': 148,
   'url': 'https://t.co/m5bVAnzhdG',
   'expanded_url': 'https://www.lithgowmercury.com.au/story/5805532/west-papua-refugees-share-their-stories-at-lithgow-forum/',
   'display_url': 'lithgowmercury.com.au/story/5805532/…'}]}

In [40]:
df_tweets.drop(['entities.mentions', 'entities.hashtags', 'entities.urls','entities.annotations',
                'in_reply_to_user_id', 'referenced_tweets', 'geo.place_id', 'geo.coordinates.type',
                'geo.coordinates.coordinates', 'withheld.copyright', 'withheld.country_codes', 'entities.cashtags'],
                axis=1, inplace=True)

In [43]:
df_tweets.to_csv(r"C:\Users\jawo19ad\Dropbox (CBS)\Master thesis data\df_tweets.csv")

In [48]:
data["data"][0]["entities"]

{'mentions': [{'start': 178, 'end': 187, 'username': 'JoeBiden'}],
 'hashtags': [{'start': 164, 'end': 172, 'tag': 'Eritrea'},
  {'start': 199, 'end': 215, 'tag': 'BidenTakeAction'},
  {'start': 217, 'end': 233, 'tag': 'StopWarOnTigray'},
  {'start': 234, 'end': 249, 'tag': 'TigrayGenocide'}],
 'urls': [{'start': 250,
   'end': 273,
   'url': 'https://t.co/mVNK7zEicI',
   'expanded_url': 'https://www.reuters.com/article/us-ethiopia-conflict-un-idUSKBN29J2N6?taid=6000c010933c880001bc1036&utm_campaign=trueAnthem:+Trending+Content&utm_medium=trueAnthem&utm_source=twitter',
   'display_url': 'reuters.com/article/us-eth…'}]}

In [52]:
pd.json_normalize(data["data"][0]["entities"])

Unnamed: 0,mentions,hashtags,urls
0,"[{'start': 178, 'end': 187, 'username': 'JoeBi...","[{'start': 164, 'end': 172, 'tag': 'Eritrea'},...","[{'start': 250, 'end': 273, 'url': 'https://t...."


In [50]:
?pd.json_normalize