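# Handle to Name

Replace Twitter @handles in tweet text with the display names of the matching accounts, and count which handles are mentioned most often.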

In [54]:
import os
import re
from collections import defaultdict

import pandas as pd

In [13]:
# Root folder of the data. Set the THESIS_DATA_DIR environment variable to
# point the notebook at another machine's copy; without it the original
# location is used, so existing setups keep working unchanged.
FILE_PATH = os.environ.get("THESIS_DATA_DIR", "/Users/Janiwo/Dropbox/Master thesis data")
# CSV that is loaded as the tweets table
MORIA_PATH = FILE_PATH + "/Event Dataframes/df_moria.csv"
# CSV that is loaded as the users table
USERS_PATH = FILE_PATH + "/df_users.csv"

In [14]:
# Read the tweets csv

def _parse_list_column(raw):
    """Turn a stringified list such as "['a', 'b']" into ['a', 'b'].

    The cell text is stripped of its surrounding brackets and of all single
    quotes, then split on ", ". Cells that hold "[]" (or nothing at all)
    yield an empty list instead of a list containing one empty string.
    """
    items = raw.strip("[]").replace("'", "").split(", ")
    return [item for item in items if item]


# The three list-like columns are stored as text in the CSV, so each of them
# is parsed back into a Python list while reading.
df_tweets = pd.read_csv(
    MORIA_PATH,
    converters={column: _parse_list_column
                for column in ("hashtags", "mentions", "annotations")},
)

# Drop unnecessary index column (re-assigning instead of mutating in place
# keeps the cell safe to re-run)
df_tweets = df_tweets.drop(columns="Unnamed: 0")

df_tweets.head()

Unnamed: 0,source,text,lang,id,created_at,author_id,retweet_count,reply_count,like_count,quote_count,...,text_clean,year,calendar_week,year_month,year_calendar_week,refugee,migrant,immigrant,asylum_seeker,other
0,Twitter Web App,"Canada's immigrant population is 20%, USA is 1...",en,1267244723103690753,2020-06-01 00:01:00+00:00,442949745,1,1,9,1,...,"canada's immigrant population is 20%, usa is 1...",2020,22,2020_6,2020_22,False,False,True,False,False
1,Twitter Web App,Hi @EUHomeAffairs @Place_Beauvau @BMI_Bund @uk...,en,1267247183725621248,2020-06-01 00:10:47+00:00,211570886,0,0,0,0,...,hi euhomeaffairs place_beauvau bmi_bund ukhome...,2020,22,2020_6,2020_22,True,False,False,False,False
2,Twitter Web App,"#Greece Dozens of Asylum seekers, who face the...",en,1267251185838407681,2020-06-01 00:26:41+00:00,119888012,12,2,11,0,...,"greece dozens of asylum seekers, who face the ...",2020,22,2020_6,2020_22,False,False,False,False,False
3,Twitter Web App,"Hmmm? Maybe not, Spain is the COVID-19 hot spo...",en,1267260557213806599,2020-06-01 01:03:56+00:00,4839872717,0,0,0,0,...,"hmmm? maybe not, spain is the covid-19 hot spo...",2020,22,2020_6,2020_22,False,False,True,False,False
4,Twitter Web App,"Greece to evict over 10,000 refugees from shel...",en,1267264681108025346,2020-06-01 01:20:19+00:00,1171990967001526272,0,0,0,0,...,"greece to evict over 10,000 refugees from shel...",2020,22,2020_6,2020_22,True,False,False,False,False


In [15]:
# Load the users csv and discard the index column that was written into it
df_users = pd.read_csv(USERS_PATH).drop(columns="Unnamed: 0")

df_users.head()

Unnamed: 0,description,username,verified,name,created_at,id,url,public_metrics.followers_count,public_metrics.following_count,public_metrics.tweet_count,public_metrics.listed_count,location
0,PhD Candidate at Engineering,GaKahsay,False,Kahsay GA,2020-11-13T15:55:37.000Z,1327278886380515328,,817,1215,4427,0,
1,#STOPWARONTIGRAY,sayumek2020,False,Selam,2020-11-04T08:23:06.000Z,1323903491044188161,,43,156,3531,0,
2,ማርያም ፅዮን ምስ ወዳ ፅላል ትግራይ ትኩነልና \n#TigrayGenocid...,AbbyH06586436,False,Abby |ዓይነይ-ትግራይ እንዲኺ|,2020-11-04T23:24:06.000Z,1324130252008816640,,54,232,1129,1,New Zealand
3,,MahfuzHussien,False,Mahfuz Hussien,2013-01-22T22:05:12.000Z,1112761003,,151,788,1455,1,
4,Journalists Standing up for human rights acros...,jonewales,False,William_Sea,2020-01-02T19:56:07.000Z,1212824799107375105,,315,1913,2710,3,


In [36]:
# Look-up table from "@username" to the display name of that account.
# Built in two steps: first username -> name, then the "@" prefix is added
# to every key.
name_by_username = dict(zip(df_users["username"], df_users["name"]))
mapping = {f'@{username}': name for username, name in name_by_username.items()}

In [43]:
# A handle is "@" followed by letters, digits or underscores. The look-behind
# keeps things like "someone@example" from being treated as a mention.
_HANDLE_PATTERN = re.compile(r"(?<![A-Za-z0-9_])@[A-Za-z0-9_]+")


def resolve_username_to_name(text, handle_to_name=None):
    """Replace every known @handle in ``text`` with the account's name.

    Each handle is matched as a whole token, so a short handle is never
    substituted inside a longer one ("@bob" leaves "@bobby" alone), and
    handles next to punctuation or line breaks are recognised as well.

    Parameters
    ----------
    text : str
        The tweet text to process.
    handle_to_name : mapping, optional
        Look-up from "@handle" to display name. When omitted, the
        notebook-level ``mapping`` dictionary is used.

    Returns
    -------
    str
        ``text`` with every handle found in the look-up replaced by its
        name. Handles that are unknown, or whose name is not a string
        (for example a missing value), are left as they are. The look-up
        is case-sensitive.
    """
    lookup = mapping if handle_to_name is None else handle_to_name

    def _substitute(match):
        handle = match.group(0)
        name = lookup.get(handle)
        # Keep the handle when there is no usable name for it
        return name if isinstance(name, str) else handle

    # A callable replacement is inserted literally, so backslashes or
    # group references inside a display name cannot be misinterpreted.
    return _HANDLE_PATTERN.sub(_substitute, text)

In [47]:
# Show the text of the tweet at index label 2500
df_tweets.loc[2500, "text"]

'@Pog3ko1 @Phaedrus_L @Bella45078480 @shabir1441 @HenryJP5 In Greece 🇬🇷 if immigrants will do something like that they have to run 🏃\u200d♀️ out of the Country because they will find them dead 💀!!!!'

In [48]:
# Same tweet with its known handles replaced by names
resolve_username_to_name(df_tweets.loc[2500, "text"])

'🤗Pog3ko Phaedrus L. @Bella45078480 @shabir1441 Don Henrique da Holanda In Greece 🇬🇷 if immigrants will do something like that they have to run 🏃\u200d♀️ out of the Country because they will find them dead 💀!!!!'

In [49]:
def get_entity_count(series):
    """Count the occurrences of every non-empty entity across all rows.

    Parameters
    ----------
    series : iterable
        Each element is itself an iterable of entities (for example the
        list of mentions of one tweet).

    Returns
    -------
    defaultdict
        Maps each entity to the number of times it was seen. Entities of
        length zero are ignored.
    """
    tally = defaultdict(int)
    non_empty = (item for row in series for item in row if len(item) > 0)
    for item in non_empty:
        tally[item] += 1
    return tally

In [51]:
# List the columns of the tweets table
df_tweets.columns

Index(['source', 'text', 'lang', 'id', 'created_at', 'author_id',
       'retweet_count', 'reply_count', 'like_count', 'quote_count', 'hashtags',
       'mentions', 'annotations', 'text_clean', 'year', 'calendar_week',
       'year_month', 'year_calendar_week', 'refugee', 'migrant', 'immigrant',
       'asylum_seeker', 'other'],
      dtype='object')

In [70]:
# Count every mentioned handle, rank by frequency (highest first) and show
# the 25 most frequent ones
mention_count = get_entity_count(df_tweets["mentions"])
mention_count_sorted = sorted(mention_count.items(), key=lambda pair: -pair[1])
mention_count_sorted[:25]

[('euhomeaffairs', 3248),
 ('bmi_bund', 2627),
 ('ukhomeoffice', 2582),
 ('ministeriejenv', 2507),
 ('interieur_gouv', 1911),
 ('refugees', 1706),
 ('unhcrgreece', 1555),
 ('primeministergr', 1072),
 ('eu_commission', 1006),
 ('ylvajohansson', 883),
 ('nmitarakis', 841),
 ('easo', 771),
 ('kmitsotakis', 703),
 ('justitiedep', 668),
 ('refugees_eu', 652),
 ('unmigration', 650),
 ('vonderleyen', 574),
 ('pritipatel', 553),
 ('youtube', 531),
 ('realdonaldtrump', 509),
 ('frontex', 471),
 ('hrw', 460),
 ('nytimes', 454),
 ('unhcr_de', 454),
 ('iomgreece', 421)]

In [71]:
# Keep only the mentions that belong to an account present in `mapping`.
# The mention strings appear lowercased while the usernames behind `mapping`
# keep their original casing, so both sides are lowercased before comparing;
# a case-sensitive test would drop every account with a capital letter in
# its handle.
known_handles = {handle.lower() for handle in mapping}
clean_mentions = defaultdict(
    int,
    {mention: count
     for mention, count in mention_count.items()
     if f"@{mention.lower()}" in known_handles},
)

In [72]:
# Rank the remaining mentions by frequency (highest first) and show the top 25
clean_mentions_sorted = sorted(clean_mentions.items(), key=lambda pair: -pair[1])
clean_mentions_sorted[:25]

[('ukhomeoffice', 2582),
 ('nmitarakis', 841),
 ('vonderleyen', 574),
 ('pritipatel', 553),
 ('hrw', 460),
 ('nytimes', 454),
 ('alarm_phone', 353),
 ('dwnews', 215),
 ('migrationgovgr', 211),
 ('eucopresident', 196),
 ('camanpour', 190),
 ('guardian', 189),
 ('g_christides', 189),
 ('greekcitytimes', 166),
 ('thedukeoriginal', 166),
 ('amnesty', 158),
 ('ekathimerini', 118),
 ('sebastiankurz', 114),
 ('linnyprivate', 110),
 ('f_grillmeier', 108),
 ('llosa_gonzalo', 108),
 ('chooselove', 106),
 ('teammareliberum', 98),
 ('rspaegean', 97),
 ('serpetina1', 93)]