In [1]:
import ast
from collections import Counter

import country_converter as coco
import pandas as pd
import reverse_geocoder

In [32]:
# load the complete tab-separated dataset produced in HW2
df = pd.read_csv('final.tsv', delimiter='\t')

In [33]:
# list the column names of the loaded dataset
df.columns

Index(['Story Primary ID', 'Story ID', 'User Primary ID', 'User ID', 'Gender',
       'Age', 'Title', 'Narrative', 'Media', 'Account Created Date',
       'Interest', 'ifHate', 'ifSarcasm', 'Film', 'Sport', 'Postive Words 1',
       'Postive Words 2', 'Postive Words 3', 'Europe Holiday',
       'Americas Holiday', 'Asia Holiday', 'Africa Holiday', 'Oceania Holiday',
       'Public Holiday Or Not', 'Covid Cases', 'Covid Deaths', 'Covid Tests',
       'Covid Vaccinations', 'Image Caption', 'Detected Objects',
       'tika_language', 'google_language', 'rtg_translate', 'geo_name', 'long',
       'lat', 'Toxicity', 'Severe_Toxicity', 'Obscenity', 'Identity_Attack',
       'Insult', 'Threat', 'Sexual_Explicit'],
      dtype='object')

In [34]:
# cast every numerical score column to float and treat missing scores as 0.0
for score_col in ('Toxicity', 'Severe_Toxicity', 'Obscenity', 'Identity_Attack',
                  'Insult', 'Threat', 'Sexual_Explicit'):
    df[score_col] = df[score_col].astype(float).fillna(0.0)

## Get Toxic Posts

In [10]:
# a post is considered toxic when at least one of the scores below is >= 0.5
score_columns = ['Toxicity', 'Severe_Toxicity', 'Obscenity', 'Identity_Attack',
                 'Insult', 'Threat', 'Sexual_Explicit']
toxic = df[df[score_columns].ge(0.5).any(axis=1)]
toxic

Unnamed: 0,Story Primary ID,Story ID,User Primary ID,User ID,Gender,Age,Title,Narrative,Media,Account Created Date,...,geo_name,long,lat,Toxicity,Severe_Toxicity,Obscenity,Identity_Attack,Insult,Threat,Sexual_Explicit
162,156419,STY1666076574,74779,USR1660640666,,28,ડોનેટ લાઈફની નવી ઓફીસ તથા અંગદાતાઓના સન્માન મા...,"<p dir=""ltr"">&#2721;&#2763;&#2728;&#2759;&#271...",https://image.pixstory.com/Pixstory-image-1666...,2022/8/16 14:34,...,,,,0.580837,0.001450,0.084721,0.086203,0.038598,0.001605,0.008158
224,156357,STY1666034269,11859,USR1638425715,male,21,Baby elephant found dead near Kaziranga Nation...,"<p dir=""ltr"">Baby elephant found dead near Kaz...",https://image.pixstory.com/Pixstory-image-1666...,2021/12/2 11:45,...,,,,0.795028,0.004406,0.660693,0.004020,0.080264,0.000550,0.517272
297,156284,STY1666022192,11859,USR1638425715,male,21,BSF shoot suspected drone near Amritsar: ভারত-...,"<p dir=""ltr"">BSF shoot suspected drone near Am...",https://image.pixstory.com/Pixstory-image-1666...,2021/12/2 11:45,...,['Amritsar'],['74.87534'],['31.62234'],0.603689,0.000831,0.059121,0.004280,0.010228,0.219776,0.001175
318,156263,STY1666019632,105365,USR1664352833,,19,Dose of daily affirmations,"<p dir=""ltr"">&#128073;&#127995;I will have a g...",https://image.pixstory.com/Pixstory-image-1666...,2022/9/28 13:43,...,,,,0.606454,0.000008,0.001864,0.000203,0.514379,0.000056,0.000077
377,156204,STY1666003356,119640,USR1666003189,,27,In this world full of hypocrites the one who r...,"<p dir=""ltr""><i>In this world full of hypocrit...",https://image.pixstory.com/Pixstory-image-1666...,2022/10/17 16:09,...,,,,0.987004,0.001147,0.011152,0.162894,0.957025,0.015613,0.001378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94207,70138,STY1642252916,21803,USR1642252538,,21,Maya Angelou,"A legendary author, poet, activist and all aro...",https://image.pixstory.com/Pixstory-image-1642...,2022/1/15 18:45,...,,,,0.990595,0.151817,0.967396,0.043235,0.860015,0.046330,0.687782
94419,69903,STY1642232050,21645,USR1642231927,,20,Celebration 2021,"Although the year was shit, but we had a good ...",https://image.pixstory.com/Pixstory-image-1642...,2022/1/15 13:02,...,,,,0.911533,0.004486,0.892368,0.002455,0.120261,0.000744,0.002644
94718,69589,STY1642173804,10534,USR1636448509,male,45,"Covid, vaccini in crescita in Italia",Numeri in crescita per la campagna vaccinale n...,https://image.pixstory.com/Pixstory-image-1642...,2021/11/9 14:31,...,,,,0.981806,0.011953,0.928912,0.008366,0.829574,0.000293,0.005954
94733,69573,STY1642172922,21403,USR1642172803,,23,Pizzaaa,Pizaaaaaaa,https://image.pixstory.com/Pixstory-image-1642...,2022/1/14 20:36,...,,,,0.981393,0.085253,0.986509,0.008170,0.336215,0.001402,0.052297


In [11]:
# give the account-creation timestamp column the shorter name 'Date'
toxic = toxic.rename({'Account Created Date': 'Date'}, axis='columns')
toxic

Unnamed: 0,Story Primary ID,Story ID,User Primary ID,User ID,Gender,Age,Title,Narrative,Media,Date,...,geo_name,long,lat,Toxicity,Severe_Toxicity,Obscenity,Identity_Attack,Insult,Threat,Sexual_Explicit
162,156419,STY1666076574,74779,USR1660640666,,28,ડોનેટ લાઈફની નવી ઓફીસ તથા અંગદાતાઓના સન્માન મા...,"<p dir=""ltr"">&#2721;&#2763;&#2728;&#2759;&#271...",https://image.pixstory.com/Pixstory-image-1666...,2022/8/16 14:34,...,,,,0.580837,0.001450,0.084721,0.086203,0.038598,0.001605,0.008158
224,156357,STY1666034269,11859,USR1638425715,male,21,Baby elephant found dead near Kaziranga Nation...,"<p dir=""ltr"">Baby elephant found dead near Kaz...",https://image.pixstory.com/Pixstory-image-1666...,2021/12/2 11:45,...,,,,0.795028,0.004406,0.660693,0.004020,0.080264,0.000550,0.517272
297,156284,STY1666022192,11859,USR1638425715,male,21,BSF shoot suspected drone near Amritsar: ভারত-...,"<p dir=""ltr"">BSF shoot suspected drone near Am...",https://image.pixstory.com/Pixstory-image-1666...,2021/12/2 11:45,...,['Amritsar'],['74.87534'],['31.62234'],0.603689,0.000831,0.059121,0.004280,0.010228,0.219776,0.001175
318,156263,STY1666019632,105365,USR1664352833,,19,Dose of daily affirmations,"<p dir=""ltr"">&#128073;&#127995;I will have a g...",https://image.pixstory.com/Pixstory-image-1666...,2022/9/28 13:43,...,,,,0.606454,0.000008,0.001864,0.000203,0.514379,0.000056,0.000077
377,156204,STY1666003356,119640,USR1666003189,,27,In this world full of hypocrites the one who r...,"<p dir=""ltr""><i>In this world full of hypocrit...",https://image.pixstory.com/Pixstory-image-1666...,2022/10/17 16:09,...,,,,0.987004,0.001147,0.011152,0.162894,0.957025,0.015613,0.001378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94207,70138,STY1642252916,21803,USR1642252538,,21,Maya Angelou,"A legendary author, poet, activist and all aro...",https://image.pixstory.com/Pixstory-image-1642...,2022/1/15 18:45,...,,,,0.990595,0.151817,0.967396,0.043235,0.860015,0.046330,0.687782
94419,69903,STY1642232050,21645,USR1642231927,,20,Celebration 2021,"Although the year was shit, but we had a good ...",https://image.pixstory.com/Pixstory-image-1642...,2022/1/15 13:02,...,,,,0.911533,0.004486,0.892368,0.002455,0.120261,0.000744,0.002644
94718,69589,STY1642173804,10534,USR1636448509,male,45,"Covid, vaccini in crescita in Italia",Numeri in crescita per la campagna vaccinale n...,https://image.pixstory.com/Pixstory-image-1642...,2021/11/9 14:31,...,,,,0.981806,0.011953,0.928912,0.008366,0.829574,0.000293,0.005954
94733,69573,STY1642172922,21403,USR1642172803,,23,Pizzaaa,Pizaaaaaaa,https://image.pixstory.com/Pixstory-image-1642...,2022/1/14 20:36,...,,,,0.981393,0.085253,0.986509,0.008170,0.336215,0.001402,0.052297


### Get data for timeseries

In [None]:
# export the user id and date of every toxic post for the timeseries visualization
toxic.to_csv('toxic_posts.csv', columns=['User ID', 'Date'], index=False)

## Countries mentioned in Toxic Posts

In [12]:
# further narrow the toxic posts to the ones with a non-null geo_name (i.e. that
# mention at least one place) and keep only their coordinate columns.
# The mask is built from `toxic` itself, and rows and columns are selected in a
# single .loc call, so the result no longer relies on pandas aligning a mask
# computed on the full `df` with the already-filtered frame.
locs = toxic.loc[toxic['geo_name'].notna(), ['lat', 'long']]
locs

Unnamed: 0,lat,long
297,['31.62234'],['74.87534']
2227,['39.76'],['-98.5']
10266,['38.25031'],['-92.50046']
11613,['-6.0'],['72.0']
16237,"['28.65195', '-6.0']","['77.23149', '72.0']"
16351,['31.5'],['34.75']
16492,['-6.0'],['72.0']
37460,['-2.5'],['23.5']
37461,['-2.5'],['23.5']
39342,['37.17344'],['78.41657']


In [13]:
# count how often each (lat, long) pair is mentioned across the toxic posts.
# Each 'lat' / 'long' cell holds the text of a Python list of coordinates,
# e.g. "['28.65195', '-6.0']", so it is parsed with ast.literal_eval rather
# than by peeling brackets and quotes off by hand.
d = Counter()
for lat_text, long_text in zip(locs['lat'], locs['long']):
    lats = ast.literal_eval(lat_text)
    longs = ast.literal_eval(long_text)
    if len(lats) != len(longs):
        # a row whose two lists disagree cannot be paired up reliably
        raise ValueError('lat/long lists differ in length: {!r} vs {!r}'.format(lat_text, long_text))
    d.update(zip(lats, longs))

# reverse-geocode all distinct pairs in one batched call (one result per input
# coordinate, in input order) instead of issuing one search per pair; an empty
# selection is skipped because there is nothing to look up
pairs = list(d)
matches = reverse_geocoder.search([(float(la), float(lo)) for la, lo in pairs]) if pairs else []

# convert each distinct two-character code to its three-character code once
iso3_by_cc = {}
for match in matches:
    cc = match['cc']
    if cc not in iso3_by_cc:
        iso3_by_cc[cc] = coco.convert(names=[cc], to='ISO3')

# one row per coordinate pair: three-character country code and number of mentions
locs_count = pd.DataFrame({
    'country': [iso3_by_cc[match['cc']] for match in matches],
    'count': [d[pair] for pair in pairs],
}, columns=['country', 'count'])
    


In [23]:
# several coordinate pairs can map to the same country, so merge their counts per country
locs_count = locs_count.groupby(by='country').agg('sum')

In [25]:
# get total number of occurrences: the sum of the 'count' column of locs_count
locs_count['count'].sum()

58

In [28]:
# write the counts to toxic.json, one JSON record per row, with the index
# restored as a regular column
locs_count.reset_index().to_json(
    path_or_buf='toxic.json',
    orient='records',
)

## Get number of occurrences of each country name in all posts

In [21]:
# coordinate columns of every post that has a non-null geo_name
full = df.loc[df['geo_name'].notna(), ['lat', 'long']]

In [None]:
# count how often each (lat, long) pair is mentioned across all posts.
# Each 'lat' / 'long' cell holds the text of a Python list of coordinates,
# e.g. "['28.65195', '-6.0']", so it is parsed with ast.literal_eval rather
# than by peeling brackets and quotes off by hand.
d = Counter()
for lat_text, long_text in zip(full['lat'], full['long']):
    lats = ast.literal_eval(lat_text)
    longs = ast.literal_eval(long_text)
    if len(lats) != len(longs):
        # a row whose two lists disagree cannot be paired up reliably
        raise ValueError('lat/long lists differ in length: {!r} vs {!r}'.format(lat_text, long_text))
    d.update(zip(lats, longs))

# reverse-geocode all distinct pairs in one batched call (one result per input
# coordinate, in input order) instead of issuing one search per pair; an empty
# selection is skipped because there is nothing to look up
pairs = list(d)
matches = reverse_geocoder.search([(float(la), float(lo)) for la, lo in pairs]) if pairs else []

# convert each distinct two-character code to its three-character code once
iso3_by_cc = {}
for match in matches:
    cc = match['cc']
    if cc not in iso3_by_cc:
        iso3_by_cc[cc] = coco.convert(names=[cc], to='ISO3')

# one row per coordinate pair: three-character country code and number of mentions
full_count = pd.DataFrame({
    'country': [iso3_by_cc[match['cc']] for match in matches],
    'count': [d[pair] for pair in pairs],
}, columns=['country', 'count'])

In [None]:
# several coordinate pairs can map to the same country, so merge their counts per country
full_count = full_count.groupby(by='country').agg('sum')

In [13]:
# total number of occurrences: the sum of the 'count' column of full_count
full_count['count'].sum()

43101

In [19]:
# write the counts to allcodes.json, one JSON record per row, with the index
# restored as a regular column
full_count.reset_index().to_json(
    path_or_buf='allcodes.json',
    orient='records',
)

## Countries mentioned in Sarcastic Posts

In [5]:
# get the coordinate columns of posts that are flagged as sarcastic and have a
# non-null geo_name.
# Both conditions are combined into one mask: chaining df[mask_a][mask_b]
# applies a full-length mask to an already-filtered frame, which makes pandas
# reindex the mask and emit a "Boolean Series key will be reindexed" warning.
sarcasm = df.loc[df['geo_name'].notna() & df['ifSarcasm'].eq(True), ['lat', 'long']]
sarcasm

  sarcasm = df[df['geo_name'].notnull()][df['ifSarcasm']==True][['lat','long']]


Unnamed: 0,lat,long
1076,['38.70167'],['-77.47766']
2069,"['-20.0', '-33.0', '-25.0']","['145.0', '146.0', '135.0']"
5984,['7.08333'],['124.26667']
6195,['47.23565'],['-122.5504']
6588,['39.76'],['-98.5']
...,...,...
93491,['36.76694'],['-98.86512']
94203,['46.0'],['2.0']
94407,"['28.65195', '27.3796', '-6.0']","['77.23149', '52.7377', '72.0']"
94435,['28.65195'],['77.23149']


In [6]:
# count how often each (lat, long) pair is mentioned across the sarcastic posts.
# Each 'lat' / 'long' cell holds the text of a Python list of coordinates,
# e.g. "['28.65195', '-6.0']", so it is parsed with ast.literal_eval rather
# than by peeling brackets and quotes off by hand.
d = Counter()
for lat_text, long_text in zip(sarcasm['lat'], sarcasm['long']):
    lats = ast.literal_eval(lat_text)
    longs = ast.literal_eval(long_text)
    if len(lats) != len(longs):
        # a row whose two lists disagree cannot be paired up reliably
        raise ValueError('lat/long lists differ in length: {!r} vs {!r}'.format(lat_text, long_text))
    d.update(zip(lats, longs))

# reverse-geocode all distinct pairs in one batched call (one result per input
# coordinate, in input order) instead of issuing one search per pair; an empty
# selection is skipped because there is nothing to look up
pairs = list(d)
matches = reverse_geocoder.search([(float(la), float(lo)) for la, lo in pairs]) if pairs else []

# convert each distinct two-character code to its three-character code once
iso3_by_cc = {}
for match in matches:
    cc = match['cc']
    if cc not in iso3_by_cc:
        iso3_by_cc[cc] = coco.convert(names=[cc], to='ISO3')

# one row per coordinate pair: three-character country code and number of mentions
sarcasm_count = pd.DataFrame({
    'country': [iso3_by_cc[match['cc']] for match in matches],
    'count': [d[pair] for pair in pairs],
}, columns=['country', 'count'])

Loading formatted geocoded file...


Unnamed: 0,country,count
0,USA,1
1,AUS,1
2,AUS,1
3,AUS,1
4,PHL,1
...,...,...
78,ESP,1
79,CHL,2
80,USA,1
81,FRA,1


In [None]:
# several coordinate pairs can map to the same country, so merge their counts per country
sarcasm_count = sarcasm_count.groupby(by='country').agg('sum')

In [9]:
# total number of occurrences: the sum of the 'count' column of sarcasm_count
sarcasm_count['count'].sum()

134

In [21]:
# write the counts to sarcasm.json, one JSON record per row, with the index
# restored as a regular column
sarcasm_count.reset_index().to_json(
    path_or_buf='sarcasm.json',
    orient='records',
)

## Countries mentioned in Hate Speech

In [35]:
# get the coordinate columns of posts that are flagged as hate speech and have
# a non-null geo_name.
# Both conditions are combined into one mask: chaining df[mask_a][mask_b]
# applies a full-length mask to an already-filtered frame, which makes pandas
# reindex the mask and emit a "Boolean Series key will be reindexed" warning.
hate = df.loc[df['geo_name'].notna() & df['ifHate'].eq(True), ['lat', 'long']]
hate

  hate = df[df['geo_name'].notnull()][df['ifHate']==True][['lat','long']]


Unnamed: 0,lat,long
215,"['-6.0', '18.3419', '-10.0', '-30.0', '10.0']","['72.0', '-64.9307', '-55.0', '-71.0', '8.0']"
385,"['47.50012', '22.3384', '31.09502']","['-120.50147', '91.83168', '-85.83898']"
435,"['9.0', '15.0']","['39.5', '39.0']"
506,['53.48097'],['-2.24555']
767,"['60.0', '25.5', '-10.0', '46.0', '47.00016']","['100.0', '51.25', '-55.0', '2.0', '8.01427']"
...,...,...
92188,['28.65195'],['77.23149']
92352,['53.41667'],['-2.25']
93629,"['35.0', '26.0112']","['105.0', '-80.14949']"
94502,['38.58157'],['-121.4944']


In [36]:
# count how often each (lat, long) pair is mentioned across the hate-speech posts.
# Each 'lat' / 'long' cell holds the text of a Python list of coordinates,
# e.g. "['28.65195', '-6.0']", so it is parsed with ast.literal_eval rather
# than by peeling brackets and quotes off by hand.
d = Counter()
for lat_text, long_text in zip(hate['lat'], hate['long']):
    lats = ast.literal_eval(lat_text)
    longs = ast.literal_eval(long_text)
    if len(lats) != len(longs):
        # a row whose two lists disagree cannot be paired up reliably
        raise ValueError('lat/long lists differ in length: {!r} vs {!r}'.format(lat_text, long_text))
    d.update(zip(lats, longs))

# reverse-geocode all distinct pairs in one batched call (one result per input
# coordinate, in input order) instead of issuing one search per pair; an empty
# selection is skipped because there is nothing to look up
pairs = list(d)
matches = reverse_geocoder.search([(float(la), float(lo)) for la, lo in pairs]) if pairs else []

# convert each distinct two-character code to its three-character code once
iso3_by_cc = {}
for match in matches:
    cc = match['cc']
    if cc not in iso3_by_cc:
        iso3_by_cc[cc] = coco.convert(names=[cc], to='ISO3')

# one row per coordinate pair: three-character country code and number of mentions
hate_count = pd.DataFrame({
    'country': [iso3_by_cc[match['cc']] for match in matches],
    'count': [d[pair] for pair in pairs],
}, columns=['country', 'count'])

Loading formatted geocoded file...


In [37]:
# several coordinate pairs can map to the same country, so merge their counts per country
hate_count = hate_count.groupby(by='country').agg('sum')

In [38]:
# total number of occurrences: the sum of the 'count' column of hate_count
hate_count['count'].sum()

1840

In [39]:
# write the counts to hate.json, one JSON record per row, with the index
# restored as a regular column
hate_count.reset_index().to_json(
    path_or_buf='hate.json',
    orient='records',
)