#Getting started

In [None]:
# Getting started
# Importing required libraries for the project
import numpy as np # for scientific computing
import pandas as pd # for data analysis
import matplotlib # for visualization
import seaborn as sns # for visualization
import gdown
import os
import math

# Set pandas display: show at most 50 columns and 20 rows when a frame is rendered
pd.options.display.max_columns = 50
pd.options.display.max_rows = 20

from tqdm.auto import tqdm
# Register tqdm with pandas so that `progress_apply` (used in later cells) is available
tqdm.pandas()

In [None]:
# Import and link your google drive
from google.colab import drive
# Mount Google Drive under /content/drive; force_remount=True is passed so the
# mount is presumably refreshed even when the cell is re-run in the same session.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# All the hydrated tweets.
# To run this from your own machine, point the path below at your copy of the file.
tweets_csv_path = "/content/drive/MyDrive/UCL/Y2/QM2/QM2/15m_hyd_tweets.csv"
tw = pd.read_csv(tweets_csv_path)
print(tw.shape)

(12205281, 35)


In [None]:
# Preview the last five rows of the raw tweet table
tw.tail(n=5)

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,place,possibly_sensitive,quote_id,retweet_count,retweet_id,retweet_screen_name,source,text,tweet_url,user_created_at,user_id,user_default_profile_image,user_description,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name,user_statuses_count,user_time_zone,user_urls,user_verified
12205276,,Sat Mar 14 14:33:02 +0000 2020,,,https://twitter.com/QHaRi/status/1238247431801...,2,1238835534916698118,,,,en,,False,1.238247e+18,0,,,"<a href=""http://twitter.com/download/iphone"" r...",read nibbas https://t.co/ZopkCsoiMi,https://twitter.com/___Ivxn/status/12388355349...,Mon Apr 18 23:26:06 +0000 2016,722204589361995776,False,,1057,509,302,0,,Iván,___Ivxn,1998,,,False
12205277,,Wed Jan 27 15:29:26 +0000 2021,Melbourne Transit PTV,,https://www.heraldsun.com.au/coronavirus/15min...,0,1354451459081965577,,,,en,,False,,0,,,"<a href=""https://ifttt.com"" rel=""nofollow"">IFT...",Coronavirus: 15-minute immunity test could be ...,https://twitter.com/MyTransit_MEL/status/13544...,Tue Oct 29 06:08:36 +0000 2019,1189061368646950912,False,Latest Melbourne #Transportation news covering...,78,80,175,2,"Melbourne, Australia",MyTransit Melbourne,MyTransit_MEL,7537,,,False
12205278,,Fri Mar 27 03:11:48 +0000 2020,,https://twitter.com/gail_biggins/status/124337...,,0,1243375138629767168,FBIMiamiFL,,347505959.0,en,,False,,0,,,"<a href=""http://twitter.com/download/android"" ...",@FBIMiamiFL @DeptofDefense Well Dallas .... Te...,https://twitter.com/gail_biggins/status/124337...,Sun Apr 07 02:23:02 +0000 2019,1114715173275615233,False,X,1643,47,193,0,,BIGGSX3,gail_biggins,17378,,,False
12205279,,Fri Feb 05 18:32:11 +0000 2021,TransitIsEssential,,,1,1357758940390555649,,,,en,,,,1,,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",".@RepPeterDeFazio, @RepSamGraves, @SenSherrodB...",https://twitter.com/ldellapiana/status/1357758...,Fri Sep 05 21:39:40 +0000 2008,16149944,False,"Mvmt strategist, educator & writer. @ForAJustS...",6204,2283,3217,97,New York City,Libero Della Piana,ldellapiana,12090,,https://www.liberodellapiana.com/,False
12205280,,Tue Mar 03 14:19:06 +0000 2020,SmartNews,,https://www.businessinsider.com/trump-administ...,0,1234845760182726656,,,,en,,False,,0,,,"<a href=""http://twitter.com/download/iphone"" r...",The Trump administration says Medicare and Med...,https://twitter.com/Cathode2/status/1234845760...,Wed Aug 03 17:31:53 +0000 2011,347974196,True,,23,1,13,0,,Carl Jones,Cathode2,4835,,,False


#Getting the data ready for analysis



> Getting to the essential



In [None]:
# Only keep the relevant columns ["id", "user_location", "created_at"] and only keep the rows that have a location.
# Selection and drop are chained so that `dropna` returns a fresh frame instead of
# mutating a column slice of the original in place (the in-place form is the
# chained-assignment pattern that triggers pandas' SettingWithCopyWarning).
tw = tw[["id", "user_location", "created_at"]].dropna(subset=["user_location"])
print(tw.shape)

(8947664, 3)



> Formalizing dates

In [None]:
# Formalize dates to 'Month Year' (e.g. "Mar 2020")
import datetime
# Parse the whole column in one vectorised call instead of running
# `datetime.strptime` row by row, which is very slow on millions of rows.
# utc=True normalises every timestamp to UTC; the sample rows shown above all
# carry a +0000 offset, so this should not shift any date - TODO confirm that
# no row uses a different offset.
created_at_parsed = pd.to_datetime(
    tw["created_at"], format="%a %b %d %H:%M:%S %z %Y", utc=True
)
tw["created_at"] = created_at_parsed.dt.strftime("%b %Y")

#BERT label

In [None]:
# Load the released BERT labels (one row per tweet ID)
bert_csv_path = "/content/drive/MyDrive/UCL/Y2/QM2/QM2/asonam_release_all_tweets.csv"
bert = pd.read_csv(bert_csv_path)
print(bert.shape)

(206348565, 2)


In [None]:
# Align the key column name with the tweet table so the two frames can be merged on "id"
bert = bert.rename({"Tweet ID": "id"}, axis="columns")

In [None]:
# Left join: keep every located tweet and attach its BERT label where one exists
twbert = tw.merge(bert, how='left', on='id')

The BERT label can be equal to 0, which corresponds to neutral speech; equal to 1, which corresponds to counter speech (or "anti-hate speech"); or equal to 2, which corresponds to hate speech. This three-way notation exists because we took our data from a paper that is interested in the effect of counter speech; we, however, are not. Let's get rid of this notation.

In [None]:
def change_bert(label):
    """Collapse the three-class BERT label into a binary hate / not-hate label.

    0 (neutral) is returned unchanged, 1 (counter speech) becomes 0 and
    2 (hate speech) becomes 1. Any other value, including NaN, yields None.
    """
    if label == 0:
        return label
    if label == 1:
        return 0
    return 1 if label == 2 else None


From now on, and thanks to the change_bert function, 0 will correspond to neutral speech and 1 will correspond to hate speech.

In [None]:
# Recode every label with change_bert, showing a progress bar
binary_labels = twbert["BERT_label"].progress_apply(change_bert)
twbert["BERT_label"] = binary_labels

  0%|          | 0/8947664 [00:00<?, ?it/s]

In [None]:
# Shape of the subset labelled as hate speech (BERT_label == 1)
is_hateful = twbert["BERT_label"] == 1
twbert.loc[is_hateful].shape

(39029, 4)

We have approximately 39,000 tweets labelled as hateful in our dataset.

#Locations



> Formalizing locations



There are three types of location that we can find in our dataset:
 
1.   Correct and fairly formalized location e.g. "Washington, DC"
2.   Partially correct and/or unformalized location e.g. "Nashville, Tenessee"
3.   Locations that cannot be exploited e.g. "In my dreams"

The second type of location could be retrieved via geocoding; however, geocoding is extremely slow (less than 4 iterations/second), so it is not practical for our amount of data.

In [None]:
# Make sure every location is a plain Python string before doing string matching on it
twbert["user_location"] = twbert["user_location"].apply(str)

We could use the formalized locations only, but this drastically reduces the size of the dataset and could lead to a lack of data, in particular too few hate tweets to conduct an analysis.

Here we are assessing the 200 most common locations. Twitter users often enter their location themselves, which makes geospatial analysis really hard, since the locations are given in various forms and are not formalized at all.

In [None]:
# The 200 most frequent raw location strings
location_counts = twbert["user_location"].value_counts()
location_counts.head(200)

United States               175863
London, England              94235
London                       88018
India                        86455
USA                          76117
                             ...  
San Jose, CA                  4649
Delhi                         4644
Perth, Western Australia      4633
North East, England           4628
Tucson, AZ                    4618
Name: user_location, Length: 200, dtype: int64

While investigating the 200 most popular locations, we notice that most of them would be left aside. For instance, "California, USA" would be left out although it is fairly formalized and could easily be kept.

Let's try to keep the unrecognized American states and cities that appear among the 200 most popular locations.

In [None]:
# Popular free-text locations that name a US state or a major US city without
# its two-letter code, mapped to the corresponding state code.
_LOCATION_TO_STATE = {
  "California, USA": 'CA',
  "Florida, USA": 'FL',
  "Texas, USA": 'TX',
  "New York, USA": 'NY',
  "New York": 'NY',
  "New Jersey, USA": 'NJ',
  "Los Angeles": 'CA',
  "Texas": 'TX',
  "New York City": 'NY',
  "NYC": 'NY',
  "California": 'CA',
  "Michigan, USA": 'MI',
  "Chicago": 'IL',
  "Ohio, USA": 'OH',
  "North Carolina, USA": 'NC',
  "Virginia, USA": 'VA',
  "Florida": 'FL',
  "Arizona, USA": 'AZ',
  "Massachusetts, USA": 'MA',
  "Colorado, USA": 'CO',
  "Georgia, USA": 'GA',
  "Maryland, USA": 'MD',
  "Illinois, USA": 'IL',
  "Washington, USA": 'WA',
  "New Jersey": 'NJ',
  "Wisconsin, USA": 'WI',
  "Connecticut, USA": 'CT',
  "Michigan": 'MI',
  "Oregon, USA": 'OR',
  "Indiana, USA": 'IN',
  "Tennessee, USA": 'TN',
  "Southern California": 'CA',
  "South Carolina, USA": 'SC',
  "Ohio": 'OH',
  "Colorado": 'CO',
  "Missouri, USA": 'MO',
  "Kentucky, USA": 'KY',
  "North Carolina": 'NC',
  "Arizona": 'AZ',
}


def get_state(loc):
  """Map a well-known free-text US location to its two-letter state code.

  Args:
    loc: a user location string, e.g. "California, USA" or "NYC".

  Returns:
    The two-letter state code when `loc` exactly matches one of the known
    spellings (the match is case-sensitive); otherwise `loc` unchanged.
  """
  # Non-string values can never equal one of the keys, so hand them back
  # untouched (this also avoids hashing unhashable objects).
  if not isinstance(loc, str):
    return loc
  return _LOCATION_TO_STATE.get(loc, loc)

In [None]:
# Rewrite the well-known state / city spellings to their two-letter state code
recoded_locations = twbert["user_location"].progress_apply(get_state)
twbert["user_location"] = recoded_locations

  0%|          | 0/8947664 [00:00<?, ?it/s]

In [None]:
# Two-letter codes accepted as a valid location suffix (51 entries, 'DC' included)
states = (
    "AK AL AR AZ CA CO CT DC DE FL GA HI "
    "IA ID IL IN KS KY LA MA MD ME MI MN "
    "MO MS MT NC ND NE NH NJ NM NV NY OH "
    "OK OR PA RI SC SD TN TX UT VA VT WA "
    "WI WV WY"
).split()

In [None]:
def state(val, valid_states=None):
  """Keep a location only if it identifies a US state by its two-letter code.

  A location is kept when it is exactly one of the codes (e.g. "CA", which is
  what `get_state` produces for spellings such as "California, USA") or when
  it ends with a space followed by a code (e.g. "Los Angeles, CA").

  Args:
    val: the location string to check.
    valid_states: optional iterable of accepted two-letter codes; defaults to
      the notebook-level `states` list.

  Returns:
    `val` unchanged when it is accepted, otherwise the string "None", which
    the following cell uses as the marker for rows to drop.
  """
  if valid_states is None:
    valid_states = states
  valid_states = list(valid_states)

  # Bare codes must be accepted explicitly: "CA" does not end with " CA", so a
  # suffix-only check would throw away every location recoded by get_state.
  if val in valid_states:
    return val

  # str.endswith accepts a tuple of suffixes, so one call covers every code.
  if val.endswith(tuple(" " + code for code in valid_states)):
    return val

  return "None"

In [None]:
# Flag every location that does not identify a US state with the string "None"
checked_locations = twbert["user_location"].progress_apply(state)
twbert["user_location"] = checked_locations

  0%|          | 0/8947664 [00:00<?, ?it/s]

In [None]:
# `state` marks unusable locations with the literal string "None"; drop those rows.
# The filter looks at `user_location` only: a frame-wide replace would scan (and
# could rewrite) the id, date and label columns as well, which is both slower and
# not what is intended. Missing locations are dropped too, as before.
user_locations = twbert["user_location"]
twbert = twbert[user_locations.notna() & (user_locations != "None")]

In [None]:
# Add the two-letter state code as its own column.
# This has to be computed on `twbert` (the cleaned frame): `tw` still holds the raw,
# unfiltered locations. Every location kept by the `state` filter is either a bare
# code or ends with " <code>", so the last two characters are the state code;
# splitting on "," would leave a leading space, or the whole string when there is
# no comma.
twbert = twbert.assign(state=twbert["user_location"].str[-2:])

In [None]:
# The 200 most frequent locations that survived the cleaning
kept_location_counts = twbert["user_location"].value_counts()
kept_location_counts.head(200)

Washington, DC       67712
Los Angeles, CA      67588
New York, NY         62422
Chicago, IL          42162
Atlanta, GA          30138
                     ...  
Overland Park, KS      839
Columbia, MO           831
Springfield, IL        829
Newport Beach, CA      825
Rochester, MN          823
Name: user_location, Length: 200, dtype: int64

In [None]:
# Size of the cleaned dataset, then of its hate-speech subset (BERT_label == 1)
hateful_tweets = twbert.loc[twbert["BERT_label"] == 1]
print(twbert.shape)
print(hateful_tweets.shape)

(1571264, 4)
(5472, 4)


In [None]:
# Write the cleaned tweets to a CSV file in the current working directory
cleaned_csv_name = '15m_cleaned_tweets.csv'
twbert.to_csv(cleaned_csv_name)

In [None]:
# Colab helper: hand the CSV written by the previous cell to the browser as a download
from google.colab import files
files.download('15m_cleaned_tweets.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>