In [39]:
# Import modules
import re

import pandas as pd

In [40]:
# Load the tweet dataset from disk and preview its first five rows
tweetsCsvPath = 'data/covid19_tweets.csv'
df = pd.read_csv(tweetsCsvPath)
df.head(5)

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


In [41]:
# Take the tweet text column as a Series
comments = df['text']
comments

0         If I smelled the scent of hand sanitizers toda...
1         Hey @Yankees @YankeesPR and @MLB - wouldn't it...
2         @diane3443 @wdunlap @realDonaldTrump Trump nev...
3         @brookbanktv The one gift #COVID19 has give me...
4         25 July : Media Bulletin on Novel #CoronaVirus...
                                ...                        
179103    Thanks @IamOhmai for nominating me for the @WH...
179104    2020! The year of insanity! Lol! #COVID19 http...
179105    @CTVNews A powerful painting by Juan Lucena. I...
179106    More than 1,200 students test positive for #CO...
179107    I stop when I see a Stop\n\n@SABCNews\n@Izinda...
Name: text, Length: 179108, dtype: object

In [42]:
# Count missing tweets; this only reports the number, nothing is removed here
comments.isna().sum()

0

In [43]:
# Empty two-column frame: the tweet text plus a toxicity label that is filled in later
formattedComments = pd.DataFrame(columns=['comment', 'toxic'])

# Keep only tweets whose text does not contain 'https'
# (row 0 of the raw data shows a truncated tweet ending in such a link)
hasLink = comments.str.contains('https')
formattedComments['comment'] = comments[~hasLink]
# Renumber the surviving rows from 0, discarding their old positions
formattedComments = formattedComments.reset_index(drop=True)

# Show the first raw tweet next to the first tweet that survived the filter
firstRaw = comments.loc[0]
firstKept = formattedComments.loc[0, 'comment']
with pd.option_context('display.max_colwidth', None):
    print("Before: " + firstRaw)
    print("After: " + firstKept)

Before: If I smelled the scent of hand sanitizers today on someone in the past, I would think they were so intoxicated that… https://t.co/QZvYbrOgb0
After: Praying for good health and recovery of @ChouhanShivraj .
#covid19
#covidPositive


In [44]:
# Open bad word list: one entry per line, with surrounding whitespace stripped.
# Blank lines are skipped: an empty entry joined into a regex alternation
# becomes an empty alternative, which matches every comment.
with open('data/bad-words.txt', 'r') as fileopen:
    badWordList = [word for word in (line.strip() for line in fileopen) if word]
badWordList

['abbo',
 'abo',
 'abortion',
 'abuse',
 'addict',
 'addicts',
 'adult',
 'africa',
 'african',
 'alla',
 'allah',
 'alligatorbait',
 'amateur',
 'american',
 'anal',
 'analannie',
 'analsex',
 'angie',
 'angry',
 'anus',
 'arab',
 'arabs',
 'areola',
 'argie',
 'aroused',
 'arse',
 'arsehole',
 'asian',
 'ass',
 'assassin',
 'assassinate',
 'assassination',
 'assault',
 'assbagger',
 'assblaster',
 'assclown',
 'asscowboy',
 'asses',
 'assfuck',
 'assfucker',
 'asshat',
 'asshole',
 'assholes',
 'asshore',
 'assjockey',
 'asskiss',
 'asskisser',
 'assklown',
 'asslick',
 'asslicker',
 'asslover',
 'assman',
 'assmonkey',
 'assmunch',
 'assmuncher',
 'asspacker',
 'asspirate',
 'asspuppies',
 'assranger',
 'asswhore',
 'asswipe',
 'athletesfoot',
 'attack',
 'australian',
 'babe',
 'babies',
 'backdoor',
 'backdoorman',
 'backseat',
 'badfuck',
 'balllicker',
 'balls',
 'ballsack',
 'banging',
 'baptist',
 'barelylegal',
 'barf',
 'barface',
 'barfface',
 'bast',
 'bastard',
 'bazongas

In [45]:
# Categorise data depending on if it contains any of the bad words.
# Every word is escaped so that regex metacharacters in the list are matched
# literally, and empty entries are dropped because an empty alternative would
# match every comment and label the whole dataset toxic.
escapedWords = [re.escape(word) for word in badWordList if word]
# \b ... \b restricts matches to whole words; (?: ) keeps the group non-capturing
pattern = fr"\b(?:{'|'.join(escapedWords)})\b"

# Case-insensitive search; the result is one True/False label per comment
formattedComments['toxic'] = formattedComments['comment'].str.contains(pattern, case=False)
formattedComments

Unnamed: 0,comment,toxic
0,Praying for good health and recovery of @Chouh...,False
1,July 25 #COVID19 update\n#TamilNadu - 6988\nDi...,False
2,Why has Ruto not eulogisied Mkapa!! \nAsking f...,False
3,Crazy that the world has come to this but as A...,True
4,@jimsciutto @JoAnnBaldwin55 People whose relat...,True
...,...,...
12378,A new UNAIDS report warns of the dangers of fa...,False
12379,210 new cases and 4 new deaths in Namibia \n\n...,False
12380,I wonder how many of those anti mask protester...,False
12381,COVID UPDATE: Health officials report 57 new #...,False


In [46]:
# Non-null cell count per column among the rows labelled toxic
formattedComments.loc[formattedComments['toxic'].eq(True)].count()

comment    2506
toxic      2506
dtype: int64

In [47]:
# Non-null cell count per column among the rows labelled non-toxic
formattedComments.loc[formattedComments['toxic'].eq(False)].count()

comment    9877
toxic      9877
dtype: int64

In [57]:
# Pull the first 2000 rows of each category for training data
TRAIN_ROWS_PER_CLASS = 2000

# Split by label; reset_index() renumbers each class from 0 and keeps the
# previous row label in an 'index' column
clean = formattedComments[formattedComments['toxic']==False].reset_index()
toxic = formattedComments[formattedComments['toxic']==True].reset_index()

# Toxic rows first, then clean rows, renumbered 0..N-1 with only the two data columns
trainingData = pd.concat(
    [
        toxic.head(TRAIN_ROWS_PER_CLASS)[['comment', 'toxic']],
        clean.head(TRAIN_ROWS_PER_CLASS)[['comment', 'toxic']],
    ],
    ignore_index=True,
)
trainingData

Unnamed: 0,comment,toxic
0,Crazy that the world has come to this but as A...,True
1,@jimsciutto @JoAnnBaldwin55 People whose relat...,True
2,@RepMattGaetz @realDonaldTrump @GaetzTakes How...,True
3,@TheDailyEdge @seanhannity @FoxNews So he can ...,True
4,"#COVID19: Delhi reported 1,142 new cases and 2...",True
...,...,...
3995,The total number of #COVID19 samples tested up...,False
3996,Wearing a mask screws up with the #socialcredi...,False
3997,422 new cases and 28 new deaths in Kyrgyzstan ...,False
3998,where are these people gonna evacuate TO???!!!...,False


In [58]:
# Pull 5 rows of each category for testing data, starting at row 2500 of each
# class so they do not overlap the rows taken for training
TEST_START_ROW = 2500
TEST_ROWS_PER_CLASS = 5
testRows = slice(TEST_START_ROW, TEST_START_ROW + TEST_ROWS_PER_CLASS)

# Toxic rows first, then clean rows, renumbered 0..N-1 with only the two data columns
testingData = pd.concat(
    [
        toxic[['comment', 'toxic']].iloc[testRows],
        clean[['comment', 'toxic']].iloc[testRows],
    ],
    ignore_index=True,
)
testingData

Unnamed: 0,comment,toxic
0,Seychelles has no active cases of the novel co...,True
1,"1,465 new cases and 12 new deaths in Israel \n...",True
2,@itsJeffTiedrich @realDonaldTrump Someone shou...,True
3,"@realDonaldTrump You blew the numbers ... 181,...",True
4,63 new cases and 1 new death in Mozambique \n\...,True
5,@StevenStackMD says for some people #COVID19 w...,False
6,@theadvocatebr @NOLAnews $10 it won’t run to c...,False
7,"Fed Chair Jerome H. Powell says, the Economy’s...",False
8,Joe Scarborough Questions Trump’s Hydroxychlor...,False
9,.@SEC going to a 10 game conference-only sched...,False


In [59]:
# Serialise each dataset as a JSON array of records, one file per dataset
jsonTargets = (
    ('data/trainingData.json', trainingData),
    ('data/testingData.json', testingData),
)
for jsonPath, frame in jsonTargets:
    with open(jsonPath, 'w') as f:
        f.write(frame.to_json(orient = "records"))

print('Successfully stored json files.')

Successfully stored json files.
