# Preprocessing 'covid19_tweets' dataset

In [17]:
PATH = r''
file_name = 'covid19_tweets.csv'
import os
os.chdir(PATH)

In [None]:
%run setup.ipynb

In [18]:
dataset = pd.read_csv(f'{PATH}/{file_name}')
dataset.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


In [25]:
corpus = dataset['text']
corpus

0         If I smelled the scent of hand sanitizers toda...
1         Hey @Yankees @YankeesPR and @MLB - wouldn't it...
2         @diane3443 @wdunlap @realDonaldTrump Trump nev...
3         @brookbanktv The one gift #COVID19 has give me...
4         25 July : Media Bulletin on Novel #CoronaVirus...
                                ...                        
179103    Thanks @IamOhmai for nominating me for the @WH...
179104    2020! The year of insanity! Lol! #COVID19 http...
179105    @CTVNews A powerful painting by Juan Lucena. I...
179106    More than 1,200 students test positive for #CO...
179107    I stop when I see a Stop\n\n@SABCNews\n@Izinda...
Name: text, Length: 179108, dtype: object

## Cleaning

Case folding and removing tags, accented characters, punctuation, links, ashtags and tags before the translation.

In [6]:
def text_clean(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' %re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w', '', text)
    text = re.sub(r'http\w+', '', text)
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    text = contractions.fix(text)
    text = ' '.join(text.split()) 
    return text

In [27]:
cleaned_corpus = pd.DataFrame(corpus.apply(lambda x: text_clean(x)))
cleaned_corpus

Unnamed: 0,text
0,if i smelled the scent of hand sanitizers toda...
1,hey and would not it have made more sense to h...
2,trump never once claimed was a hoax we all cla...
3,the one gift has give me is an appreciation fo...
4,july media bulletin on novel
...,...
179103,thanks for nominating me for the challenge i n...
179104,the year of insanity lol
179105,a powerful painting by juan lucena its a tribu...
179106,more than students test positive for at major ...


## Translation

In [28]:
from deep_translator import GoogleTranslator

def translate_text(df):
    translator = GoogleTranslator(source = 'auto', target = 'en')
    non_translatable = []
    translated_corpus = df.copy()
    
    for i, text in enumerate(df):
        try:
            translated_text = translator.translate(text)
            translated_corpus[i] = translated_text
        except Exception:
            translated_corpus[i] = df[i]
            non_translatable.append(i)
    translated_corpus = pd.DataFrame(translated_corpus.text)
    return translated_corpus, non_translatable

In [29]:
transleted_corpus, non_trans_indeces = translate_text(cleaned_corpus)
transleted_corpus

Unnamed: 0,text
0,if i smelled the scent of hand sanitizers toda...
1,hey and would not it have made more sense to h...
2,trump never once claimed was a hoax we all cla...
3,the one gift has give me is an appreciation fo...
4,july media bulletin on novel
...,...
179103,thanks for nominating me for the challenge i n...
179104,the year of insanity lol
179105,a powerful painting by juan lucena its a tribu...
179106,more than students test positive for at major ...


## Stopwords removal

Removing stopwords that include even covid related words, repeated documents and documents with a length less than 4.

In [30]:
def stopwords_removal_with_covid(corpus):
    stop_w = stopwords.words('english')
    stop_w.extend(["covid", "coronavirus", "sars","cov"])
    cleaned_corpus = [[word for word in doc.split() if word not in stop_w and len(word)>2] for doc in corpus]
    cleaned_corpus = [' '.join(doc) for doc in cleaned_corpus if len(doc) > 3]
    cleaned_corpus = list(set(cleaned_corpus))
    return cleaned_corpus

In [31]:
cleaned_corpus_off = stopwords_removal_with_covid(transleted_corpus.text)
cleaned_corpus_off = pd.DataFrame(cleaned_corpus_off, columns = ['text'])
cleaned_corpus_off

Unnamed: 0,text
0,credit card spend declines enhanced unemployme...
1,welcome back mota bhai home minister tests neg...
2,govt supposed believe care health believing
3,india home minister tests negative
4,one slipup western australia fkd
...,...
144885,update since february fleet increased flown
144886,nations favourite teacher joe wicks helped kee...
144887,get defeated outer challenges inner weaknesses
144888,companies costliest cities paying least per ce...


A stopwords removal process excluding covid related terms, just for visual representation purposes.

In [32]:
def stopwords_removal_no_covid(corpus):
    stop_w = set(stopwords.words('english'))
    cleaned_corpus = [[word for word in str(doc).split() if word not in stop_w] for doc in corpus]
    cleaned_corpus = [' '.join(x) for x in cleaned_corpus]
    return cleaned_corpus

In [33]:
cleaned_corpus_for_graph = stopwords_removal_no_covid(transleted_corpus.text)
cleaned_corpus_for_graph = pd.DataFrame(cleaned_corpus_off, columns = ['text'])
cleaned_corpus_for_graph

Unnamed: 0,text
0,credit card spend declines enhanced unemployme...
1,welcome back mota bhai home minister tests neg...
2,govt supposed believe care health believing
3,india home minister tests negative
4,one slipup western australia fkd
...,...
144885,update since february fleet increased flown
144886,nations favourite teacher joe wicks helped kee...
144887,get defeated outer challenges inner weaknesses
144888,companies costliest cities paying least per ce...


## Lemmatization

Lemmatization using the package SpaCy.

In [13]:
nlp = spacy.load("en_core_web_sm")

def spacy_lemma(tweet):
    lemmas = [word.lemma_ for word in nlp(tweet)]
    return " ".join(lemmas)

In [35]:
lemmatized_corpus = pd.DataFrame(cleaned_corpus_off.text.apply(lambda x: spacy_lemma(x)))
lemmatized_corpus

Unnamed: 0,text
0,credit card spend decline enhance unemployment...
1,welcome back mota bhai home minister test nega...
2,govt suppose believe care health believe
3,india home minister test negative
4,one slipup western australia fkd
...,...
144885,update since february fleet increase fly
144886,nation favourite teacher joe wick help keep wo...
144887,get defeat outer challenge inner weakness
144888,company costliest city pay least per cent whit...


In [36]:
lemmatized_corpus.to_csv('covid19_tweets_off.csv', index=False)

Lemmatizing also the dataset for the graphical representations:

In [37]:
lemmatized_corpus_for_graph = pd.DataFrame(cleaned_corpus_for_graph.text.apply(lambda x: spacy_lemma(x)))
lemmatized_corpus_for_graph

Unnamed: 0,text
0,credit card spend decline enhance unemployment...
1,welcome back mota bhai home minister test nega...
2,govt suppose believe care health believe
3,india home minister test negative
4,one slipup western australia fkd
...,...
144885,update since february fleet increase fly
144886,nation favourite teacher joe wick help keep wo...
144887,get defeat outer challenge inner weakness
144888,company costliest city pay least per cent whit...


In [38]:
lemmatized_corpus_for_graph.to_csv('covid19_tweets_for_graph.csv', index=False)

# Preprocessing 'suspicious_tweets' dataset

In [3]:
file_name = 'suspicious_tweets.csv'

In [4]:
suspicious_tweet = pd.read_csv(f'{PATH}/{file_name}')
suspicious_tweet

Unnamed: 0,message,label
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",1
1,is upset that he can't update his Facebook by ...,1
2,@Kenichan I dived many times for the ball. Man...,1
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",1
...,...,...
59995,"Really wants to go and see 17 again, because Z...",1
59996,@krissa22 Thank you!,1
59997,dreaming of you,1
59998,@TheEllenShow I saw a clip online! good show!,1


## Cleaning

In [7]:
suspicious_tweet_cleaned = pd.DataFrame({'message' : suspicious_tweet.message.apply(lambda x: text_clean(x)), 'label' : suspicious_tweet.label})
suspicious_tweet_cleaned

Unnamed: 0,message,label
0,awww that is a bummer you shoulda got david ca...,1
1,is upset that he cannot update his facebook by...,1
2,i dived many times for the ball managed to sav...,1
3,my whole body feels itchy and like its on fire,0
4,no its not behaving at all i am mad why am i h...,1
...,...,...
59995,really wants to go and see again because zac e...,1
59996,thank you,1
59997,dreaming of you,1
59998,i saw a clip online good show,1


## Translation

In [8]:
from deep_translator import GoogleTranslator

def translate_text_sus(df):
    translator = GoogleTranslator(source = 'auto', target = 'en')
    non_translatable = []
    translated_corpus = df.copy()
    
    for i, message in enumerate(df):
        try:
            translated_text = translator.translate(message)
            translated_corpus[i] = translated_text
        except Exception:
            translated_corpus[i] = df[i]
            non_translatable.append(i)
    translated_corpus = pd.DataFrame(translated_corpus.message)
    return translated_corpus, non_translatable

In [9]:
translated_corpus_sus, non_translatable_sus = translate_text_sus(suspicious_tweet_cleaned)
suspicious_tweet_cleaned = pd.DataFrame({'message': translated_corpus_sus['message'].tolist(), 'label': suspicious_tweet_cleaned.drop(non_translatable_sus)['label'].tolist()})
suspicious_tweet_cleaned

Unnamed: 0,message,label
0,awww that is a bummer you shoulda got david ca...,1
1,is upset that he cannot update his facebook by...,1
2,i dived many times for the ball managed to sav...,1
3,my whole body feels itchy and like its on fire,0
4,no its not behaving at all i am mad why am i h...,1
...,...,...
59995,really wants to go and see again because zac e...,1
59996,thank you,1
59997,dreaming of you,1
59998,i saw a clip online good show,1


## Stopwords removal

In [10]:
def stopwords_removal(corpus):
    stop_w = stopwords.words('english')
    stop_w.extend(["cannot", "not", "covid", "coronavirus", "sars","cov"])
    cleaned_corpus = []
    removed_idx = []

    for i, row in corpus.iterrows():
        doc = row['message']
        words = [word for word in str(doc).split() if word not in stop_w and len(word)>2]
        cleaned_doc = ' '.join(words)
        if len(cleaned_doc.split()) > 3:
            cleaned_corpus.append(cleaned_doc)
        else:
            removed_idx.append(i)
    label = list(corpus.drop(removed_idx)['label'])
    final_df = pd.DataFrame({'message' : cleaned_corpus, 'label' : label})
    
    return final_df

In [11]:
suspicious_tweet_cleaned = stopwords_removal(suspicious_tweet_cleaned)
suspicious_tweet_cleaned

Unnamed: 0,message,label
0,awww bummer shoulda got david carr third day,1
1,upset update facebook texting might cry result...,1
2,dived many times ball managed save rest bounds,1
3,whole body feels itchy like fire,0
4,hey long time see yes rains bit bit lol fine t...,1
...,...,...
45589,awakee dressed ready gooo haha lol,1
45590,gorgeous weather seem manchester,1
45591,well days almost think going watch dvds probab...,1
45592,really wants see zac efron amazingly fit,1


## Lemmatization

In [14]:
suspicious_tweet_cleaned = pd.DataFrame({'message' : suspicious_tweet_cleaned.message.apply(lambda x: spacy_lemma(x)), 'label' : suspicious_tweet_cleaned.label})
suspicious_tweet_cleaned

Unnamed: 0,message,label
0,awww bummer shoulda get david carr third day,1
1,upset update facebook texting might cry result...,1
2,dive many time ball manage save rest bound,1
3,whole body feel itchy like fire,0
4,hey long time see yes rain bit bit lol fine thank,1
...,...,...
45589,awakee dress ready gooo haha lol,1
45590,gorgeous weather seem manchester,1
45591,well day almost think go watch dvds probably a...,1
45592,really want see zac efron amazingly fit,1


In [15]:
suspicious_tweet_cleaned.to_csv('suspicious_tweets_off.csv', index=False)