# Sentiment Classification of Covid Tweets

In [1]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
raw_df = pd.read_csv('/content/drive/MyDrive/Winter Data Comp/dataset/Corona_NLP_train.csv',encoding='latin1')

## Simple check on data

In [None]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [None]:
# Missing values in Location
raw_df.isnull().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [None]:
raw_df.head(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,As news of the regionÂs first confirmed COVID...,Positive
6,3805,48757,"35.926541,-78.753267",16-03-2020,Cashier at grocery store was sharing his insig...,Positive
7,3806,48758,Austria,16-03-2020,Was at the supermarket today. Didn't buy toile...,Neutral
8,3807,48759,"Atlanta, GA USA",16-03-2020,Due to COVID-19 our retail store and classroom...,Positive
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,"For corona prevention,we should stop to buy th...",Negative


In [None]:
raw_df.Sentiment.value_counts()

Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: Sentiment, dtype: int64

In [None]:
# Too much different location -> need to figure out ways to group them 
raw_df.Location.value_counts()

London                         540
United States                  528
London, England                520
New York, NY                   395
Washington, DC                 373
                              ... 
Norwich, Norfolk                 1
?Space Galaxy ?                  1
Haslemere                        1
Muskoka aka Hollywood North      1
Al Rayyan, Qatar                 1
Name: Location, Length: 12220, dtype: int64

In [None]:
# all UserName & ScreenName are unique so these two cols are useless
raw_df[['UserName','ScreenName']].nunique()

UserName      41157
ScreenName    41157
dtype: int64

## Text data preprocessing

In [None]:
tweets = raw_df.OriginalTweet

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# lower case
tweets = tweets.str.lower()
tweets

0        @menyrbie @phil_gahan @chrisitv https://t.co/i...
1        advice talk to your neighbours family to excha...
2        coronavirus australia: woolworths to give elde...
3        my food stock is not the only one which is emp...
4        me, ready to go at supermarket during the #cov...
                               ...                        
41152    airline pilots offering to stock supermarket s...
41153    response to complaint not provided citing covi...
41154    you know itâs getting tough when @kameronwild...
41155    is it wrong that the smell of hand sanitizer i...
41156    @tartiicat well new/used rift s are going for ...
Name: OriginalTweet, Length: 41157, dtype: object

In [None]:
# get all tags & mentions & url
get_tags = lambda x: re.findall('#[A-Za-z0-9_]+',x)
get_mentions = lambda x: re.findall('@[A-Za-z0-9_]+',x)
tags = tweets.map(get_tags)
mentions = tweets.map(get_mentions)

In [None]:
get_urls = lambda x:re.findall('https?\S+',x)
urls = tweets.map(get_urls)

In [None]:
urls

0        [https://t.co/ifz9fan2pa, https://t.co/xx6ghgf...
1                                                       []
2                                [https://t.co/binca9vp8p]
3                                [https://t.co/zrlg0z520j]
4                                [https://t.co/usmualq72n]
                               ...                        
41152                            [https://t.co/cz89ua0hnp]
41153                                                   []
41154                                                   []
41155                                                   []
41156                                                   []
Name: OriginalTweet, Length: 41157, dtype: object

In [None]:
tags.shape

(41157,)

In [None]:
mentions.shape

(41157,)

In [None]:
# remove tags & mentions & hyperlinks 
replace_urls = lambda x: re.sub('https?\S+',' ',x)
replace_tags = lambda x: re.sub('#[A-Za-z0-9_]+','',x)
replace_mentions = lambda x: re.sub('@[A-Za-z0-9_]+','',x)
tweets = tweets.map(replace_urls).map(replace_tags).map(replace_mentions)

In [None]:
# double check
tweets[tweets.str.contains('<br>')]

Series([], Name: OriginalTweet, dtype: object)

In [None]:
tweets.iloc[17992]

'listen look...  \r\r\nhow i was feeling a few days ago!   thanks  buttt whewwww!!!\r\r\n       '

In [None]:
# get html special entities
get_entities = lambda x: re.findall('&[a-zA-Z0-9]+',x)
html_entities = tweets.map(get_entities)
html_entities.value_counts()

TypeError: ignored

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[]                                                         38137
[&amp]                                                      2083
[&amp, &amp]                                                 571
[&amp, &amp, &amp]                                           145
[&gt]                                                         74
[&gt, &gt]                                                    34
[&amp, &amp, &amp, &amp]                                      29
[&gt, &gt, &gt]                                               17
[&lt]                                                         11
[&amp, &gt, &gt, &gt]                                          6
[&amp, &gt]                                                    6
[&gt, &gt, &gt, &gt]                                           5
[&lt, &gt]                                                     5
[&amp, &amp, &gt]                                              3
[&amp, &amp, &amp, &amp, &amp]                                 3
[&gt, &amp]              

In [None]:
# replace other strange stuff like \r \n
useless_stuff = '\\r|\\n|\&amp|\&gt|\&lt'
tweets = tweets.str.replace(useless_stuff,' ')

In [None]:
# replace punctuation 
replace_punc = lambda x: re.sub('[^a-zA-Z0-9]',' ',x)
tweets = tweets.map(replace_punc)

In [None]:
tweets.shape

(41157,)

In [None]:
raw_df.iloc[0].OriginalTweet

'@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/iFz9FAn2Pa and https://t.co/xX6ghGFzCC and https://t.co/I2NlzdxNo8'

In [None]:
# stemming & remove stopwords
ps=PorterStemmer()
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def stemming(x):
  word_lst = x.split()
  word_lst= [ps.stem(word) for word in word_lst if word not in stopwords.words('english')]
  result = " ".join(word_lst)
  return result

In [None]:
test_text = "i you re he s him be is was has and and we ve we ll they ain t and or she is butterfly oh no"
stemming(test_text)

'butterfli oh'

In [None]:
clean_tweets = tweets.map(stemming)

In [None]:
clean_tweets.shape

(41157,)

In [None]:
col_names = ['UserName','ScreenName','Location','TweetAt','OriginalTweet','Sentiment','CleanTweet','Url','Tag','Mention']
clean_set = pd.concat([raw_df,clean_tweets,urls,tags,mentions], axis=1).set_axis(col_names,axis=1)

In [None]:
clean_set.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,CleanTweet,Url,Tag,Mention
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,,"[https://t.co/ifz9fan2pa, https://t.co/xx6ghgf...",[],"[@menyrbie, @phil_gahan, @chrisitv]"
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advic talk neighbour famili exchang phone numb...,[],[],[]
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,coronaviru australia woolworth give elderli di...,[https://t.co/binca9vp8p],[],[]
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,food stock one empti pleas panic enough food e...,[https://t.co/zrlg0z520j],"[#covid19france, #covid_19, #covid19, #coronav...",[]
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,readi go supermarket outbreak paranoid food st...,[https://t.co/usmualq72n],"[#covid19, #coronavirus, #coronavirusfrance, #...",[]


In [None]:
clean_set.iloc[17992]

UserName                                                     21791
ScreenName                                                   66743
Location                                             Las Vegas, NV
TweetAt                                                 23-03-2020
OriginalTweet    Listen look... #toiletpaper \r\r\nHow I was fe...
Sentiment                                                 Positive
CleanTweet            listen look feel day ago thank buttt whewwww
Url                                      [https://t.co/lrsxrlelnt]
Tag              [#toiletpaper, #foundit, #tissue, #coronavirus...
Mention                                              [@cottonelle]
Name: 17992, dtype: object

In [None]:
clean_set.shape

(41157, 10)

In [None]:
# export data as rdata
import feather
feather.write_dataframe(clean_set, '/content/drive/MyDrive/Winter Data Comp/dataset/Corona_NLP_train_clean.feather')

In [None]:
clean_set.to_csv('/content/drive/MyDrive/Winter Data Comp/dataset/Corona_NLP_train_clean2.csv')

In [None]:
clean_set_test = pd.read_csv('/content/drive/MyDrive/Winter Data Comp/dataset/Corona_NLP_train_clean2.csv')

In [None]:
clean_set_test.shape

(41159, 11)

In [None]:
clean_train.tail()

Unnamed: 0.1,Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,CleanTweet,Url,Tag,Mention
41154,41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral,airlin pilot offer stock supermarket shelv loc...,['https://t.co/cz89ua0hnp'],"['#nz', '#covid']",[]
41155,41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative,respons complaint provid cite covid 19 relat d...,[],[],[]
41156,41154,44953,89905,,14-04-2020,You know itÃÂs getting tough when @KameronWi...,Positive,know get tough ration toilet paper martinsvil ...,[],"['#coronavirus', '#toiletpaper']","['@kameronwilds', '@kroger']"
41157,41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral,wrong smell hand sanit start turn,[],"['#coronavirus', '#covid19', '#coronavirus']",[]
41158,41156,44955,89907,i love you so much || he/him,14-04-2020,@TartiiCat Well new/used Rift S are going for ...,Negative,well new use rift go 700 00 amazon rn although...,[],[],['@tartiicat']


In [None]:
clean_set_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41159 entries, 0 to 41158
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41158 non-null  object
 1   ScreenName     41159 non-null  object
 2   Location       32569 non-null  object
 3   TweetAt        41159 non-null  object
 4   OriginalTweet  41159 non-null  object
 5   Sentiment      41156 non-null  object
 6   CleanTweet     41111 non-null  object
 7   Url            41155 non-null  object
 8   Tag            41155 non-null  object
 9   Mention        41155 non-null  object
dtypes: object(10)
memory usage: 3.1+ MB


In [None]:
# wrap all the stuff up in a func for future use

ps=PorterStemmer()
nltk.download('stopwords')

def stemming(x):
  word_lst = x.split()
  word_lst= [ps.stem(word) for word in word_lst if word not in stopwords.words('english')]
  result = " ".join(word_lst)
  return result

def text_cleaning(df):
  tweets = df.OriginalTweet
  tweets = tweets.str.lower()

  get_tags = lambda x: re.findall('#[A-Za-z0-9_]+',x)
  get_mentions = lambda x: re.findall('@[A-Za-z0-9_]+',x)
  get_urls = lambda x:re.findall('https?\S+',x)

  tags = tweets.map(get_tags)
  mentions = tweets.map(get_mentions)
  urls = tweets.map(get_urls)

  replace_urls = lambda x: re.sub('https?\S+',' ',x)
  replace_tags = lambda x: re.sub('#[A-Za-z0-9_]+','',x)
  replace_mentions = lambda x: re.sub('@[A-Za-z0-9_]+','',x)
  replace_punc1 = lambda x: re.sub('[^a-zA-Z0-9]',' ',x)
  replace_punc2 = lambda x: re.sub('\\r|\\n|\&amp|\&gt|\&lt',' ',x)
  tweets = tweets.map(replace_urls).map(replace_tags).map(replace_mentions).map(replace_punc1).map(replace_punc2)

  clean_tweets = tweets.map(stemming)

  col_names = ['UserName','ScreenName','Location','TweetAt','OriginalTweet','Sentiment','CleanTweet','Url','Tag','Mention']
  clean_df = pd.concat([df,clean_tweets,urls,tags,mentions], axis=1).set_axis(col_names,axis=1)

  return clean_df



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
test = pd.read_csv('/content/drive/MyDrive/Winter Data Comp/dataset/Corona_NLP_test.csv',encoding = 'latin1')

In [None]:
test.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [None]:
clean_test = text_cleaning(test)

In [None]:
clean_test.to_csv('/content/drive/MyDrive/Winter Data Comp/dataset/Corona_NLP_test_clean.csv')

In [None]:
raw_df = pd.read_csv('/content/drive/MyDrive/Winter Data Comp/dataset/Corona_NLP_train.csv',encoding='latin1')
clean_train = pd.read_csv('/content/drive/MyDrive/Winter Data Comp/dataset/Corona_NLP_train_clean.csv',encoding='latin1')

In [None]:
raw_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [None]:
clean_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41159 entries, 0 to 41158
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     41158 non-null  object
 1   UserName       41159 non-null  object
 2   ScreenName     41159 non-null  object
 3   Location       32569 non-null  object
 4   TweetAt        41159 non-null  object
 5   OriginalTweet  41158 non-null  object
 6   Sentiment      41155 non-null  object
 7   CleanTweet     41111 non-null  object
 8   Url            41155 non-null  object
 9   Tag            41155 non-null  object
 10  Mention        41155 non-null  object
dtypes: object(11)
memory usage: 3.5+ MB


In [None]:
clean_train.UserName.nunique()

41159

In [None]:
clean_train.UserName.value_counts()

5382     1
43150    1
26967    1
15923    1
21007    1
        ..
24408    1
15098    1
18927    1
34524    1
22307    1
Name: UserName, Length: 41159, dtype: int64

## Other features labeling

In [3]:
train = pd.read_csv("/content/drive/MyDrive/Winter Data Comp/dataset/final_dataset.csv",encoding = 'latin1')

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   index          41157 non-null  int64 
 1   UserName       41157 non-null  int64 
 2   ScreenName     41157 non-null  int64 
 3   Location       32567 non-null  object
 4   TweetAt        41157 non-null  object
 5   OriginalTweet  41157 non-null  object
 6   Sentiment      41157 non-null  object
 7   CleanTweet     41113 non-null  object
 8   Url            41157 non-null  object
 9   Tag            41157 non-null  object
 10  Mention        41157 non-null  object
 11  Num_AT         41157 non-null  int64 
 12  Num_EX         41157 non-null  int64 
 13  Num_HPLink     41157 non-null  int64 
 14  Num_Q          41157 non-null  int64 
 15  Num_TAG        41157 non-null  int64 
dtypes: int64(8), object(8)
memory usage: 5.0+ MB


In [16]:
from sklearn.preprocessing import OrdinalEncoder
cat = list(np.array(['Extremely Negative','Negative', 'Neutral', 'Positive','Extremely Positive']).reshape(1,5))
ordenc = OrdinalEncoder(categories=cat)

In [18]:
sentiment = np.array(train.Sentiment).reshape(-1, 1)
train['SentimentLabeled'] = ordenc.fit_transform(sentiment)
ordenc.categories_

[array(['Extremely Negative', 'Negative', 'Neutral', 'Positive',
        'Extremely Positive'], dtype=object)]

In [19]:
train.head(10)

Unnamed: 0,index,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,CleanTweet,Url,Tag,Mention,Num_AT,Num_EX,Num_HPLink,Num_Q,Num_TAG,SentimentLabeled
0,0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,,"['https://t.co/ifz9fan2pa', 'https://t.co/xx6g...",[],"['@menyrbie', '@phil_gahan', '@chrisitv']",3,0,3,0,0,2.0
1,1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advic talk neighbour famili exchang phone numb...,[],[],[],0,0,0,0,0,3.0
2,2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,coronaviru australia woolworth give elderli di...,['https://t.co/binca9vp8p'],[],[],0,0,1,0,0,3.0
3,3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,food stock one empti pleas panic enough food e...,['https://t.co/zrlg0z520j'],"['#covid19france', '#covid_19', '#covid19', '#...",[],0,0,1,0,7,3.0
4,4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,readi go supermarket outbreak paranoid food st...,['https://t.co/usmualq72n'],"['#covid19', '#coronavirus', '#coronavirusfran...",[],0,0,1,0,6,0.0
5,5,3804,48756,"ÃÂT: 36.319708,-82.363649",16-03-2020,As news of the regionÃÂs first confirmed COV...,Positive,news region first confirm covid 19 case came s...,['https://t.co/cfxch7a2lu'],[],['@tim_dodson'],1,0,1,0,0,3.0
6,6,3805,48757,"35.926541,-78.753267",16-03-2020,Cashier at grocery store was sharing his insig...,Positive,cashier groceri store share insight prove cred...,['https://t.co/iefdnehgdo'],['#covid_19'],[],0,0,1,2,1,3.0
7,7,3806,48758,Austria,16-03-2020,Was at the supermarket today. Didn't buy toile...,Neutral,supermarket today buy toilet paper,['https://t.co/evxkqlidaz'],"['#rebel', '#toiletpapercrisis', '#covid_19']",[],0,0,1,0,3,2.0
8,8,3807,48759,"Atlanta, GA USA",16-03-2020,Due to COVID-19 our retail store and classroom...,Positive,due covid 19 retail store classroom atlanta op...,['https://t.co/kw91zj5o5i'],[],[],0,2,1,0,0,3.0
9,9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,"For corona prevention,we should stop to buy th...",Negative,corona prevent stop buy thing cash use onlin p...,[],"['#govindia', '#indiafightscorona']",[],0,0,0,0,2,1.0
