In [1]:
import pandas as pd
import re, string
import emoji
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('Corona_NLP_test.csv')
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       3798 non-null   int64 
 1   ScreenName     3798 non-null   int64 
 2   Location       2964 non-null   object
 3   TweetAt        3798 non-null   object
 4   OriginalTweet  3798 non-null   object
 5   Sentiment      3798 non-null   object
dtypes: int64(2), object(4)
memory usage: 178.2+ KB


In [4]:
df['Sentiment'].unique()

array(['Extremely Negative', 'Positive', 'Extremely Positive', 'Negative',
       'Neutral'], dtype=object)

In [5]:
df = df[['OriginalTweet','Sentiment']]
df.head()

Unnamed: 0,OriginalTweet,Sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,When I couldn't find hand sanitizer at Fred Me...,Positive
2,Find out how you can protect yourself and love...,Extremely Positive
3,#Panic buying hits #NewYork City as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [6]:
#data cleaning

def strip_emoji(text):
    return emoji.replace_emoji(text,r'')

def strip_all_entities(text): 
    text = text.replace('\r', '').replace('\n', ' ').replace('\n', ' ').lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub(r'[^\x00-\x7f]',r'', text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

def remove_mult_spaces(text): # remove multiple spaces
    return re.sub("\s\s+" , " ", text)


In [7]:
texts_new = []
for t in df.OriginalTweet:
    texts_new.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(strip_emoji(t))))))

In [8]:
df['text_clean'] = texts_new
df['text_clean'].head()

0    trending new yorkers encounter empty supermark...
1    when i couldnt find hand sanitizer at fred mey...
2    find out how you can protect yourself and love...
3    panic buying hits newyork city as anxious shop...
4    toiletpaper dunnypaper coronavirus coronavirus...
Name: text_clean, dtype: object

In [9]:
df = df.drop('OriginalTweet',axis=1)
df.head()

Unnamed: 0,Sentiment,text_clean
0,Extremely Negative,trending new yorkers encounter empty supermark...
1,Positive,when i couldnt find hand sanitizer at fred mey...
2,Extremely Positive,find out how you can protect yourself and love...
3,Negative,panic buying hits newyork city as anxious shop...
4,Neutral,toiletpaper dunnypaper coronavirus coronavirus...


In [10]:
def sentiment_to_int(sentiment):
    if sentiment == 'Extremely Negative':
        return 0
    elif sentiment == 'Negative':
        return 0
    elif sentiment == 'Neutral':
        return 1
    elif sentiment == 'Positive':
        return 1
    elif sentiment == 'Extremely Positive':
        return 1


In [11]:
df['Sentiment'] = df['Sentiment'].apply( lambda x: sentiment_to_int(x))

In [12]:
df.head()

Unnamed: 0,Sentiment,text_clean
0,0,trending new yorkers encounter empty supermark...
1,1,when i couldnt find hand sanitizer at fred mey...
2,1,find out how you can protect yourself and love...
3,0,panic buying hits newyork city as anxious shop...
4,1,toiletpaper dunnypaper coronavirus coronavirus...


In [13]:
print(df['text_clean'][0])

trending new yorkers encounter empty supermarket shelves pictured wegmans in brooklyn soldout online grocers foodkick maxdelivery as coronavirusfearing shoppers stock up


In [14]:
X = df['text_clean']
y = df['Sentiment']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
print(X_train.shape)
print(y_train.shape)

(2658,)
(2658,)


In [79]:
tvec = CountVectorizer(ngram_range=(1,3)).fit(X_train)

In [80]:
len(tvec.get_feature_names())



124165

In [81]:
X_train_vectorized = tvec.transform(X_train)

In [82]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression()

In [83]:
predictions = model.predict(tvec.transform(X_test))
print(predictions)

[0 0 1 ... 1 1 0]


In [84]:
print(y_test)

1318    0
2899    0
2423    1
2145    1
2108    0
       ..
1830    0
2774    1
1395    1
1896    1
3504    1
Name: Sentiment, Length: 1140, dtype: int64


In [85]:
model.score(tvec.transform(X_test), y_test)

0.7236842105263158

In [86]:
import numpy as np
feature_names = np.array(tvec.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()



In [87]:
print("Negative words", feature_names[sorted_coef_index[:10]])

Negative words ['panic' 'no' 'empty' 'crisis' 'stop' 'demand' 'sick' 'emergency' 'fear'
 'bad']


In [88]:
print("Positive words", feature_names[sorted_coef_index[:-11:-1]])

Positive words ['hand' 'good' 'like' 'please' 'help' 'great' 'wont' 'free' 'safe'
 'of food']


In [93]:
#test on custom tweet
tweet = "people are not in panic"
pred = model.predict(tvec.transform([tweet]))
print(pred)

[0]
