In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [21]:
df = pd.read_csv('twitter.csv')
df = df[['Text', 'Sentiment']]

print(df.Sentiment.value_counts())

Negative              1040
Positive               947
Neutral                619
Extremely Positive     599
Extremely Negative     591
Name: Sentiment, dtype: int64


In [22]:
import re
import string

def preprocess(txt):
    txt = txt.lower()
    txt = re.sub("\d", " ", txt)
    txt = re.sub('[%s]' % re.escape(string.punctuation), ' ', txt)
    txt = re.sub('\n', ' ', txt)
    return txt

In [23]:
print(df)
df.Text = df.Text.apply(preprocess)

                                                   Text           Sentiment
0     When I couldn't find hand sanitizer at Fred Me...            Positive
1     Find out how you can protect yourself and love...  Extremely Positive
2     #toiletpaper #dunnypaper #coronavirus #coronav...             Neutral
3     Do you remember the last time you paid $2.99 a...             Neutral
4     Voting in the age of #coronavirus = hand sanit...            Positive
...                                                 ...                 ...
3791  Meanwhile In A Supermarket in Israel -- People...            Positive
3792  Did you panic buy a lot of non-perishable item...            Negative
3793  Asst Prof of Economics @cconces was on @NBCPhi...             Neutral
3794  Gov need to do somethings instead of biar je r...  Extremely Negative
3795  I and @ForestandPaper members are committed to...  Extremely Positive

[3796 rows x 2 columns]


In [24]:
print(df)

                                                   Text           Sentiment
0     when i couldn t find hand sanitizer at fred me...            Positive
1     find out how you can protect yourself and love...  Extremely Positive
2      toiletpaper  dunnypaper  coronavirus  coronav...             Neutral
3     do you remember the last time you paid       a...             Neutral
4     voting in the age of  coronavirus   hand sanit...            Positive
...                                                 ...                 ...
3791  meanwhile in a supermarket in israel    people...            Positive
3792  did you panic buy a lot of non perishable item...            Negative
3793  asst prof of economics  cconces was on  nbcphi...             Neutral
3794  gov need to do somethings instead of biar je r...  Extremely Negative
3795  i and  forestandpaper members are committed to...  Extremely Positive

[3796 rows x 2 columns]


In [25]:
cvec = CountVectorizer()
X_transformed = cvec.fit_transform(df.Text)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, df.Sentiment, test_size=0.2)

In [26]:
def eval_model(y_true, y_predicted):
    print('accuracy score', accuracy_score(y_true, y_predicted))
    print(classification_report(y_true, y_predicted))

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

eval_model(y_test, y_pred)

accuracy score 0.4342105263157895
                    precision    recall  f1-score   support

Extremely Negative       0.53      0.48      0.50       116
Extremely Positive       0.61      0.41      0.49       136
          Negative       0.42      0.39      0.41       217
           Neutral       0.39      0.55      0.46       108
          Positive       0.35      0.40      0.37       183

          accuracy                           0.43       760
         macro avg       0.46      0.45      0.45       760
      weighted avg       0.45      0.43      0.44       760
