# Twitter Sentimental Analysis

In [1]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
df = pd.read_csv('drive/My Drive/Colab Notebooks/twitter sentiment/Tweets.csv')
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
# For sentimental analysis I just want the actual tweet and the label for it

tweet = df[['airline_sentiment', 'text']].copy()
tweet.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [4]:
tweet.isnull().sum()

airline_sentiment    0
text                 0
dtype: int64

In [5]:
blanks = []

# (index,label, review text)
for i,lb,rv in tweet.itertuples():
  if rv.isspace():
    blanks.append(i)

blanks

[]

In [0]:
# There are no missing values and there are no blanks within the text so we can proceed with the sentiment analysis

In [7]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




In [0]:
tweet['score'] = tweet['text'].apply(lambda review:sid.polarity_scores(review))
tweet['compound'] = tweet['score'].apply(lambda d:d['compound'])
tweet['comp_score'] = tweet['compound'].apply(lambda score: 'positive' if score > 0 else ('neutral' if score == 0.00 else 'negative'))

In [9]:
tweet.head()

# If the compound score is 0 it is neutral, >0 positive, <0 negative

# Now that we caluclating the sentimental analysis via NLTK let's see how it
# it fairs via confusion matrix and classification reports

Unnamed: 0,airline_sentiment,text,score,compound,comp_score
0,neutral,@VirginAmerica What @dhepburn said.,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,neutral
1,positive,@VirginAmerica plus you've added commercials t...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,neutral
2,neutral,@VirginAmerica I didn't today... Must mean I n...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,neutral
3,negative,@VirginAmerica it's really aggressive to blast...,"{'neg': 0.246, 'neu': 0.754, 'pos': 0.0, 'comp...",-0.5984,negative
4,negative,@VirginAmerica and it's a really big bad thing...,"{'neg': 0.321, 'neu': 0.679, 'pos': 0.0, 'comp...",-0.5829,negative


In [10]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

print(accuracy_score(tweet['airline_sentiment'] ,tweet['comp_score']))
print(classification_report(tweet['airline_sentiment'] , tweet['comp_score']))

# We can see that our analysis is only 50% accurate however this is just
# a simple method of using nltk, let's try using machine learning

0.5465163934426229
              precision    recall  f1-score   support

    negative       0.90      0.50      0.65      9178
     neutral       0.40      0.42      0.41      3099
    positive       0.33      0.87      0.48      2363

    accuracy                           0.55     14640
   macro avg       0.54      0.60      0.51     14640
weighted avg       0.70      0.55      0.57     14640



In [11]:
tweet.comp_score.value_counts()

#there is an imbalancement so we will have to balance the dataset and we want to
# use our generated comp_score because in the real world there is no label if
# the sentiment is positive, neutral or negative

positive    6222
negative    5153
neutral     3265
Name: comp_score, dtype: int64

In [12]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = tweet[tweet.comp_score =='positive']
df_minority = tweet[tweet.comp_score =='negative']
df_minority1 = tweet[tweet.comp_score == 'neutral']
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=6222,    # to match majority class
                                 random_state=123) # reproducible results

df_minority1_upsampled = resample(df_minority1, 
                                 replace=True,    
                                 n_samples=6222,    
                                 random_state=123)
 
# Combine majority class with upsampled minority class
new_tweet = pd.concat([df_majority, df_minority_upsampled , df_minority1_upsampled])
 
# Display new class counts
new_tweet.comp_score.value_counts()

neutral     6222
negative    6222
positive    6222
Name: comp_score, dtype: int64

In [0]:
from sklearn.model_selection import train_test_split

X = new_tweet['text' ]
y = new_tweet['comp_score']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33,
                                                   random_state = 42)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer combines the steps of countvectorization and tfidftransformer
# Basically it learns the vocabulary, counts the number of words then transform
# X_train to a numerical vector while also gives important words more weight

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer(max_df= 0.95, min_df= 2, 
                                               stop_words= 'english')), 
                     ('clf' , LinearSVC())])
text_clf.fit(X_train,y_train)
predictions = text_clf.predict(X_test)

In [29]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    negative       0.84      0.83      0.83      2057
     neutral       0.85      0.92      0.88      2075
    positive       0.85      0.79      0.82      2028

    accuracy                           0.85      6160
   macro avg       0.85      0.84      0.84      6160
weighted avg       0.85      0.85      0.84      6160



In [30]:
print(accuracy_score(y_test ,predictions))

0.8452922077922078


In [31]:
from sklearn.linear_model import LogisticRegression

text_clf1 = Pipeline([('tfidf', TfidfVectorizer(max_df= 0.95, min_df= 2, 
                                               stop_words= 'english')), 
                     ('Log' , LogisticRegression())])
text_clf1.fit(X_train,y_train)
predictions1 = text_clf1.predict(X_test)
print(classification_report(y_test, predictions1))
print(accuracy_score(y_test ,predictions1))



              precision    recall  f1-score   support

    negative       0.81      0.78      0.79      2057
     neutral       0.79      0.92      0.85      2075
    positive       0.85      0.74      0.79      2028

    accuracy                           0.81      6160
   macro avg       0.82      0.81      0.81      6160
weighted avg       0.82      0.81      0.81      6160

0.8128246753246753


In [32]:
from sklearn.tree import DecisionTreeClassifier

text_clf2 = Pipeline([('tfidf', TfidfVectorizer(max_df= 0.95, min_df= 2, 
                                               stop_words= 'english')), 
                     ('DT' , DecisionTreeClassifier())])
text_clf2.fit(X_train,y_train)
predictions2 = text_clf2.predict(X_test)
print(classification_report(y_test, predictions2))
print(accuracy_score(y_test ,predictions2))

              precision    recall  f1-score   support

    negative       0.81      0.81      0.81      2057
     neutral       0.79      0.92      0.85      2075
    positive       0.83      0.68      0.75      2028

    accuracy                           0.80      6160
   macro avg       0.81      0.80      0.80      6160
weighted avg       0.81      0.80      0.80      6160

0.8048701298701298


In [33]:
from sklearn.ensemble import RandomForestClassifier

text_clf3 = Pipeline([('tfidf', TfidfVectorizer(max_df= 0.95, min_df= 2, 
                                               stop_words= 'english')), 
                     ('RF' , RandomForestClassifier(n_estimators=100))])
text_clf3.fit(X_train,y_train)
predictions3 = text_clf3.predict(X_test)
print(classification_report(y_test, predictions3))
print(accuracy_score(y_test ,predictions3))

              precision    recall  f1-score   support

    negative       0.86      0.83      0.84      2057
     neutral       0.78      0.97      0.87      2075
    positive       0.90      0.72      0.80      2028

    accuracy                           0.84      6160
   macro avg       0.85      0.84      0.84      6160
weighted avg       0.85      0.84      0.84      6160

0.8392857142857143


In [0]:
# As we can see the highest accuracy was with LinearSVC at around 84%
# We can test out our prediction

In [108]:
for i in range(0,20):  
  print("Label of tweet",[tweet.iloc[:,0][i]])
  print("Actual tweet",[tweet.iloc[:,1][i]])
  print("Computed Score of tweet",[tweet.iloc[:,3][i]])
  print("Predicted label",text_clf.predict([tweet.iloc[:,1][i]]))
  print('\n')

Label of tweet ['neutral']
Actual tweet ['@VirginAmerica What @dhepburn said.']
Computed Score of tweet [0.0]
Predicted label ['neutral']


Label of tweet ['positive']
Actual tweet ["@VirginAmerica plus you've added commercials to the experience... tacky."]
Computed Score of tweet [0.0]
Predicted label ['neutral']


Label of tweet ['neutral']
Actual tweet ["@VirginAmerica I didn't today... Must mean I need to take another trip!"]
Computed Score of tweet [0.0]
Predicted label ['neutral']


Label of tweet ['negative']
Actual tweet ['@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse']
Computed Score of tweet [-0.5984]
Predicted label ['positive']


Label of tweet ['negative']
Actual tweet ["@VirginAmerica and it's a really big bad thing about it"]
Computed Score of tweet [-0.5829]
Predicted label ['negative']


Label of tweet ['negative']
Actual tweet ["@VirginAmerica seriously would pay $30 a flight for seats 

In [0]:
# For the most part our predictive label matches with the actual label
# Some of the improvements would be replacing emojis with words
# Taking out hashtags