In [1]:
# Jolie McDonnell 12/28/20 TASK 1: NLP Sentiment Analysis

# SOURCES: 
VADER: https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671; https://www.geeksforgeeks.org/python-sentiment-analysis-using-vader/

BERT: https://huggingface.co/ipuneetrathore/bert-base-cased-finetuned-finBERT

SKLEARN: https://stackabuse.com/python-for-nlp-sentiment-analysis-with-scikit-learn/

In [2]:
test_data = ["Today is a good day!",
             "My friend is a snake..",
             "I'm so pissed",
             "Why did you do that?",
             "This product was horrible and broke in 5 minutes",
             "I love hot cocoa and sledding but the snow hurts my toes",
             "OMG I want a puppy!", 
             "I wish I could be happier and spread kindness.",
             "My favorite color is blue.",
             "You're disgusting and mean and I hate you."
            ]

# NLTK VADER

In [4]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# from nltk.corpus import twitter_samples
analyzer = SentimentIntensityAnalyzer()

df1 = pd.DataFrame({'sentence' : test_data})
df1['neg'] = df1['sentence'].apply(lambda x:analyzer.polarity_scores(x)['neg'])
df1['neu'] = df1['sentence'].apply(lambda x:analyzer.polarity_scores(x)['neu'])
df1['pos'] = df1['sentence'].apply(lambda x:analyzer.polarity_scores(x)['pos'])
df1['compound'] = df1['sentence'].apply(lambda x:analyzer.polarity_scores(x)['compound'])
df1['comp_score'] = df1['compound'].apply(lambda c: 'positive' if c >=0.05 else 'negative' if c <=-0.05 else 'neutral')
df1

Unnamed: 0,sentence,neg,neu,pos,compound,comp_score
0,Today is a good day!,0.0,0.484,0.516,0.4926,positive
1,My friend is a snake..,0.0,0.484,0.516,0.4939,positive
2,I'm so pissed,0.692,0.308,0.0,-0.6697,negative
3,Why did you do that?,0.0,1.0,0.0,0.0,neutral
4,This product was horrible and broke in 5 minutes,0.512,0.488,0.0,-0.743,negative
5,I love hot cocoa and sledding but the snow hur...,0.263,0.571,0.165,-0.3716,negative
6,OMG I want a puppy!,0.0,0.557,0.443,0.1511,positive
7,I wish I could be happier and spread kindness.,0.0,0.305,0.695,0.8442,positive
8,My favorite color is blue.,0.0,0.571,0.429,0.4588,positive
9,You're disgusting and mean and I hate you.,0.587,0.413,0.0,-0.7964,negative


In [5]:
df1[['sentence','comp_score']]

Unnamed: 0,sentence,comp_score
0,Today is a good day!,positive
1,My friend is a snake..,positive
2,I'm so pissed,negative
3,Why did you do that?,neutral
4,This product was horrible and broke in 5 minutes,negative
5,I love hot cocoa and sledding but the snow hur...,negative
6,OMG I want a puppy!,positive
7,I wish I could be happier and spread kindness.,positive
8,My favorite color is blue.,positive
9,You're disgusting and mean and I hate you.,negative


# BERT 

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

In [8]:
## Downloads pretrained finBERT model
tokenizer = AutoTokenizer.from_pretrained("ipuneetrathore/bert-base-cased-finetuned-finBERT")
model = AutoModelForSequenceClassification.from_pretrained("ipuneetrathore/bert-base-cased-finetuned-finBERT")

In [9]:
review_text = """Today is a good day!"""

In [10]:
MAX_LEN = 160
class_names = ['negative', 'neutral', 'positive']

def sentiment(review_text,MAX_LEN,class_names):
    encoded_new = tokenizer.encode_plus(
                            review_text,                      # Sentence to encode.
                            add_special_tokens = True,        # Add '[CLS]' and '[SEP]'
                            max_length = MAX_LEN,             # Pad & truncate all sentences.
                            pad_to_max_length = True,
                            return_attention_mask = True,     # Construct attn. masks.
                            return_tensors = 'pt',            # Return pytorch tensors.
                       )

    # Add the encoded sentence to the list.    
    input_idst = (encoded_new['input_ids'])
    attention_maskst = (encoded_new['attention_mask'])

    # Convert the lists into tensors.
    input_idst = torch.cat([input_idst], dim=0)
    attention_maskst = torch.cat([attention_maskst], dim=0)


    new_test_output = model(input_idst, token_type_ids=None, 
                          attention_mask=attention_maskst)

    logits = new_test_output[0]
    predicted = logits.detach().numpy()

    # Store predictions
    flat_predictions = np.concatenate(predicted, axis=0)

    # For each sample, pick the label (0 or 1) with the higher score.
    new_predictions = np.argmax(flat_predictions).flatten()

    return class_names[new_predictions[0]]

result=[]
for i in test_data:
    x=sentiment(i,MAX_LEN,class_names)
    result.append(x)
    

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [11]:
df2 = pd.DataFrame({'sentences' : test_data})
df2["Result"] = result
df2

Unnamed: 0,sentences,Result
0,Today is a good day!,positive
1,My friend is a snake..,neutral
2,I'm so pissed,negative
3,Why did you do that?,neutral
4,This product was horrible and broke in 5 minutes,negative
5,I love hot cocoa and sledding but the snow hur...,neutral
6,OMG I want a puppy!,neutral
7,I wish I could be happier and spread kindness.,positive
8,My favorite color is blue.,neutral
9,You're disgusting and mean and I hate you.,negative


# SKLEARN RANDOM FOREST

In [13]:
import numpy as np 
import pandas as pd 
import re
import nltk 
import matplotlib.pyplot as plt
%matplotlib inline
data_source_url = "https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv"
airline_tweets = pd.read_csv(data_source_url)

In [14]:
airline_tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [15]:
# Preprocessing
features = airline_tweets.iloc[:, 10].values
labels = airline_tweets.iloc[:, 1].values

processed_features = []

for sentence in range(0, len(features)):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

test = pd.DataFrame(test_data)
features1 = test.iloc[:, 0].values

processed_features_test = []


for sentence in range(0, len(features1)):
    # Remove all the special characters
    processed_feature2 = re.sub(r'\W', ' ', str(features1[sentence]))

    # remove all single characters
    processed_feature2 = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature2)

    # Remove single characters from the start
    processed_feature2 = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature2) 

    # Substituting multiple spaces with single space
    processed_feature2 = re.sub(r'\s+', ' ', processed_feature2, flags=re.I)

    # Removing prefixed 'b'
    processed_feature2 = re.sub(r'^b\s+', '', processed_feature2)

    # Converting to Lowercase
    processed_feature2 = processed_feature2.lower()

    processed_features_test.append(processed_feature2)


# Need to ensure the final test data is preprocessed the same & same length
processed_features = np.append(processed_features,processed_features_test)

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(processed_features).toarray()


In [16]:
print(processed_features.shape)
test_data_preprocessed = processed_features[-10:]
processed_features = processed_features[:-10]

(14650, 2302)


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)
import sys
test1=np.array(X_test)


In [18]:
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)

predictions = text_classifier.predict(X_test)


In [19]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[1727  108   35]
 [ 332  240   42]
 [ 136   64  244]]
              precision    recall  f1-score   support

    negative       0.79      0.92      0.85      1870
     neutral       0.58      0.39      0.47       614
    positive       0.76      0.55      0.64       444

    accuracy                           0.76      2928
   macro avg       0.71      0.62      0.65      2928
weighted avg       0.74      0.76      0.74      2928

0.7551229508196722


In [20]:
predictions = text_classifier.predict(test_data_preprocessed)

In [21]:
df3 = pd.DataFrame({'sentence' : test_data})
df3["Result"] = predictions
df3


Unnamed: 0,sentence,Result
0,Today is a good day!,positive
1,My friend is a snake..,negative
2,I'm so pissed,negative
3,Why did you do that?,negative
4,This product was horrible and broke in 5 minutes,negative
5,I love hot cocoa and sledding but the snow hur...,positive
6,OMG I want a puppy!,negative
7,I wish I could be happier and spread kindness.,negative
8,My favorite color is blue.,negative
9,You're disgusting and mean and I hate you.,negative


# COMPARISON

In [22]:
comparison_table = pd.DataFrame({'sentence' : test_data})
comparison_table["VADER"] = df1[["comp_score"]]
comparison_table["BERT"] = df2[["Result"]]
comparison_table["SKLEARN"] = df3[["Result"]]
comparison_table

Unnamed: 0,sentence,VADER,BERT,SKLEARN
0,Today is a good day!,positive,positive,positive
1,My friend is a snake..,positive,neutral,negative
2,I'm so pissed,negative,negative,negative
3,Why did you do that?,neutral,neutral,negative
4,This product was horrible and broke in 5 minutes,negative,negative,negative
5,I love hot cocoa and sledding but the snow hur...,negative,neutral,positive
6,OMG I want a puppy!,positive,neutral,negative
7,I wish I could be happier and spread kindness.,positive,positive,negative
8,My favorite color is blue.,positive,neutral,negative
9,You're disgusting and mean and I hate you.,negative,negative,negative
