<a href="https://colab.research.google.com/github/jnlinao/NLP/blob/main/Wk_6_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U textblob
!pip install vaderSentiment
!python -m textblob.download_corpora
#!python -m spacy download en_core_web_sm
!pip install tabulate
!pip install spacy

In [None]:
import spacy
import tabulate
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import path
import seaborn as sns
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_curve, recall_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# TextBlob

TextBlob is a Python (2 and 3) library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more.

https://textblob.readthedocs.io/en/dev/

In [None]:
# Wrap a short sentence in a TextBlob, then print its `sentiment` and `tags` attributes.
tb = TextBlob("Apple is a great company.")
print(tb.sentiment)
print(tb.tags)

In [None]:
# Same as above, but on a longer multi-sentence review with mixed praise and criticism.
tb2 = TextBlob("The first season of this show was brilliant and meaningful drama. It had true intellectual depth to it and managed to more than once deliver a real surprise. There were interesting characters and some exceptional acting. But that first season really told all the story there was to tell. All the good story at least.")
print(tb2.sentiment)

In [None]:
# Show the `tags` attribute of the longer review as the cell output.
tb2.tags

In [None]:
# Show the sentiment again, this time as the cell's output value instead of via print().
tb2.sentiment

# Fun with Spacy

### NER/Part of speech tagging

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
from spacy import displacy

# Load the small English pipeline and run it over a sample sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One line per token: text, lemma, coarse POS, fine-grained tag, dependency
# label, word shape, and the is_alpha / is_stop flags.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

# jupyter=True makes displaCy display the markup itself instead of returning an
# HTML string. Without it, the first render's return value is discarded whenever
# notebook auto-detection fails, because it is not the cell's last expression.
displacy.render(doc, style="dep", jupyter=True)
displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Run the pipeline over a longer news sentence and highlight its named entities.
doc = nlp("Thuan Pham, hired as Uber’s chief technology officer by former CEO Travis Kalanick back in 2013, is leaving the company in three weeks, the ride-share giant revealed today in an SEC filing that came out just as The Information reported that massive layoffs at Uber are being proposed to preserve some of the company’s dwindling capital reserves.")
# jupyter=True forces inline rendering; otherwise, when notebook auto-detection
# fails, the cell shows the raw HTML string instead of the highlighted text.
displacy.render(doc, style="ent", jupyter=True)

# Vader 

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media. 

https://pypi.org/project/vaderSentiment/

In [None]:
# Create the VADER analyzer and score one sample sentence.
# The `detect_vader_*` helpers defined later read this module-level `vader` name.
vader = SentimentIntensityAnalyzer()
temp3 = vader.polarity_scores("Textblob is amazingly simple to use. What great fun!")

In [None]:
# Display the VADER scores computed for the sample sentence.
temp3

# IMDB Dataset Sentiment Analysis

In [None]:
# Load the IMDB reviews CSV into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_csv("IMDB_Dataset.csv")

In [None]:
# Inspect the row index and the column labels of the loaded frame.
df.axes

In [None]:
# Print the first review, then its sentiment label on a new line.
print(df.review[0], "\n", df.sentiment[0])

In [None]:
# Number of rows in the loaded dataset.
len(df)

In [None]:
# Preview the first five rows (head() already limits the output, so no pre-slice is needed).
df.head()

In [None]:
def detect_tb_polarity(text):
    """Return the ``polarity`` component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def detect_tb_subjectivity(text):
    """Return the ``subjectivity`` component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def detect_vader_pos(text):
    """Return the 'pos' entry of VADER's polarity scores for ``text``.

    Uses the module-level ``vader`` analyzer.
    """
    scores = vader.polarity_scores(text)
    return scores['pos']
    
def detect_vader_neg(text):
    """Return the 'neg' entry of VADER's polarity scores for ``text``.

    Uses the module-level ``vader`` analyzer.
    """
    scores = vader.polarity_scores(text)
    return scores['neg']

def detect_vader_comp(text):
    """Return the 'compound' entry of VADER's polarity scores for ``text``.

    Uses the module-level ``vader`` analyzer.
    """
    scores = vader.polarity_scores(text)
    return scores['compound']

In [None]:
# Sanity check: full set of VADER scores for the first review.
vader.polarity_scores(df.review[0])

In [None]:
# Fresh analyzer for the helpers that read the module-level `vader` name.
vader = SentimentIntensityAnalyzer()

# Output column -> scoring helper, in the order the columns are appended.
# Later cells slice these columns by position, so the order matters.
feature_extractors = {
    'tb_polarity': detect_tb_polarity,
    'tb_subjectivity': detect_tb_subjectivity,
    'vader_pos': detect_vader_pos,
    'vader_neg': detect_vader_neg,
    'vader_comp': detect_vader_comp,
}

# Work on a copy of the first 1,000 reviews so `df` itself is left untouched.
df_sample = df[0:1000].copy()
for column_name, extractor in feature_extractors.items():
    df_sample[column_name] = df_sample.review.apply(extractor)

In [None]:
# Preview the first 25 sampled reviews together with the newly added score columns.
df_sample.head(25)

In [None]:
# Select the lexicon features by name rather than by column position, so the
# model does not silently depend on the order in which columns were added.
# NOTE(review): `vader_comp` is left out, matching the positional slice
# `iloc[:, 2:6]` this replaces. Confirm that excluding it is intended.
lexicon_features = ['tb_polarity', 'tb_subjectivity', 'vader_pos', 'vader_neg']

modelLR1 = LogisticRegression()

# Fit on the lexicon scores, with the review's sentiment label as the target.
modelLR1.fit(df_sample[lexicon_features], df_sample['sentiment'])

In [None]:
# Score the same rows the model was fitted on (in-sample evaluation), using
# the same four named feature columns that were used for fitting.
lexicon_features = ['tb_polarity', 'tb_subjectivity', 'vader_pos', 'vader_neg']
sent_pred = modelLR1.predict(df_sample[lexicon_features])
# Print the matrix explicitly: as a bare mid-cell expression its value would be discarded.
print("Confusion Matrix:\n", confusion_matrix(df_sample.sentiment, sent_pred))
print("F1 score:", f1_score(df_sample.sentiment, sent_pred, average='micro'))

In [None]:
# Correlation between VADER's 'pos' score and TextBlob's polarity
# (the off-diagonal entry of the 2x2 correlation matrix).
np.corrcoef(df_sample.vader_pos, df_sample.tb_polarity)[0,1]

In [None]:
# Apply the seaborn "whitegrid" style to the plots that follow.
sns.set(style="whitegrid")

In [None]:
# Distribution of TextBlob polarity for each sentiment label.
ax = sns.violinplot(x="sentiment", y="tb_polarity", data=df_sample)

In [None]:
# Distribution of VADER's compound score for each sentiment label.
ax = sns.violinplot(x="sentiment", y="vader_comp", data=df_sample)

In [None]:
# Distribution of VADER's 'pos' score for each sentiment label.
ax = sns.violinplot(x="sentiment", y="vader_pos", data=df_sample)

In [None]:
# Label a review "positive" when its TextBlob polarity is above zero.
# An integer `bins=2` in pd.cut would instead split at the midpoint of the
# observed min/max polarity, so the cut-off would drift with the sample.
# A fixed threshold of 0 also matches how the VADER compound score is thresholded.
tb_pred = np.where(df_sample['tb_polarity'] > 0.0, "positive", "negative")

In [None]:
# Compare the TextBlob-derived labels with the dataset's sentiment labels.
print("Confusion Matrix:\n", confusion_matrix(df_sample.sentiment, tb_pred))
print("F1 score:", f1_score(df_sample.sentiment, tb_pred, average='micro'))

In [None]:
# Label a review "positive" when VADER's compound score is strictly above zero, else "negative".
v_pred = np.where(df_sample['vader_comp'] > 0.0, "positive", "negative")

In [None]:
# Compare the VADER-derived labels with the dataset's sentiment labels.
print("Confusion Matrix:\n", confusion_matrix(df_sample.sentiment, v_pred))
print("F1 score:", f1_score(df_sample.sentiment, v_pred, average='micro'))

## Datasets
Here, the first 70% of the sampled reviews are used to train the models, and the remaining 30% are held out for testing.

In [None]:
# Re-create the 1,000-review sample and split it in order:
# the first 70% of rows for training, the remainder for testing.
df_sample = df[0:1000].copy()
train_sample = int(len(df_sample) * 0.7)
train = df_sample[:train_sample]
# Start the test slice at `train_sample` (not `train_sample + 1`) so the row at
# the boundary is not silently dropped from both sets.
test = df_sample[train_sample:]
assert len(train) + len(test) == len(df_sample)
print('train data size:', len(train))
print('test data size:', len(test))

In [None]:
# TF-IDF vectorizer: lower-cases text and removes English stop words.
# The vocabulary is learned from the training reviews only.
tv = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
)
tv.fit(train.review)

In [None]:
# Multinomial Naive Bayes (alpha=1) trained on the TF-IDF features of the training reviews.
modelNB = MultinomialNB(alpha=1)
modelNB.fit(tv.transform(train.review), train.sentiment)

In [None]:
# Predict on the held-out reviews, transformed with the vectorizer fitted on the training set.
nb_pred = modelNB.predict(tv.transform(test.review))

# Report how the Naive Bayes predictions compare with the held-out labels.
print("Confusion Matrix:\n", confusion_matrix(test.sentiment, nb_pred))
print("F1 score:", f1_score(test.sentiment, nb_pred, average='micro'))

In [None]:
# Logistic regression (C=1, liblinear solver) trained on the same TF-IDF features as the Naive Bayes model.
modelLR = LogisticRegression(C=1, solver='liblinear')
modelLR.fit(tv.transform(train.review), train.sentiment)

# Predict on the held-out reviews.
lr_pred = modelLR.predict(tv.transform(test.review))

# Report how the logistic-regression predictions compare with the held-out labels.
print("Confusion Matrix:\n", confusion_matrix(test.sentiment, lr_pred))
print("F1 score:", f1_score(test.sentiment, lr_pred, average='micro'))

## Understanding emotions

In [None]:
# Load the emotions CSV into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
df_emotions = pd.read_csv("emotions.csv")

In [None]:
# Preview the first 25 rows of the emotions data.
df_emotions.head(25)

In [None]:
# List the distinct values in the `sentiment` column, i.e. the classes to predict.
df_emotions.sentiment.unique()

In [None]:
# Number of rows in the emotions dataset.
len(df_emotions)

In [None]:
# Create the training and test sets: the first 80% of rows (in file order)
# for training, the remainder for testing.
# Note: this rebinds the `train` / `test` names used earlier for the IMDB data.

train_samples = int(len(df_emotions) * 0.8)

train = df_emotions[:train_samples]
# Start the test slice at `train_samples` (not `train_samples + 1`) so the row
# at the boundary is not silently dropped from both sets.
test = df_emotions[train_samples:]
assert len(train) + len(test) == len(df_emotions)
print('train data size:', len(train))
print('test data size:', len(test))

In [None]:
# Class balance of the training set: number of rows per sentiment value, as a bar chart.
train['sentiment'].value_counts().plot(kind='bar')

In [None]:
# Class balance of the test set: number of rows per sentiment value, as a bar chart.
test['sentiment'].value_counts().plot(kind='bar')

In [None]:
# Rebind `tv` for the emotions data: TF-IDF over 1- to 3-grams with English
# stop words removed, fitted on the training texts only.
tv = TfidfVectorizer(ngram_range=(1,3), stop_words='english')
tv.fit(train.content)

# Despite the name `modelELR`, the active classifier is multinomial Naive Bayes;
# the logistic-regression alternative on the next line is left disabled.
#modelELR = LogisticRegression(C=0.1)
modelELR = MultinomialNB(alpha=0.1)
modelELR.fit(tv.transform(train.content), train.sentiment)


In [None]:
# Predict on the training texts themselves, so the scores below are in-sample
# and not an estimate of performance on unseen data.
elr_pred = modelELR.predict(tv.transform(train.content))

print("Confusion Matrix:\n", confusion_matrix(train.sentiment, elr_pred))
print("F1 score:", f1_score(train.sentiment, elr_pred, average='micro'))

In [None]:
# Spot check: print the training text at index 20, then predict on the
# one-row slice 20:21 so the vectorizer receives a sequence rather than a bare string.
print(train.content[20])
modelELR.predict(tv.transform(train.content[20:21]))

In [None]:
# Spot check: print the training text at index 40, then predict on the
# one-row slice 40:41 so the vectorizer receives a sequence rather than a bare string.
print(train.content[40])
modelELR.predict(tv.transform(train.content[40:41]))

In [None]:
def plot_confusion_matrix(preds, labels):
  """Plot a row-normalised confusion matrix as a heat map.

  The matrix is computed as ``confusion_matrix(labels, preds)``, so each row
  corresponds to a class in ``labels`` and each column to a class in ``preds``.
  The class list is read from the module-level ``df_emotions.sentiment`` column.

  NOTE(review): the x axis (columns) is titled 'True Label' and the y axis
  (rows) 'Predicted Label'. The callers in this notebook pass the true
  sentiments as ``preds`` and the model output as ``labels``, which is what
  makes those titles line up. Confirm that this argument order is intended.

  Args:
    preds: class values laid out along the x axis (matrix columns).
    labels: class values laid out along the y axis (matrix rows).
  """
  class_labels = np.unique(df_emotions.sentiment)
  class_size = len(class_labels)
  # Pin the class list so the matrix always has one row/column per tick label,
  # even when a class is absent from both inputs.
  cnf_mat = confusion_matrix(labels, preds, labels=class_labels).astype('float')
  # Normalise each row by its own total. Rows with no samples stay at zero
  # instead of dividing by zero; padding the denominator with +1 would avoid
  # the error but skew every row's proportions.
  row_totals = cnf_mat.sum(axis=1, keepdims=True)
  cnf_mat = np.divide(cnf_mat, row_totals, out=np.zeros_like(cnf_mat), where=row_totals != 0)
  plt.imshow(cnf_mat, interpolation='nearest', cmap=plt.cm.Blues)
  plt.xticks(np.arange(class_size), labels=class_labels, rotation='vertical')
  plt.yticks(np.arange(class_size), labels=class_labels)
  plt.xlabel('True Label')
  plt.ylabel('Predicted Label')
  plt.title('Confusion Matrix')
  plt.colorbar()
  plt.show()

In [None]:
# Heat map for the training set. `elr_pred` must still hold the training-set
# predictions here (it is rebound to test-set predictions in a later cell).
# NOTE(review): the true labels are passed as the function's `preds` argument
# and the predictions as `labels`. Confirm this order is intended.
plot_confusion_matrix(train.sentiment, elr_pred)

In [None]:
# Rebind `elr_pred` to predictions on the held-out test texts and report out-of-sample scores.
elr_pred = modelELR.predict(tv.transform(test.content))

print("Confusion Matrix:\n", confusion_matrix(test.sentiment, elr_pred))
print("F1 score:", f1_score(test.sentiment, elr_pred, average='micro'))

In [None]:
# Heat map for the test set, using the test-set predictions from the previous cell.
# NOTE(review): the true labels are passed as the function's `preds` argument
# and the predictions as `labels`. Confirm this order is intended.
plot_confusion_matrix(test.sentiment, elr_pred)