<a href="https://colab.research.google.com/github/houpingx/DataManagement/blob/main/Textual_Analysis_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview
In this notebook, we introduce both topic modeling and sentiment analysis for textual analysis.

# Sentiment Analysis
Scikit-learn (sklearn) is a Python package widely used to build classification and regression models. Next, we will show how to use different functions in sklearn to build a classifier that labels a review as positive (1) or negative (0). We will also extract the top positive and top negative words from the fitted models.



### import packages

In [None]:
import sys
import warnings
import re, numpy as np, pandas as pd
from pprint import pprint

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.patches import Rectangle # Sentence Coloring of N Sentences
from matplotlib.ticker import FuncFormatter

# NLTK Stop words
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
warnings.filterwarnings("ignore",category=DeprecationWarning)

import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

### import stopwords from NLTK

In [None]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') reads from it.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

generate the stopword list

In [None]:
# Start from NLTK's English stop words and add corpus-specific filler words.
# NOTE(review): the CountVectorizer cell passes stop_words='english' (sklearn's
# built-in list), so this custom list is not used by it -- confirm intent.
extra_stop_words = ['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come']
# Several additions repeat each other or words already in the NLTK list;
# dict.fromkeys drops the duplicates while keeping first-seen order.
stop_words = list(dict.fromkeys(stopwords.words('english') + extra_stop_words))

In [None]:
# Inspect the combined stop-word list.
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Loading training and test data
Both the training and test sets have two columns: `review` (the review text) and `label` (1 = positive, 0 = negative).

In [None]:
# Read the labelled training and test reviews directly from GitHub.
train_url = 'https://raw.githubusercontent.com/houpingx/houpingx.github.io/main/DB/training.csv'
test_url = 'https://raw.githubusercontent.com/houpingx/houpingx.github.io/main/DB/test.csv'
training = pd.read_csv(train_url)
test = pd.read_csv(test_url)

In [None]:
# Report the size of the training frame, then preview its first five rows.
print('Shape of the dataframe :', training.shape, 'Display the top 5 reviews :', sep='\n')
training.head(5)

Shape of the dataframe :
(10000, 2)
Display the top 5 reviews :


Unnamed: 0,review,label
0,The Fury of the Wolfman is a very good film th...,1
1,George Lopez is a funny man even without the s...,1
2,Antonioni was aiming for another hip masterpie...,1
3,My watch came a little too late but am glad i ...,1
4,This is part one of a short animation clip sho...,1


### Preprocess and clean the text data


1.   Tokenize each sentence, that is, break a sentence into a list of lower-cased words. For instance, 'Everyone gets good grade in msa8040' --> ['everyone', 'gets', 'good', 'grade', 'in', 'msa8040']
2.   Remove unhelpful words, punctuation, markup, numbers (maybe), etc.



In [None]:
# Patterns live next to the function so it does not depend on another cell
# having been executed first.
# Punctuation that is simply deleted.
_REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# Separators that sit between words: runs of HTML line breaks, hyphens, slashes.
_REPLACE_WITH_SPACE = re.compile(r"(?:<br\s*/?>)+|-|/")


def preprocess_reviews(reviews):
    """Lower-case and strip punctuation/markup from each review.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts (a list or a pandas Series both work).

    Returns
    -------
    list of str
        One cleaned string per input review, in the same order.
    """
    cleaned = []
    for line in reviews:
        text = _REPLACE_NO_SPACE.sub("", line.lower())
        # Substitute a space, not "", so the words on either side of a
        # separator ("well-made", "either/or", "end.<br /><br />Next")
        # are not glued into one token.
        cleaned.append(_REPLACE_WITH_SPACE.sub(" ", text))
    return cleaned

In [None]:
# Pull the raw review text column out of each frame.
reviews_train = training['review']
reviews_test = test['review']

In [None]:
# clean & preprocess data
# Raw strings keep the regex escapes (\., \s, ...) from being parsed as invalid
# Python string escapes, which newer interpreters warn about. The compiled
# patterns match exactly what they matched before.
replace_no_space = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
replace_with_space = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

### Prepare data for the following experiments

In [None]:
# Binary bag-of-words features: binary=True records presence rather than counts,
# and the token pattern keeps runs of word characters that contain no digits.
cv = CountVectorizer(
    binary=True,
    stop_words='english',
    token_pattern=r'\b[^\d\W]+\b',
)
# The vocabulary is learned from the training reviews only, then applied to both sets.
cv.fit(reviews_train_clean)
X, X_test = (cv.transform(docs) for docs in (reviews_train_clean, reviews_test_clean))

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary terms as a NumPy array.
cv.get_feature_names_out()

### Randomly split training data into training and validation data sets

In [None]:
y = training.label
# Seed the shuffle so the 85/15 train/validation split -- and every validation
# accuracy computed from it -- is the same on each Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.85, random_state=0)

### Logistic regression

In [None]:
# Try several values of C and score each fit on the validation split.
for c in (0.01, 0.05, 0.1, 0.5, 1):
    lr = LogisticRegression(C=c).fit(X_train, y_train)
    lr_val_accuracy = accuracy_score(y_val, lr.predict(X_val))
    print ("Accuracy for C = %s: %s" % (c, lr_val_accuracy))
    

Accuracy for C = 0.01: 0.8613333333333333
Accuracy for C = 0.05: 0.8653333333333333
Accuracy for C = 0.1: 0.868
Accuracy for C = 0.5: 0.8613333333333333
Accuracy for C = 1: 0.8586666666666667


In [None]:
# Refit on the full training set with the chosen C, then evaluate on the test set.
final_model = LogisticRegression(C=0.05)
final_model.fit(X,y)
y_test = test.label
test_predictions = final_model.predict(X_test)
print ("Final Accuracy : %s" % accuracy_score(y_test, test_predictions))
# Compare predictions with the *test* labels; `y` holds the training labels
# and only happened to have the same length.
confusion_matrix(y_test, test_predictions)

Final Accuracy : 0.8663


array([[4306,  694],
       [ 643, 4357]])

In [None]:
# Map every vocabulary term to its logistic-regression coefficient.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out(); .tolist() yields plain Python str/float values so
# the printed tuples stay readable.
feature_to_coef = dict(zip(
    cv.get_feature_names_out().tolist(),
    final_model.coef_[0].tolist(),
))

# Largest coefficients first: the ten words pushing hardest towards label 1.
for best_positive in sorted(feature_to_coef.items(),key=lambda x:x[1], reverse=True)[:10]:
    print(best_positive)



('excellent', 0.8843150220811236)
('great', 0.6672623393977196)
('perfect', 0.5800746271130811)
('best', 0.5476536253607572)
('amazing', 0.5453084713226355)
('favorite', 0.5376673008547482)
('loved', 0.5141225279095306)
('fantastic', 0.48647069116731184)
('liked', 0.46674090014156705)
('classic', 0.4497070380926154)


In [None]:
# Ascending sort by coefficient: the ten most negative words come first.
most_negative = sorted(feature_to_coef.items(), key=lambda pair: pair[1])[:10]
for best_negative in most_negative:
    print(best_negative)

('worst', -1.0893834304596768)
('waste', -0.9318368086862948)
('awful', -0.8878062513916525)
('boring', -0.729743973048694)
('bad', -0.7164301043526059)
('terrible', -0.7152473315698098)
('worse', -0.7045782778444624)
('dull', -0.6448239681844562)
('horrible', -0.6183440689565569)
('poor', -0.5934442165539343)


### SVC

In [None]:
# Try several values of C for a linear-kernel SVC, scoring each on the validation split.
for c in (0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 1):
    svm = SVC(C=c, kernel='linear').fit(X_train, y_train)
    svm_val_accuracy = accuracy_score(y_val, svm.predict(X_val))
    print ("Accuracy for C = %s: %s" % (c, svm_val_accuracy))

Accuracy for C = 0.01: 0.8666666666666667
Accuracy for C = 0.05: 0.8553333333333333
Accuracy for C = 0.1: 0.846
Accuracy for C = 0.15: 0.842
Accuracy for C = 0.2: 0.844
Accuracy for C = 0.5: 0.8426666666666667
Accuracy for C = 1: 0.8413333333333334


In [None]:
# Refit the linear SVC on the full training set with the chosen C.
svm = SVC(gamma='auto',C=0.01,kernel='linear')
svm.fit(X, y)
# Score against the test labels; `y` holds the training labels and must not be
# compared with predictions made on X_test.
print ("Final Accuracy : %s" % accuracy_score(test.label,svm.predict(X_test)))

Final Accuracy : 0.8628


In [None]:
# Map every vocabulary term to its linear-SVC weight.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out(). coef_ is sparse here, so densify it and take its
# single row.
feature_to_coef = dict(zip(
    cv.get_feature_names_out().tolist(),
    svm.coef_.toarray()[0].tolist(),
))

# Largest weights first: the ten words pushing hardest towards label 1.
for best_positive in sorted(feature_to_coef.items(),key=lambda x:x[1], reverse=True)[:10]:
    print(best_positive)

('excellent', 0.44195162343950856)
('great', 0.34787313432528283)
('amazing', 0.290006797330905)
('liked', 0.285588501916759)
('best', 0.28524821053467336)
('loved', 0.25819930130977164)
('favorite', 0.25770514149735546)
('wonderful', 0.24813478054079768)
('perfect', 0.24186491070827645)
('enjoyed', 0.23848631928382613)




In [None]:
# Ascending sort by weight: the ten most negative words come first.
lowest_weighted = sorted(feature_to_coef.items(), key=lambda pair: pair[1])[:10]
for best_negative in lowest_weighted:
    print(best_negative)

('worst', -0.5460256017094325)
('awful', -0.4628432819094569)
('waste', -0.45486400983077635)
('boring', -0.3900085298764479)
('bad', -0.38103683187652304)
('terrible', -0.37934143158764694)
('worse', -0.3492930596398398)
('horrible', -0.3362260405828974)
('dull', -0.32366394908384327)
('poor', -0.3078765251275138)


### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep the number of neighbours k and score each fit on the validation split.
for k in [3,5,7,9,11,13,21]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # The swept hyper-parameter is k, not C, so label it as such.
    print ("Accuracy for k = %s: %s" % (k, accuracy_score(y_val, knn.predict(X_val))))

Accuracy for C = 1: 0.6213333333333333
Accuracy for C = 1: 0.6346666666666667
Accuracy for C = 1: 0.634
Accuracy for C = 1: 0.6473333333333333
Accuracy for C = 1: 0.6406666666666667
Accuracy for C = 1: 0.6473333333333333
Accuracy for C = 1: 0.6666666666666666


### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the reported accuracy is the same on every run.
decisiontree = DecisionTreeClassifier(random_state=0)
decisiontree.fit(X_train, y_train)

print ("Accuracy: %s" % (accuracy_score(y_val, decisiontree.predict(X_val))))

Accuracy: 0.7193333333333334


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The labels are classes (0/1), so use the classifier rather than a regressor
# with a hand-rolled 0.5 threshold; predict() then returns class labels directly.
randomforest = RandomForestClassifier(n_estimators=20, random_state=0)
randomforest.fit(X_train, y_train)

print ("Accuracy: %s" % (accuracy_score(y_val, randomforest.predict(X_val))))

Accuracy: 0.7673333333333333


### Naive Bayes Classifier using Gaussian

In [None]:
# The sparse matrices are converted to dense arrays before being passed to GaussianNB.
clf_nb = GaussianNB().fit(X_train.toarray(), y_train)
nb_val_predictions = clf_nb.predict(X_val.toarray())
print ("Accuracy : %s" % accuracy_score(y_val, nb_val_predictions))

Final Accuracy : 0.656


### Naive Bayes Classifier using Multinomial

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit multinomial naive Bayes on the sparse features and score it on the validation split.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
mnb_val_accuracy = accuracy_score(y_val, mnb.predict(X_val))
print ("Accuracy: %s" % mnb_val_accuracy)

Accuracy: 0.8573333333333333


## Word2Vec


In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Train a small demo model on gensim's bundled toy corpus.
# gensim 4 renamed the embedding-dimension argument from `size` to
# `vector_size`; the old keyword raises TypeError there.
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec_demo.model")



In [None]:
# Look up the trained embedding for a single vocabulary word.
vector = model.wv['computer']  # numpy vector of a word

In [None]:
# Display the embedding values for 'computer'.
vector

array([-4.9745402e-04, -1.5121631e-03, -4.8474963e-03, -3.7636806e-03,
        4.2839019e-04,  2.3583020e-03,  4.1200826e-03, -6.1694393e-04,
       -4.2383433e-03,  2.7510205e-03,  2.8019424e-03, -1.6049416e-04,
        9.4172405e-04,  9.0856786e-04,  4.0287157e-03,  3.5053361e-03,
        4.8699509e-03,  4.2601642e-03, -3.2593368e-03,  3.9475478e-04,
        5.1716861e-04,  3.7459675e-03, -1.4451732e-03, -4.3675308e-03,
       -3.2184108e-03,  1.5215690e-03, -4.0394361e-03,  4.6502855e-03,
        1.7729661e-03, -4.7144685e-03, -4.9152635e-03, -3.3499426e-03,
       -3.8640359e-03, -4.7558285e-03,  1.9511918e-03,  4.3707672e-03,
       -1.0017244e-03,  2.0387250e-03,  2.6479647e-03, -4.8783282e-03,
       -4.4108373e-03, -2.2441465e-03,  1.2291233e-03, -1.1915270e-03,
        4.7032391e-03,  2.4244760e-03,  2.9362307e-04, -1.0944718e-03,
        3.6564886e-03,  7.1550667e-04,  1.9193446e-04, -1.3770919e-03,
        4.4503235e-03, -1.5059995e-03, -6.8018929e-04, -1.1205347e-03,
      

In [None]:
# Similarity queries live on the KeyedVectors object (model.wv);
# Word2Vec.most_similar was deprecated in gensim 3 and removed in gensim 4.
model.wv.most_similar('computer')

[('response', 0.13790246844291687),
 ('trees', 0.135341078042984),
 ('interface', 0.13504242897033691),
 ('graph', 0.10856884717941284),
 ('minors', 0.08189325034618378),
 ('user', 0.0806637555360794),
 ('system', 0.04466661438345909),
 ('eps', 0.02010362595319748),
 ('survey', 0.017436539754271507),
 ('human', -0.04083963483572006)]

### Pretrained word2vec models

In [None]:
import gensim.downloader

In [None]:
# List the names of the pretrained models that gensim's downloader offers.
available_models = gensim.downloader.info()['models']
print(list(available_models.keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
# Load the 'glove-twitter-25' pretrained vectors through gensim's downloader.
glove_vectors = gensim.downloader.load('glove-twitter-25')



In [None]:
# Nearest neighbours of 'twitter' in the pretrained embedding space.
glove_vectors.most_similar('twitter')

[('facebook', 0.9480051398277283),
 ('tweet', 0.9403422474861145),
 ('fb', 0.9342358708381653),
 ('instagram', 0.9104823470115662),
 ('chat', 0.8964964747428894),
 ('hashtag', 0.8885936141014099),
 ('tweets', 0.8878157734870911),
 ('tl', 0.8778461813926697),
 ('link', 0.877821147441864),
 ('internet', 0.8753897547721863)]