# Sentiment Analysis using ML on Labelled Data

In [1]:
#Importing the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools  

In [4]:
#Loading the dataset
df=pd.read_csv('reviews.csv')

In [5]:
df.head()

Unnamed: 0,Id,Review,Label
0,0,good and interesting,5
1,1,"This class is very helpful to me. Currently, I...",5
2,2,like!Prof and TAs are helpful and the discussi...,5
3,3,Easy to follow and includes a lot basic and im...,5
4,4,Really nice teacher!I could got the point eazl...,4


In [6]:
#Deleting unnecessaty column
df.drop('Id',axis=1,inplace=True)

In [7]:
#Dropping the empty rows
df.dropna(inplace=True)
blanks = []  # start with an empty list

for i,lb,rv in df.itertuples():  # iterate over the DataFrame
    if type(rv)==str:            # avoid NaN values
        if rv.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list
df.drop(blanks, inplace=True)

In [8]:
#Loading the libraries required for word and sentence tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

In [9]:
#Storing the words in dictionairy after tokenizing
dict1={}
for i in range(len(df['Review'])):
    word=word_tokenize(df['Review'][i])
    dict1[i]=word

In [10]:
doc=[]
for i in range(len(df['Review'])):
    doc.append(dict1[i])

In [11]:
#Lemmatizing the text
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [12]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [13]:
#Removing the stop words
from nltk import pos_tag
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [14]:
#Applying all cleaning to text
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [15]:
documents = [(clean_review(document)) for document in doc]

In [16]:
#Storing the clean text
for i in range(len(df['Review'])):
    df['Review'][i]=" ".join(documents[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
df.to_csv('Course_Reviews.csv')

In [2]:
df=pd.read_csv('Course_Reviews.csv')

In [3]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [4]:
df.isna().sum()

Review    12
Label      0
dtype: int64

In [5]:
#Dropping the empty rows
df.dropna(inplace=True)
blanks = []  # start with an empty list

for i,lb,rv in df.itertuples():  # iterate over the DataFrame
    if type(rv)==str:            # avoid NaN values
        if rv.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list
df.drop(blanks, inplace=True)

In [6]:
df.shape

(107006, 2)

In [7]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X=vec.fit_transform(df['Review'])
Y=df['Label']

In [8]:
#Handling the Imbalanced data - Oversampling
from imblearn.over_sampling import SMOTE
sm=SMOTE(random_state=12)
X_res,y_res=sm.fit_sample(X,Y)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
unique,count=np.unique(y_res,return_counts=True)
y_smote_value_count={k:v for (k,v) in zip(unique,count)}
y_smote_value_count

{1: 79163, 2: 79163, 3: 79163, 4: 79163, 5: 79163}

In [10]:
X.shape

(107006, 29691)

In [11]:
#Splitting the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.33, random_state=42)

In [28]:
#Implementing the classification model for prediction
from sklearn.ensemble import RandomForestClassifier
clf2=RandomForestClassifier()
clf2.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [46]:
#Model Evaluation
from sklearn.metrics import classification_report,confusion_matrix
predictions = clf2.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           1       0.98      0.99      0.99     26155
           2       0.99      0.99      0.99     26203
           3       0.98      0.98      0.98     26097
           4       0.85      0.89      0.87     26165
           5       0.89      0.84      0.86     25999

    accuracy                           0.94    130619
   macro avg       0.94      0.94      0.94    130619
weighted avg       0.94      0.94      0.94    130619



In [59]:
print(confusion_matrix(y_test,predictions))

[[25904    46    15   171    19]
 [   69 25990    59    68    17]
 [   27    86 25529   336   119]
 [  124    11   176 23250  2604]
 [  216    32   353  3526 21872]]


In [13]:
import pickle
#filename = 'finalized_model.sav'
#pickle.dump(clf2, open(filename, 'wb'))
 
# load the model from disk
clf2 = pickle.load(open(filename, 'rb'))

In [15]:
from sklearn.metrics import classification_report,confusion_matrix
predictions = clf2.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           1       0.93      0.37      0.53     26155
           2       0.99      0.35      0.51     26203
           3       0.94      0.38      0.54     26097
           4       0.72      0.33      0.46     26165
           5       0.29      1.00      0.45     25999

    accuracy                           0.48    130619
   macro avg       0.78      0.48      0.50    130619
weighted avg       0.78      0.48      0.50    130619



# Sentiment Analysis without Label

In [30]:
#Loading the dataset
df=pd.read_csv('reviews.csv')

In [31]:
#Dropping the useless columns
df.drop('Id',axis=1,inplace=True)

#Dropping the empty rows
df.dropna(inplace=True)
blanks = []  # start with an empty list

for i,lb,rv in df.itertuples():  # iterate over the DataFrame
    if type(rv)==str:            # avoid NaN values
        if rv.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list
df.drop(blanks, inplace=True)

In [32]:
#Converting the label into text
for i in range(len(df)):
    if(df['Label'][i]==5):
        df['Label'][i]='Very Positive'
    elif(df['Label'][i]==4):
        df['Label'][i]='Positive'
    elif(df['Label'][i]==3):
        df['Label'][i]='Neutral'
    elif(df['Label'][i]==2):
        df['Label'][i]='Negative'
    elif(df['Label'][i]==1):
        df['Label'][i]='Very Negative'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [33]:
#Implementing the VADER for sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [34]:
#Getting the sentiment score of each review
df['scores'] = df['Review'].apply(lambda review: sid.polarity_scores(review))

In [35]:
#Getting the scores of each section
df['positive'] = df['scores'].apply(lambda score_dict: score_dict['pos'])
df['negative'] = df['scores'].apply(lambda score_dict: score_dict['neg'])
df['neutral'] = df['scores'].apply(lambda score_dict: score_dict['neu'])
df['compound']  =df['scores'].apply(lambda score_dict: score_dict['compound'])

In [36]:
df.head()

Unnamed: 0,Review,Label,scores,positive,negative,neutral,compound
0,good and interesting,Very Positive,"{'neg': 0.0, 'neu': 0.152, 'pos': 0.848, 'comp...",0.848,0.0,0.152,0.6808
1,"This class is very helpful to me. Currently, I...",Very Positive,"{'neg': 0.0, 'neu': 0.866, 'pos': 0.134, 'comp...",0.134,0.0,0.866,0.4754
2,like!Prof and TAs are helpful and the discussi...,Very Positive,"{'neg': 0.0, 'neu': 0.575, 'pos': 0.425, 'comp...",0.425,0.0,0.575,0.8843
3,Easy to follow and includes a lot basic and im...,Very Positive,"{'neg': 0.0, 'neu': 0.701, 'pos': 0.299, 'comp...",0.299,0.0,0.701,0.5719
4,Really nice teacher!I could got the point eazl...,Positive,"{'neg': 0.0, 'neu': 0.794, 'pos': 0.206, 'comp...",0.206,0.0,0.794,0.3266
