# Sentiment Classification on Sentihood data

## Comparison of classical ML text classifiers (trained on different word-vector representations) with lexicon-based sentiment methods (VADER and SentiWordNet)

## ML Classification Algorithms
        1. Naive Bayes
        2. Random Forest
        3. SVM
    
## Word Vectors
        1. TF-IDF
        2. Word2Vec
        3. GloVe

In [26]:
import os
import json
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from scipy import spatial

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import spacy
from spacy.tokenizer import Tokenizer

import gensim
from gensim.models import Word2Vec
import gensim.downloader as api

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# spaCy pipeline whose `.vector` attributes are used as the "w2v" embeddings.
w2v_nlp = spacy.load('en_core_web_lg')
# Name of the pretrained model loaded through gensim.downloader.
glove_model_name="glove-wiki-gigaword-100"
glove_model = api.load(glove_model_name)

# Stand-alone spaCy tokenizer built on the loaded pipeline's vocabulary.
tokenizer = Tokenizer(w2v_nlp.vocab)

# Data Preprocessing

In [27]:
# Root directory of the sentiment-analysis datasets.  The home directory is
# resolved with expanduser so an unset HOME variable does not raise KeyError.
data_dir = f"{os.path.expanduser('~')}/data/SentimentAnalysis/"

# Integer class id for each sentiment name (0 is also load_data's default label).
label_map = {
    "Neutral": 0,
    "Positive": 1,
    "Negative": 2,
}


def load_data(version, base_dir=None):
    """
    Read one Sentihood split and return it as a dataframe.

    Parameters
    ----------
    version : str
        Split name; the file read is
        ``<base_dir>/sentihood/sentihood-<version>.json``.
    base_dir : str, optional
        Directory that contains the ``sentihood`` folder.  Defaults to the
        module-level ``data_dir``.

    Returns
    -------
    pandas.DataFrame
        One row per JSON record with the columns ``text`` (the sentence with
        the target entity removed), ``entity`` (the target entity, missing
        when the record has no opinions) and ``label`` (the ``label_map`` id
        of the last opinion on the "general" aspect, 0 when there is none).
    """
    if base_dir is None:
        base_dir = data_dir
    data_file = os.path.join(base_dir, "sentihood", f"sentihood-{version}.json")
    # Context manager so the file handle is always closed.
    with open(data_file, encoding="utf-8") as fh:
        json_data = json.load(fh)

    data = []
    for jd in json_data:
        opinions = jd["opinions"]

        # Label: the last "general" opinion wins; 0 when none is present.
        label = 0
        for op in opinions:
            if op["aspect"] == "general":
                label = label_map[op["sentiment"]]

        text = jd["text"]
        if opinions:
            # The entity comes from the last opinion of the record.
            # NOTE(review): this may be a different opinion than the one that
            # supplied the label -- confirm that this is intended.
            target_entity = opinions[-1]["target_entity"]
            text = text.replace(target_entity, "")
        else:
            # Without opinions there is no entity to strip.  Reading the loop
            # variable here would reuse the previous record's opinion, or
            # raise NameError on the very first record.
            target_entity = None
        data.append([text, target_entity, label])

    # Return data as dataframe
    return pd.DataFrame.from_records(data, columns=["text", "entity", "label"])

In [28]:
# Read the train and test splits (both are loaded before either name is bound).
df_train, df_test = map(load_data, ("train", "test"))


In [29]:
# Rebalance the test split.  df_test2 is an alias of df_test, not a copy.
df_test2 = df_test

####### Oversampling the negative cases

# NOTE(review): `n` is declared as the oversampling factor but has never been
# applied: the previous loop overwrote its result on every pass, so exactly one
# extra copy of the negative rows was added.  That effective behaviour is kept
# here -- confirm the intended factor before wiring `n` in.
n = 3 # no. of times to oversample negative sentiments
df_test3 = df_test2.loc[df_test2["label"]==2,:]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_test4 = pd.concat([df_test2, df_test3])


####### Undersampling the neutral cases

df_new = df_test4.loc[df_test4["label"]==0,:]
# Neutral rows whose index label lies in 0..len(df_new)-1.  This is exactly the
# set the previous loop selected: its `i%2` test was always true and it compared
# index labels against the loop positions.
# NOTE(review): presumably every second neutral row was meant -- confirm.
df_new_sel = df_new.loc[df_new.index.isin(list(range(len(df_new)))),:]

# Explicit copy so that later column assignments on df_test do not trigger
# SettingWithCopyWarning.
df_test5 = df_test4.drop(df_new_sel.index).copy()

df_test = df_test5

In [31]:
# Sentences and integer labels of the training split.
train_texts = df_train["text"]
y_train = df_train["label"]

# Sentences and integer labels of the (resampled) test split.
test_texts = df_test["text"]
y_test = df_test["label"]

# Sentiment score using Vader

In [32]:
sid = SentimentIntensityAnalyzer()

# Full VADER score dictionary for every sentence.
df_test['vdscore'] = df_test['text'].apply(sid.polarity_scores)
# Sign of the compound score: 1.0 positive, 0.0 neutral, -1.0 negative.
df_test['vdcmpd'] = df_test['vdscore'].apply(lambda x: np.sign(x['compound']))
# Recode -1 as 2 so predictions use the same id as label_map["Negative"].
# The result is assigned back: an in-place replace on `df_test["vdcmpd"]`
# acts on a temporary Series and is not guaranteed to reach the dataframe.
df_test["vdcmpd"] = df_test["vdcmpd"].replace(-1, 2)
print (df_test.head())

                                                 text     entity  label  \
1     All the neighborhoods around  are very nice ...  LOCATION1      1   
2           Cheap is , LOCATION1, but not really cool  LOCATION2      2   
3                                           Dont Try   LOCATION1      2   
11    I live in  and would really recommend it or ...  LOCATION2      1   
15    I only go to  to  IKEA - I find it depressin...  LOCATION1      2   

                                              vdscore  vdcmpd  
1   {'neg': 0.0, 'neu': 0.528, 'pos': 0.472, 'comp...     1.0  
2   {'neg': 0.316, 'neu': 0.684, 'pos': 0.0, 'comp...     2.0  
3   {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...     0.0  
11  {'neg': 0.0, 'neu': 0.798, 'pos': 0.202, 'comp...     1.0  
15  {'neg': 0.191, 'neu': 0.809, 'pos': 0.0, 'comp...     2.0  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [33]:
# Score the VADER predictions against the gold labels.
vader_true, vader_pred = df_test["label"], df_test["vdcmpd"]
print(confusion_matrix(vader_true, vader_pred))
print(classification_report(vader_true, vader_pred))
print("Accuracy:", accuracy_score(vader_true, vader_pred))

[[165 143  40]
 [ 69 285  14]
 [ 48  66 116]]
              precision    recall  f1-score   support

           0       0.59      0.47      0.52       348
           1       0.58      0.77      0.66       368
           2       0.68      0.50      0.58       230

    accuracy                           0.60       946
   macro avg       0.61      0.58      0.59       946
weighted avg       0.61      0.60      0.59       946

Accuracy: 0.5983086680761099


In [34]:
def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    Tags starting with J, N, R or V map to wn.ADJ, wn.NOUN, wn.ADV and
    wn.VERB respectively; every other tag yields None.
    """
    prefix_to_attr = (("J", "ADJ"), ("N", "NOUN"), ("R", "ADV"), ("V", "VERB"))
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            # Resolved only on a match, so unmatched tags never touch `wn`.
            return getattr(wn, attr_name)
    return None

# Sentiment score using Sentiwordnet

In [35]:
lemmatizer = WordNetLemmatizer()


def sentiment_sentiwordnet(text):
    """
    Classify *text* from the SentiWordNet scores of its words.

    Only nouns, adjectives and adverbs are considered.  Each one is
    lemmatized and the first WordNet synset of the lemma contributes
    ``pos_score - neg_score``; words whose contribution is exactly zero are
    ignored.  The mean of the remaining contributions decides the class.

    Returns 1 when the mean is >= 0.01, 2 when it is <= -0.01 and 0
    otherwise (including when no word contributes).
    """
    word_scores = []

    for sentence in sent_tokenize(text):
        for token, penn_tag in pos_tag(word_tokenize(sentence)):
            pos = penn_to_wn(penn_tag)
            if pos not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            lemma = lemmatizer.lemmatize(token, pos=pos)
            candidates = wn.synsets(lemma, pos=pos) if lemma else []
            if not candidates:
                continue

            # Only the first listed synset of the lemma is scored.
            senti = swn.senti_synset(candidates[0].name())
            polarity = senti.pos_score() - senti.neg_score()
            if polarity != 0:
                word_scores.append(polarity)

    if not word_scores:
        return 0

    mean_polarity = sum(word_scores) / len(word_scores)
    if mean_polarity >= 0.01:
        return 1
    return 2 if mean_polarity <= -0.01 else 0

In [106]:
# SentiWordNet class (0, 1 or 2) for every test sentence.
df_test['swscore'] = df_test['text'].apply(sentiment_sentiwordnet)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
# Score the SentiWordNet predictions against the gold labels.
swn_true, swn_pred = df_test["label"], df_test["swscore"]
print(confusion_matrix(swn_true, swn_pred))
print(classification_report(swn_true, swn_pred))
print("Accuracy:", accuracy_score(swn_true, swn_pred))

[[121 128  99]
 [ 87 219  62]
 [ 38  70 122]]
              precision    recall  f1-score   support

           0       0.49      0.35      0.41       348
           1       0.53      0.60      0.56       368
           2       0.43      0.53      0.48       230

    accuracy                           0.49       946
   macro avg       0.48      0.49      0.48       946
weighted avg       0.49      0.49      0.48       946

Accuracy: 0.4883720930232558


# Text sentiment classification using word vectors

## Load training and test data

## 01. TFIDF

In [38]:
# Choose the classifier that sits behind the TF-IDF vectorizer (NB or SVM).

tfidf_classifier = MultinomialNB()
#tfidf_classifier = LinearSVC()
text_clf = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', tfidf_classifier)])

In [39]:
# Fit the TF-IDF pipeline on the training sentences, then label the test
# sentences (Pipeline.fit returns the fitted pipeline itself).
y_pred = text_clf.fit(train_texts, y_train).predict(test_texts)

In [40]:
# Score the TF-IDF pipeline predictions against the gold labels.
for metric_output in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(metric_output)
print("Accuracy:", accuracy_score(y_test, y_pred))

[[347   1   0]
 [331  37   0]
 [222   8   0]]
              precision    recall  f1-score   support

           0       0.39      1.00      0.56       348
           1       0.80      0.10      0.18       368
           2       0.00      0.00      0.00       230

    accuracy                           0.41       946
   macro avg       0.40      0.37      0.24       946
weighted avg       0.45      0.41      0.27       946

Accuracy: 0.4059196617336152


  _warn_prf(average, modifier, msg_start, len(result))


## 02. Word2Vec and Glove

https://towardsdatascience.com/using-word2vec-to-analyze-news-headlines-and-predict-article-success-cdeda5f14751 

"Trained on enormous google news corpus"

Pretrained Word2Vec and GloVe models are available from the gensim-data repository:

https://github.com/RaRe-Technologies/gensim-data

In [58]:
class Data2Vector:
    """
    Turn sentences into fixed-length vectors by averaging word embeddings.

    Parameters
    ----------
    vec_emb : str
        ``'w2v'`` takes the vectors from the module-level ``w2v_nlp``
        pipeline; any other value takes them from the module-level
        ``glove_model``.
    """

    def __init__(self, vec_emb):
        self.vec_emb = vec_emb

    def word_vector(self, word):
        """
        Return the embedding of a single token.

        *word* may be a string or any object whose ``str()`` is the word.
        With the GloVe model, words missing from the vocabulary get the
        vector stored under ``"unk"``.
        """
        key = str(word)
        if self.vec_emb == 'w2v':
            return w2v_nlp(key).vector

        # Membership has to be tested on the string form: a token object
        # produced by the tokenizer is never itself a vocabulary key, so
        # testing the raw token sends every word to the "unk" fallback.
        if key in glove_model:
            return glove_model[key]
        return glove_model["unk"]

    def sent_vector(self, sent):
        """Return the mean word vector of *sent* as a plain list of floats."""
        tokens = tokenizer(sent)
        if len(tokens) == 0:
            # Nothing to average: represent the sentence by the "unk" token.
            tokens = ["unk"]
        sent_vectors = [self.word_vector(token) for token in tokens]
        return np.average(np.array(sent_vectors), axis=0).tolist()

    def data_vector(self, data):
        """
        Return one sentence vector per element of *data*, in order.

        *data* can be any iterable of sentences.  Iterating it directly
        avoids positional ``data[i]`` lookups, which fail on a pandas Series
        whose index is not 0..n-1.
        """
        return [self.sent_vector(sent) for sent in data]
    

##  Model Selection

In [107]:
# Step 1 : Select word embedding
# Data2Vector treats "w2v" as the spaCy pipeline vectors and any other value
# as the GloVe model; keep exactly one of the two assignments active.

sel_vec_method = "w2v"
#sel_vec_method = "glove"


#Step 2: Select classification method
# Keep exactly one classifier active; it is fitted on the sentence vectors.

clf = LinearSVC()
#clf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=0)


In [108]:
# Embed every training and test sentence with the selected word vectors.
dv = Data2Vector(sel_vec_method)

train_texts_vec, test_texts_vec = (
    dv.data_vector(list(texts)) for texts in (train_texts, test_texts)
)

##  Training-Test Data 

In [90]:
#Train the model using the training sets
# Fits the selected classifier on the sentence vectors.  Left as a bare
# expression so the notebook echoes the fitted estimator.
clf.fit(train_texts_vec, y_train)

RandomForestClassifier(max_depth=20, n_estimators=200, random_state=0)

In [80]:
#Predict the response for test dataset
# y_pred holds one predicted class id per test sentence vector.
y_pred = clf.predict(test_texts_vec)


In [81]:
# Score the embedding-based classifier predictions against the gold labels.
for metric_output in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(metric_output)
print("Accuracy:", accuracy_score(y_test, y_pred))

[[348   0   0]
 [368   0   0]
 [230   0   0]]
              precision    recall  f1-score   support

           0       0.37      1.00      0.54       348
           1       0.00      0.00      0.00       368
           2       0.00      0.00      0.00       230

    accuracy                           0.37       946
   macro avg       0.12      0.33      0.18       946
weighted avg       0.14      0.37      0.20       946

Accuracy: 0.3678646934460888


  _warn_prf(average, modifier, msg_start, len(result))
