# Machine Learning Algorithms with SentiHood data
 1. Sentiment Analysis using only Text

 2. Sentiment Analysis using Aspect and Text together

 3. Sentiment Analysis using Text with one-hot encoding

  


## Importing Libraries

In [1]:
import json
from tqdm import tqdm
import pandas as pd

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize  
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from collections import Counter
from sklearn.preprocessing import LabelEncoder
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.metrics import  confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from nltk.stem import PorterStemmer
from gensim.models import word2vec
from sklearn.preprocessing import OneHotEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)


Mounted at /content/drive


## Processing Train dataSet

In [4]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-train.json', 'r') as fp:
    training_set = json.load(fp)

In [5]:
locations = ['LOCATION1']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

for location in locations:

  df = pd.DataFrame({'id': [], 'text': [],'aspect':[], 'sentiment': []})

  ii = 0
  for each_example in training_set:
      id = str(int(each_example['id']))
      text = each_example['text'].strip()
      
      # If `location` is present in the text, only then iterate over the  
      # list of opinions to find suitable `location-aspect` datapoints.

      if location in text:
          aspect_found = False
          
          for opinion in each_example['opinions']:
              # Checking if the current example contains a sentiment
              # related to `location-aspect`
              
              if opinion['target_entity'] == location:
                  df.loc[ii] = [id, text,opinion['aspect'], opinion['sentiment']]
                  aspect_found = True
                  ii += 1
                  break
          
          # If no sentiment is found for `location-asppect` in current 
          # example, then add a datapoint with None.
          
          if not aspect_found:
              df.loc[ii] = [id, text,opinion['aspect'], 'None']
              ii += 1
        

In [6]:
df.head(5)

Unnamed: 0,id,text,aspect,sentiment
0,1430,LOCATION1 is transforming and the prices will ...,price,Negative
1,2013,Along LOCATION1 there are lots of Electronics ...,shopping,Positive
2,1244,And LOCATION1 is ten mins direct on the tube t...,transit-location,Positive
3,209,Another option is LOCATION1 which is very cent...,nightlife,Positive
4,2824,Best bet is around LOCATION2 and LOCATION1 are...,general,Positive


In [7]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

  after removing the cwd from sys.path.


In [8]:
df.head(20)

Unnamed: 0,id,text,aspect,sentiment
0,1430,LOCATION1 is transforming and the prices will go up and up,price,Negative
1,2013,Along LOCATION1 there are lots of Electronics shops (independent ones),shopping,Positive
2,1244,And LOCATION1 is ten mins direct on the tube to LOCATION2:,transit-location,Positive
3,209,Another option is LOCATION1 which is very central and has tons of clubs/bars within walking distance of each other,nightlife,Positive
4,2824,Best bet is around LOCATION2 and LOCATION1 area in the northwest corner really nice,general,Positive
5,1835,Central London based taxis mostly refuse fares to suburban areas like LOCATION1,transit-location,Negative
6,1429,Don't go looking at places like LOCATION1 now that IS a rough area you really wouldn't want to live in,general,Negative
7,1404,"Down here in South London the accent [local] tends to be a bit more Kent, while the accent [local] in LOCATION1 tends to be a bit more Essex and Hertfordshire",general,
8,190,Everyone in LOCATION1 is now black or Bangladeshi,multicultural,Positive
9,1281,For gods sake don't move to LOCATION1 its horrible,general,Negative


In [9]:
print(Counter(df['sentiment']))
print(Counter(df['aspect']))

Counter({'Positive': 1486, 'None': 956, 'Negative': 535})
Counter({'general': 1226, 'price': 359, 'transit-location': 351, 'safety': 272, 'live': 209, 'shopping': 124, 'nightlife': 115, 'multicultural': 102, 'green-nature': 75, 'touristy': 54, 'dining': 49, 'quiet': 41})


## Processing Validation data-set

In [10]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-dev.json', 'r') as fp:
    validation_set = json.load(fp)

In [11]:
locations = ['LOCATION1']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

for location in locations:

  df_valid = pd.DataFrame({'id': [], 'text': [],'aspect':[], 'sentiment': []})

  ii = 0
  for each_example in validation_set:
      id = str(int(each_example['id']))
      text = each_example['text'].strip()
      
      # If `location` is present in the text, only then iterate over the  
      # list of opinions to find suitable `location-aspect` datapoints.

      if location in text:
          aspect_found = False
          
          for opinion in each_example['opinions']:
              # Checking if the current example contains a sentiment
              # related to `location-aspect`
              
              if opinion['target_entity'] == location:
                  df_valid.loc[ii] = [id, text,opinion['aspect'], opinion['sentiment']]
                  aspect_found = True
                  ii += 1
                  break
          
          # If no sentiment is found for `location-asppect` in current 
          # example, then add a datapoint with None.
          
          if not aspect_found:
              df_valid.loc[ii] = [id, text,opinion['aspect'], 'None']
              ii += 1
        

        # df.to_csv('/content/drive/My Drive/SentiHood/Bert-single/TrainingData/' + str(location) + str(aspect) + '.csv', index=False)
        # print(f"{location}{aspect} DONE!\tLength = {ii}")

In [12]:
df_valid.head()

Unnamed: 0,id,text,aspect,sentiment
0,302,LOCATION1 is just a normal area that happens to have an alternative market,shopping,Positive
1,460,""" My mate then went on to ask: ""Well, isn't LOCATION2 considered PART of LOCATION1",shopping,
2,582,"""I'm from LOCATION1 so I'm hard""",shopping,
3,465,'Bo-bos' - bourgeois bohemians - are particularly prevalent in these parts (these are the British equivalent of American Trustafarians - young boys and girls who have a rich mummy and daddy but who pretend to be poor) Its in the suburbs of London in LOCATION1,shopping,
4,270,"( I was born n maternity hospital in Clapton ) Lived in LOCATION2 and LOCATION1 , Not now though thanks",shopping,


In [13]:
df_valid.shape

(747, 4)

In [14]:
print(Counter(df_valid['sentiment']))
print(Counter(df_valid['aspect']))

Counter({'Positive': 374, 'None': 242, 'Negative': 131})
Counter({'general': 293, 'transit-location': 98, 'price': 84, 'safety': 83, 'live': 58, 'shopping': 37, 'nightlife': 21, 'multicultural': 21, 'dining': 15, 'green-nature': 14, 'quiet': 13, 'touristy': 10})


## Processing Test data-set 

In [15]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-test.json', 'r') as fp:
    testing_set = json.load(fp)  

In [16]:
locations = ['LOCATION1']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

for location in locations:

  df_test = pd.DataFrame({'id': [], 'text': [],'aspect':[], 'sentiment': []})

  ii = 0
  for each_example in testing_set:
      id = str(int(each_example['id']))
      text = each_example['text'].strip()
      
      # If `location` is present in the text, only then iterate over the  
      # list of opinions to find suitable `location-aspect` datapoints.

      if location in text:
          aspect_found = False
          
          for opinion in each_example['opinions']:
              # Checking if the current example contains a sentiment
              # related to `location-aspect`
              
              if opinion['target_entity'] == location:
                  df_test.loc[ii] = [id, text,opinion['aspect'], opinion['sentiment']]
                  aspect_found = True
                  ii += 1
                  break
          
          # If no sentiment is found for `location-asppect` in current 
          # example, then add a datapoint with None.
          
          if not aspect_found:
              df_test.loc[ii] = [id, text,opinion['aspect'], 'None']
              ii += 1

In [17]:
print(Counter(df_test['sentiment']))
print(Counter(df_test['aspect']))

Counter({'Positive': 719, 'None': 491, 'Negative': 281})
Counter({'general': 628, 'safety': 179, 'transit-location': 177, 'price': 146, 'live': 100, 'nightlife': 55, 'multicultural': 50, 'shopping': 49, 'green-nature': 39, 'touristy': 29, 'dining': 22, 'quiet': 17})


## Data Cleaning

In [18]:
# Combining training, validation and testing data into final_data
data=[df,df_valid,df_test]
Final_data=pd.concat(data)    

In [19]:
Final_data.shape

(5215, 4)

In [20]:
from google.colab import files
df.to_csv('Final_data.csv') 
files.download('Final_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
# Changing aspect to labels
label_encoder=LabelEncoder()
integer_category = label_encoder.fit_transform(Final_data.aspect)
Final_data['label_aspect']=pd.DataFrame(integer_category)

# Changing sentiment to numerical label
Final_data['sentiment']=Final_data['sentiment'].map({'None':0,'Negative':1,'Positive':2})

In [22]:
Final_data.head()

Unnamed: 0,id,text,aspect,sentiment,label_aspect
0,1430,LOCATION1 is transforming and the prices will go up and up,price,1,6
1,2013,Along LOCATION1 there are lots of Electronics shops (independent ones),shopping,2,9
2,1244,And LOCATION1 is ten mins direct on the tube to LOCATION2:,transit-location,2,11
3,209,Another option is LOCATION1 which is very central and has tons of clubs/bars within walking distance of each other,nightlife,2,5
4,2824,Best bet is around LOCATION2 and LOCATION1 area in the northwest corner really nice,general,2,1


In [23]:
# Combining aspect and sentence in train data
Final_data['combined text']=Final_data['text']+str(" ")+Final_data['aspect']

## Model building

In [24]:
# Splitting the final_data into train and test
X,y= train_test_split(Final_data, test_size=0.25, random_state=70)

In [25]:
X.head()

Unnamed: 0,id,text,aspect,sentiment,label_aspect,combined text
1551,2418,"LOCATION1 has much character - However , crime there is quite high",general,2,1,"LOCATION1 has much character - However , crime there is quite high general"
1435,454,i live in LOCATION2 and have been to LOCATION1 before,general,0,1,i live in LOCATION2 and have been to LOCATION1 before general
339,740,"LOCATION1 is a bit of a dump, LOCATION2 is slightly nicer",general,1,11,"LOCATION1 is a bit of a dump, LOCATION2 is slightly nicer general"
734,1044,For example; I live in LOCATION1 but I always say LOCATION2,multicultural,0,4,For example; I live in LOCATION1 but I always say LOCATION2 multicultural
898,560,I didn't even know there was a north LOCATION1 depends which bit,general,0,1,I didn't even know there was a north LOCATION1 depends which bit general


In [26]:
y.shape

(1304, 6)

In [27]:
X.head()

Unnamed: 0,id,text,aspect,sentiment,label_aspect,combined text
1551,2418,"LOCATION1 has much character - However , crime there is quite high",general,2,1,"LOCATION1 has much character - However , crime there is quite high general"
1435,454,i live in LOCATION2 and have been to LOCATION1 before,general,0,1,i live in LOCATION2 and have been to LOCATION1 before general
339,740,"LOCATION1 is a bit of a dump, LOCATION2 is slightly nicer",general,1,11,"LOCATION1 is a bit of a dump, LOCATION2 is slightly nicer general"
734,1044,For example; I live in LOCATION1 but I always say LOCATION2,multicultural,0,4,For example; I live in LOCATION1 but I always say LOCATION2 multicultural
898,560,I didn't even know there was a north LOCATION1 depends which bit,general,0,1,I didn't even know there was a north LOCATION1 depends which bit general


### Machine Learning models using only TEXT

#### i. Naive Bayes 


In [28]:
# applying tf-idf
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X['text'])
y_tfidf=tfidf.transform(y['text'])

In [29]:
# applying Naive Bayes Model 
MNB = MultinomialNB()
MNB_sen=MNB.fit(X_tfidf,X['sentiment'])


In [30]:
# def MNB_sentiment():
predicted_train = MNB_sen.predict(X_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['sentiment'])))
predicted_test= MNB_sen.predict(y_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['sentiment'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['sentiment']))


The accuracy score of train data 0.702889286627461
The accuracy score of test data 0.6211656441717791
------------test data-------
[[159   8   9]
 [  0  13   0]
 [279 198 638]]


In [31]:
print(Counter(predicted_test))
print(Counter(y['sentiment']))

Counter({2: 1115, 0: 176, 1: 13})
Counter({2: 647, 0: 438, 1: 219})


In [32]:
#printing output results for train and test data
MNB_asp=MNB.fit(X_tfidf,X['label_aspect'])
predicted_train_asp = MNB_asp.predict(X_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train_asp, X['label_aspect'])))
predicted_test_asp= MNB_asp.predict(y_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test_asp, y['label_aspect'])))
print("------------test data-------")
print(confusion_matrix(predicted_test_asp,y['label_aspect']))

The accuracy score of train data 0.41012528765021733
The accuracy score of test data 0.4087423312883436
------------test data-------
[[  0   0   0   0   0   0   0   0   0   0   0   0]
 [ 12 533  27 104  45  39 166  21 131  46  22 157]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   1   0   0   0   0   0   0   0   0   0   0]]


In [33]:
print(Counter(predicted_test_asp))
print(Counter(y['label_aspect']))

Counter({1: 1303, 11: 1})
Counter({1: 534, 6: 166, 11: 157, 8: 131, 3: 104, 9: 46, 4: 45, 5: 39, 2: 27, 10: 22, 7: 21, 0: 12})


#### ii. Logistic Regression 


In [34]:
# applying tf-idf
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X['text'])
y_tfidf=tfidf.transform(y['text'])

In [35]:
# Applying Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_tfidf,X['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
#printing output results for train and test data
predicted_train = logreg.predict(X_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['sentiment'])))
predicted_test= logreg.predict(y_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['sentiment'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['sentiment']))

The accuracy score of train data 0.8491434415750447
The accuracy score of test data 0.7323619631901841
------------test data-------
[[317  36  61]
 [ 14  91  39]
 [107  92 547]]


In [37]:
print(Counter(predicted_test))
print(Counter(y['sentiment']))

Counter({2: 746, 0: 414, 1: 144})
Counter({2: 647, 0: 438, 1: 219})


#### iii.SVM  

In [38]:
# applying tf-idf
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X['text'])
y_tfidf=tfidf.transform(y['text'])

In [39]:
#applying SVM model
classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(X_tfidf,X['sentiment'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [40]:
# def SVMclassifier_sentiment():
predicted_train = classifier_linear.predict(X_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['sentiment'])))
predicted_test= classifier_linear.predict(y_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['sentiment'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['sentiment']))

The accuracy score of train data 0.8716440807977499
The accuracy score of test data 0.7315950920245399
------------test data-------
[[322  31  66]
 [ 14  97  46]
 [102  91 535]]


In [41]:
print(Counter(predicted_test))
print(Counter(y['sentiment']))

Counter({2: 728, 0: 419, 1: 157})
Counter({2: 647, 0: 438, 1: 219})


### Machine Learning models using aspect and Text together

#### i. Naive Bayes

In [None]:
# applying tf-idf
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X['combined text'])
y_tfidf=tfidf.transform(y['combined text'])

In [None]:
MNB = MultinomialNB()
MNB_sen=MNB.fit(X_tfidf,X['sentiment'])

In [None]:
# def MNB_sentiment():
predicted_train = MNB_sen.predict(X_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['sentiment'])))
predicted_test= MNB_sen.predict(y_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['sentiment'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['sentiment']))

The accuracy score of train data 0.6957299923293275
The accuracy score of test data 0.61579754601227
------------test data-------
[[152   9   8]
 [  0  12   0]
 [286 198 639]]


In [None]:
print(Counter(predicted_test))
print(Counter(y['sentiment']))

Counter({2: 1123, 0: 169, 1: 12})
Counter({2: 647, 0: 438, 1: 219})


In [None]:
#printing results for test and train
MNB_asp=MNB.fit(X_tfidf,X['label_aspect'])
predicted_train_asp = MNB_asp.predict(X_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train_asp, X['label_aspect'])))
predicted_test_asp= MNB_asp.predict(y_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test_asp, y['label_aspect'])))
print("------------test data-------")
print(confusion_matrix(predicted_test_asp,y['label_aspect']))

The accuracy score of train data 0.42060854001534137
The accuracy score of test data 0.41180981595092025
------------test data-------
[[  0   0   0   0   0   0   0   0   0   0   0   0]
 [ 12 531  27 103  45  39 165  21 131  46  22 152]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   1   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   3   0   1   0   0   0   0   0   0   0   5]]


In [None]:
print(Counter(predicted_test_asp))
print(Counter(y['label_aspect']))

Counter({1: 1294, 11: 9, 6: 1})
Counter({1: 534, 6: 166, 11: 157, 8: 131, 3: 104, 9: 46, 4: 45, 5: 39, 2: 27, 10: 22, 7: 21, 0: 12})


#### ii.Logistic Regression

In [None]:
#logistic regression model
logreg = LogisticRegression()
logreg.fit(X_tfidf,X['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# def log_sentiment():
predicted_train = logreg.predict(X_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['sentiment'])))
predicted_test= logreg.predict(y_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['sentiment'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['sentiment']))

The accuracy score of train data 0.8563027358731782
The accuracy score of test data 0.75
------------test data-------
[[309  41  59]
 [ 19 109  28]
 [110  69 560]]


In [None]:
print(Counter(predicted_test))
print(Counter(y['sentiment']))

Counter({2: 739, 0: 409, 1: 156})
Counter({2: 647, 0: 438, 1: 219})


In [None]:
#printing results of train and test
MNB_asp=logreg.fit(X_tfidf,X['label_aspect'])
predicted_train_asp = MNB_asp.predict(X_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train_asp, X['label_aspect'])))
predicted_test_asp= MNB_asp.predict(y_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test_asp, y['label_aspect'])))
print("------------test data-------")
print(confusion_matrix(predicted_test_asp,y['label_aspect']))

The accuracy score of train data 0.6977755049859371
The accuracy score of test data 0.5866564417177914
------------test data-------
[[  3   1   0   0   0   0   0   0   0   0   0   0]
 [  4 449  14  53  21  18  66  13  52  21  16  64]
 [  0   2   9   3   0   0   0   0   1   0   0   1]
 [  0  11   0  35   1   1   3   1   2   1   0   2]
 [  0   0   0   1  16   1   0   0   2   1   0   0]
 [  0   4   0   1   0  13   1   0   3   1   0   1]
 [  0  21   2   3   2   3  84   1   6   1   0   5]
 [  0   1   0   0   0   0   0   1   1   0   0   0]
 [  1  16   1   1   3   1   6   2  58   0   0   6]
 [  1   1   1   0   0   2   1   1   2  18   1   3]
 [  0   0   0   2   0   0   0   0   0   0   4   0]
 [  3  28   0   5   2   0   5   2   4   3   1  75]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
print(Counter(predicted_test_asp))
print(Counter(y['label_aspect']))

Counter({1: 791, 11: 128, 6: 128, 8: 95, 3: 57, 9: 31, 5: 24, 4: 21, 2: 16, 10: 6, 0: 4, 7: 3})
Counter({1: 534, 6: 166, 11: 157, 8: 131, 3: 104, 9: 46, 4: 45, 5: 39, 2: 27, 10: 22, 7: 21, 0: 12})


#### iii. SVM

In [None]:
# SVM model
classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(X_tfidf,X['sentiment'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# def MNB_sentiment():
predicted_train = classifier_linear.predict(X_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['sentiment'])))
predicted_test= classifier_linear.predict(y_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['sentiment'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['sentiment']))

The accuracy score of train data 0.8849399130657121
The accuracy score of test data 0.7553680981595092
------------test data-------
[[314  35  57]
 [ 16 112  31]
 [108  72 559]]


In [None]:
print(Counter(predicted_test))
print(Counter(y['sentiment']))

Counter({2: 739, 0: 406, 1: 159})
Counter({2: 647, 0: 438, 1: 219})


In [None]:
# printing results for train and test 
MNB_asp=classifier_linear.fit(X_tfidf,X['label_aspect'])
predicted_train_asp = MNB_asp.predict(X_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train_asp, X['label_aspect'])))
predicted_test_asp= MNB_asp.predict(y_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test_asp, y['label_aspect'])))
print("------------test data-------")
print(confusion_matrix(predicted_test_asp,y['label_aspect']))

The accuracy score of train data 0.7266683712605472
The accuracy score of test data 0.6219325153374233
------------test data-------
[[  5   1   0   0   1   0   1   0   0   0   0   2]
 [  3 422   4  47  10  12  50   7  39  15   6  47]
 [  0   3  18   3   0   0   0   0   1   0   0   2]
 [  0  11   0  36   1   1   4   0   2   1   0   3]
 [  0   3   0   1  26   1   1   0   2   1   1   1]
 [  0   7   0   2   0  20   3   0   3   0   0   2]
 [  0  22   2   4   1   2  87   1   6   1   0   5]
 [  0   5   0   0   2   0   3   7   2   1   0   0]
 [  1  23   1   3   2   1   8   2  70   1   0   6]
 [  0   3   1   1   0   2   3   1   2  23   1   4]
 [  2   2   0   2   0   0   0   0   0   0  13   1]
 [  1  32   1   5   2   0   6   3   4   3   1  84]]


In [None]:
print(Counter(predicted_test_asp))
print(Counter(y['label_aspect']))

Counter({1: 662, 11: 142, 6: 131, 8: 118, 3: 59, 9: 41, 4: 37, 5: 37, 2: 27, 10: 20, 7: 20, 0: 10})
Counter({1: 534, 6: 166, 11: 157, 8: 131, 3: 104, 9: 46, 4: 45, 5: 39, 2: 27, 10: 22, 7: 21, 0: 12})


### One-hot encoding

In [None]:
from scipy.sparse import coo_matrix, hstack
from scipy import stats
from scipy.stats import mannwhitneyu

In [None]:
# define one hot encoding
X_encoder = pd.get_dummies(X['aspect'])
Y_encoder=pd.get_dummies(y['aspect'])
# # transform data
# onehot = encoder.fit_transform(X['aspect'])

In [None]:
X_encoder.shape

(3911, 12)

In [None]:
Y_encoder.shape

(1304, 12)

In [None]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X['text'])
y_tfidf=tfidf.transform(y['text'])

In [None]:
# tfidf = TfidfVectorizer()
# X_onehot_tfidf = tfidf.fit_transform(X_encoder)
# Y_onehot_tfidf=tfidf.fit_transform(Y_encoder)

In [None]:
# enc = OneHotEncoder(handle_unknown='ignore')
# Xone_encoder = enc.get_dummies(X['aspect'])
# Yone_encoder=pd.get_dummies(y['aspect'])

In [None]:
trainX_tfidf = hstack([X_tfidf, X_encoder])
testX_tfidf = hstack([y_tfidf, Y_encoder])

#### i. Naive bayes

In [None]:
#Naive Bayes model
MNB = MultinomialNB()
MNB_sen=MNB.fit(trainX_tfidf,X['sentiment'])

In [None]:
# def MNB_sentiment():
predicted_train = MNB_sen.predict(trainX_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['sentiment'])))
predicted_test= MNB_sen.predict(testX_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['sentiment'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['sentiment']))

The accuracy score of train data 0.7228330350294042
The accuracy score of test data 0.6357361963190185
------------test data-------
[[149  13   8]
 [  5  49   8]
 [284 157 631]]


In [None]:
print(Counter(predicted_test))
print(Counter(y['sentiment']))

Counter({2: 1072, 0: 170, 1: 62})
Counter({2: 647, 0: 438, 1: 219})


In [None]:
MNB = MultinomialNB()
MNB_sen=MNB.fit(trainX_tfidf,X['label_aspect'])

In [None]:
# def MNB_sentiment():
predicted_train = MNB_sen.predict(trainX_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['label_aspect'])))
predicted_test= MNB_sen.predict(testX_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['label_aspect'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['label_aspect']))

The accuracy score of train data 0.5699309639478394
The accuracy score of test data 0.4915644171779141
------------test data-------
[[  0   0   0   0   0   0   0   0   0   0   0   0]
 [ 12 509  25  87  41  38 108  18 108  42  22 103]
 [  0   0   0   1   0   0   0   0   0   0   0   0]
 [  0   1   0  10   0   0   0   0   0   0   0   0]
 [  0   0   0   1   2   0   0   0   0   1   0   0]
 [  0   0   0   0   0   1   0   0   0   0   0   0]
 [  0   8   2   1   0   0  51   1   4   0   0   2]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   3   0   0   1   0   2   1  17   0   0   2]
 [  0   1   0   0   0   0   1   0   0   1   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  12   0   4   1   0   4   1   2   2   0  50]]


In [None]:
print(Counter(predicted_test))
print(Counter(y['label_aspect']))

Counter({1: 1113, 11: 76, 6: 69, 8: 26, 3: 11, 4: 4, 9: 3, 5: 1, 2: 1})
Counter({1: 534, 6: 166, 11: 157, 8: 131, 3: 104, 9: 46, 4: 45, 5: 39, 2: 27, 10: 22, 7: 21, 0: 12})


#### ii. Logistic regression

In [None]:
# Logistic Regression model
logreg = LogisticRegression()
logreg.fit(trainX_tfidf,X['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Sentiment analysis

In [None]:
# def MNB_sentiment():
predicted_train = logreg.predict(trainX_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['sentiment'])))
predicted_test= logreg.predict(testX_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['sentiment'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['sentiment']))

The accuracy score of train data 0.8634620301713117
The accuracy score of test data 0.7507668711656442
------------test data-------
[[308  38  58]
 [ 20 116  34]
 [110  65 555]]


Aspect Analysis

In [None]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(trainX_tfidf,X['label_aspect'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# logistic regression on aspect
predicted_train = logreg.predict(trainX_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['label_aspect'])))
predicted_test= logreg.predict(testX_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['label_aspect'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['label_aspect']))

The accuracy score of train data 0.6816670928151368
The accuracy score of test data 0.6633435582822086
------------test data-------
[[  7   1   0   0   1   0   1   0   0   0   0   2]
 [  1 408   4  20   6   5  42   6  27  10   5  33]
 [  0   2  18   3   0   0   0   0   1   0   0   2]
 [  0  20   0  64   1   2   4   0   5   1   0   7]
 [  0   2   0   1  31   1   2   0   3   1   1   2]
 [  0   7   0   1   0  24   3   0   3   1   0   2]
 [  0  22   2   3   0   2  96   1   7   1   0   6]
 [  0   6   0   0   2   1   2   8   2   0   0   1]
 [  1  27   1   3   2   1   8   2  77   1   0   6]
 [  0   4   1   1   0   3   2   1   2  28   1   5]
 [  2   3   0   3   0   0   0   0   0   0  14   1]
 [  1  32   1   5   2   0   6   3   4   3   1  90]]


In [None]:
print(Counter(predicted_test))
print(Counter(y['label_aspect']))

Counter({1: 567, 11: 148, 6: 140, 8: 129, 3: 104, 9: 48, 4: 44, 5: 41, 2: 26, 10: 23, 7: 22, 0: 12})
Counter({1: 534, 6: 166, 11: 157, 8: 131, 3: 104, 9: 46, 4: 45, 5: 39, 2: 27, 10: 22, 7: 21, 0: 12})


#### iii. SVM

In [None]:
#SVM model
classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(trainX_tfidf,X['sentiment'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
predicted_train = classifier_linear.predict(trainX_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['sentiment'])))
predicted_test= classifier_linear.predict(testX_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['sentiment'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['sentiment']))

The accuracy score of train data 0.8880081820506265
The accuracy score of test data 0.7661042944785276
------------test data-------
[[324  37  57]
 [ 17 116  31]
 [ 97  66 559]]


Aspect analysis

In [None]:
# SVM model
classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(trainX_tfidf,X['label_aspect'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# printing results of train and test
predicted_train = classifier_linear.predict(trainX_tfidf)
print("The accuracy score of train data {0}".format(accuracy_score(predicted_train, X['label_aspect'])))
predicted_test= classifier_linear.predict(testX_tfidf)
print("The accuracy score of test data {0}".format(accuracy_score(predicted_test, y['label_aspect'])))
print("------------test data-------")
print(confusion_matrix(predicted_test,y['label_aspect']))

The accuracy score of train data 0.6865251853745845
The accuracy score of test data 0.6648773006134969
------------test data-------
[[  7   1   0   0   1   0   1   0   0   0   0   2]
 [  1 408   4  19   6   5  41   6  26  10   5  32]
 [  0   2  18   3   0   0   0   0   1   0   0   2]
 [  0  20   0  64   1   2   4   0   5   1   0   7]
 [  0   2   0   1  31   1   2   0   3   1   1   2]
 [  0   7   0   1   0  24   3   0   3   1   0   2]
 [  0  22   2   3   0   2  96   1   7   1   0   6]
 [  0   6   0   0   2   1   2   8   2   0   0   1]
 [  1  27   1   4   2   1   8   2  78   1   0   6]
 [  0   4   1   1   0   3   2   1   2  28   1   5]
 [  2   3   0   3   0   0   0   0   0   0  14   1]
 [  1  32   1   5   2   0   7   3   4   3   1  91]]


In [None]:
print("Predicted aspect labels")
print(Counter(predicted_test))
print("Actual aspect labels")
print(Counter(y['label_aspect']))

Predicted aspect labels
Counter({1: 563, 11: 150, 6: 140, 8: 131, 3: 104, 9: 48, 4: 44, 5: 41, 2: 26, 10: 23, 7: 22, 0: 12})
Actual aspect labels
Counter({1: 534, 6: 166, 11: 157, 8: 131, 3: 104, 9: 46, 4: 45, 5: 39, 2: 27, 10: 22, 7: 21, 0: 12})
