This SVM model contains two parts:
1. One is to classify aspects given the reviews
2. Classify sentiments as posistive, negative and neutral given the review

In [3]:
import pandas as pd
import numpy as np

from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

In [4]:
# Read Data from csv
df_train = pd.read_csv('laptop_train_cleaned.csv')
df_test = pd.read_csv('laptop_test_cleaned.csv')
# print(df.columns)

In [28]:
# Training data
X_train = df_train['reviews']
X_test = df_test['reviews']
len(X_train)
len(X_test)

808

In [6]:
# Labels
y_train = df_train.drop(columns = ['reviews', 'Unnamed: 0', 'aspects', 'polarity'])
y_train = y_train.reindex(sorted(y_train.columns), axis=1)
y_test = df_test.drop(columns = ['reviews','Unnamed: 0', 'aspects', 'polarity'])
y_test = y_test.reindex(sorted(y_test.columns), axis=1)

In [7]:
display(y_train)
display(y_test)

Unnamed: 0,BATTERY#OPERATION_PERFORMANCE,BATTERY#QUALITY,COMPANY#GENERAL,CPU#DESIGN_FEATURES,CPU#MISCELLANEOUS,CPU#OPERATION_PERFORMANCE,DISPLAY#DESIGN_FEATURES,DISPLAY#GENERAL,DISPLAY#OPERATION_PERFORMANCE,DISPLAY#QUALITY,...,SHIPPING#QUALITY,SOFTWARE#DESIGN_FEATURES,SOFTWARE#GENERAL,SOFTWARE#OPERATION_PERFORMANCE,SOFTWARE#PRICE,SOFTWARE#QUALITY,SOFTWARE#USABILITY,SUPPORT#PRICE,SUPPORT#QUALITY,WARRANTY#GENERAL
0,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
1,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
2,positive,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
3,nothing,nothing,nothing,nothing,nothing,positive,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
4,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2494,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
2495,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
2496,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
2497,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing


Unnamed: 0,BATTERY#OPERATION_PERFORMANCE,BATTERY#QUALITY,COMPANY#GENERAL,CPU#DESIGN_FEATURES,CPU#MISCELLANEOUS,CPU#OPERATION_PERFORMANCE,DISPLAY#DESIGN_FEATURES,DISPLAY#GENERAL,DISPLAY#OPERATION_PERFORMANCE,DISPLAY#QUALITY,...,SHIPPING#QUALITY,SOFTWARE#DESIGN_FEATURES,SOFTWARE#GENERAL,SOFTWARE#OPERATION_PERFORMANCE,SOFTWARE#PRICE,SOFTWARE#QUALITY,SOFTWARE#USABILITY,SUPPORT#PRICE,SUPPORT#QUALITY,WARRANTY#GENERAL
0,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
1,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
2,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
3,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
4,nothing,nothing,positive,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
803,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
804,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
805,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing
806,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,...,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing,nothing


In [8]:
# Transform text into features
from sklearn.feature_extraction.text import TfidfVectorizer
# Create feature vectors
vectorizer = TfidfVectorizer(max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
X_train = vectorizer.fit_transform(df_train['reviews'])
X_test = vectorizer.transform(df_test['reviews'])

In [9]:
# Get Aspect Label Dataframe
y_train_aspect = pd.DataFrame()
for aspect in y_train.columns:
    y_train_aspect[aspect] = y_train[aspect].replace(to_replace ='positive', value = 1)  
    y_train_aspect[aspect] = y_train_aspect[aspect].replace(to_replace ='negative', value = 1)  
    y_train_aspect[aspect] = y_train_aspect[aspect].replace(to_replace ='neutral', value = 1)  
    y_train_aspect[aspect] = y_train_aspect[aspect].replace(to_replace ='nothing', value = 0)  
    
y_test_aspect = pd.DataFrame()
for aspect in y_train.columns:
    y_test_aspect[aspect] = y_test[aspect].replace(to_replace ='positive', value = 1)  
    y_test_aspect[aspect] = y_test_aspect[aspect].replace(to_replace ='negative', value = 1)  
    y_test_aspect[aspect] = y_test_aspect[aspect].replace(to_replace ='neutral', value = 1)  
    y_test_aspect[aspect] = y_test_aspect[aspect].replace(to_replace ='nothing', value = 0)      

In [10]:
# Get Positive Label Dataframe
y_train_positive = pd.DataFrame()
for aspect in y_train.columns:
    y_train_positive[aspect] = y_train[aspect].replace(to_replace ='positive', value = 1)    
    y_train_positive[aspect] = y_train_positive[aspect].replace(to_replace='negative', value = 0)
    y_train_positive[aspect] = y_train_positive[aspect].replace(to_replace='neutral', value = 0)
    y_train_positive[aspect] = y_train_positive[aspect].replace(to_replace='nothing', value = 0)
    
y_test_positive = pd.DataFrame()
for aspect in y_train.columns:
    y_test_positive[aspect] = y_test[aspect].replace(to_replace ='positive', value = 1)    
    y_test_positive[aspect] = y_test_positive[aspect].replace(to_replace='negative', value = 0)
    y_test_positive[aspect] = y_test_positive[aspect].replace(to_replace='neutral', value = 0)
    y_test_positive[aspect] = y_test_positive[aspect].replace(to_replace='nothing', value = 0)    

In [11]:
# For Negative Label Dataframe
y_train_negative = pd.DataFrame()
for aspect in y_train.columns:
    y_train_negative[aspect] = y_train[aspect].replace(to_replace ='negative', value = 1)    
    y_train_negative[aspect] = y_train_negative[aspect].replace(to_replace='positive', value = 0)
    y_train_negative[aspect] = y_train_negative[aspect].replace(to_replace='neutral', value = 0)
    y_train_negative[aspect] = y_train_negative[aspect].replace(to_replace='nothing', value = 0)
    
y_test_negative = pd.DataFrame()
for aspect in y_train.columns:
    y_test_negative[aspect] = y_test[aspect].replace(to_replace ='negative', value = 1)    
    y_test_negative[aspect] = y_test_negative[aspect].replace(to_replace='positive', value = 0)
    y_test_negative[aspect] = y_test_negative[aspect].replace(to_replace='neutral', value = 0)
    y_test_negative[aspect] = y_test_negative[aspect].replace(to_replace='nothing', value = 0)    

In [12]:
# Get Neutral Dataset
y_train_neutral = pd.DataFrame()
for aspect in y_train.columns:
    y_train_neutral[aspect] = y_train[aspect].replace(to_replace ='neutral', value = 1)    
    y_train_neutral[aspect] = y_train_neutral[aspect].replace(to_replace='positive', value = 0)
    y_train_neutral[aspect] = y_train_neutral[aspect].replace(to_replace='negative', value = 0)
    y_train_neutral[aspect] = y_train_neutral[aspect].replace(to_replace='nothing', value = 1)
    
y_test_neutral = pd.DataFrame()
for aspect in y_train.columns:
    y_test_neutral[aspect] = y_test[aspect].replace(to_replace ='neutral', value = 1)    
    y_test_neutral[aspect] = y_test_neutral[aspect].replace(to_replace='positive', value = 0)
    y_test_neutral[aspect] = y_test_neutral[aspect].replace(to_replace='negative', value = 0)
    y_test_neutral[aspect] = y_test_neutral[aspect].replace(to_replace='nothing', value = 1)    

In [13]:
from sklearn import metrics
def print_metrices(y_test,y_pred_class,y_pred_class_svc,y_pred_class_lin_svc,y_pred_class_sgd):
    print("Accuracy:")
    print(metrics.accuracy_score(y_test,y_pred_class))
    print(metrics.accuracy_score(y_test,y_pred_class_svc))
    print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))
    print(metrics.accuracy_score(y_test,y_pred_class_sgd))

    print("\nAverage precision:")
    print(metrics.precision_score(y_test,y_pred_class,average='micro'))
    print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))
    print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))
    print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))

    print("\nAverage recall:")
    print(metrics.recall_score(y_test,y_pred_class,average='micro'))
    print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))
    print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))
    print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))
    
    print("\nAverage f1:")
    print(metrics.f1_score(y_test,y_pred_class,average='micro'))
    print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))
    print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))
    print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))

    print("\nClassification report:")
    print(metrics.classification_report(y_test, y_pred_class))
    print(metrics.classification_report(y_test, y_pred_class_svc))
    print(metrics.classification_report(y_test, y_pred_class_lin_svc))
    print(metrics.classification_report(y_test, y_pred_class_sgd))

In [14]:
# Part 1: Aspect Extraction

#Create various models. These are multi-label models.
nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train, y_train_aspect) #MULTINOMIAL NAIVE BAYES
C = 1.0 #SVregularization parameter
svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train, y_train_aspect) #SVM LINEAR - L1 NORM
lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train, y_train_aspect) #L2 NORM SVM
sgd = OneVsRestClassifier(SGDClassifier()).fit(X_train,y_train_aspect)

In [17]:
y_pred_class = nb_classif.predict(X_test)
y_pred_class_svc = svc.predict(X_test)
y_pred_class_lin_svc = lin_svc.predict(X_test)
y_pred_class_sgd = sgd.predict(X_test)

# from sklearn import metrics
# print('MultiNomial Nave Bayes', metrics.accuracy_score(y_test_aspect.values,y_pred_class))

# from sklearn import metrics
# print('L1 norm SVM', metrics.accuracy_score(y_test_aspect.values,y_pred_class_svc))

# from sklearn import metrics
# print('L2 norm SVM', metrics.accuracy_score(y_test_aspect.values,y_pred_class_lin_svc))

# from sklearn import metrics
# print('SGD', metrics.accuracy_score(y_test_aspect.values,y_pred_class_sgd))

print_metrices(y_test_aspect,y_pred_class,y_pred_class_svc,y_pred_class_lin_svc,y_pred_class_sgd)

Accuracy:
0.4282178217821782
0.48886138613861385
0.4876237623762376
0.4777227722772277

Average precision:
0.75
0.7449664429530202
0.6766169154228856
0.5572755417956656

Average recall:
0.004784688995215311
0.17703349282296652
0.21690590111642744
0.28708133971291866

Average f1:
0.009508716323296354
0.2860824742268041
0.3285024154589372
0.37894736842105264

Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00        38
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00        13
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         8
           9       0.00      0.00      0.00        20
          10       0.0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [16]:
display(y_train_neutral)

Unnamed: 0,BATTERY#OPERATION_PERFORMANCE,BATTERY#QUALITY,COMPANY#GENERAL,CPU#DESIGN_FEATURES,CPU#MISCELLANEOUS,CPU#OPERATION_PERFORMANCE,DISPLAY#DESIGN_FEATURES,DISPLAY#GENERAL,DISPLAY#OPERATION_PERFORMANCE,DISPLAY#QUALITY,...,SHIPPING#QUALITY,SOFTWARE#DESIGN_FEATURES,SOFTWARE#GENERAL,SOFTWARE#OPERATION_PERFORMANCE,SOFTWARE#PRICE,SOFTWARE#QUALITY,SOFTWARE#USABILITY,SUPPORT#PRICE,SUPPORT#QUALITY,WARRANTY#GENERAL
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,0,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2494,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2495,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2496,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2497,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [23]:
# Part 2: Classifying Sentiments
def classify_sentiments(X_train, y_train, X_test, y_test):
    C = 1.0 #SVregularization parameter
    nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train, y_train)
    svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train, y_train)
    lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train, y_train)
    sgd = OneVsRestClassifier(SGDClassifier()).fit(X_train,y_train)

    y_pred_class = nb_classif.predict(X_test)
    y_pred_class_svc = svc.predict(X_test)
    y_pred_class_lin_svc = lin_svc.predict(X_test)
    y_pred_class_sgd = sgd.predict(X_test)
#     from sklearn import metrics
#     print('MultiNomial Nave Bayes', metrics.accuracy_score(y_test.values,y_pred_class))

#     from sklearn import metrics
#     print('L1 norm SVM', metrics.accuracy_score(y_test.values,y_pred_class_svc))

#     from sklearn import metrics
#     print('L2 norm SVM', metrics.accuracy_score(y_test.values,y_pred_class_lin_svc))

#     from sklearn import metrics
#     print('SGD', metrics.accuracy_score(y_test.values,y_pred_class_sgd))

    print_metrices(y_test, y_pred_class,y_pred_class_svc,y_pred_class_lin_svc,y_pred_class_sgd)
    return

In [24]:
# Classifying Positive Sentiments
import warnings
warnings.filterwarnings("ignore")

classify_sentiments(X_train, y_train_positive, X_test, y_test_positive)

Accuracy:
0.6844059405940595
0.7029702970297029
0.7066831683168316
0.7017326732673267

Average precision:
0.0
0.7619047619047619
0.7142857142857143
0.60431654676259

Average recall:
0.0
0.13714285714285715
0.18571428571428572
0.24

Average f1:
0.0
0.23244552058111384
0.29478458049886624
0.34355828220858897

Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        17
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00        16
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         7
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00        14
          10       0.00      0.00      0.00         1
          11       

In [25]:
# Classifying Negative Sentiments
import warnings
warnings.filterwarnings("ignore")

classify_sentiments(X_train, y_train_negative, X_test, y_test_negative)

Accuracy:
0.7648514851485149
0.7698019801980198
0.7710396039603961
0.75

Average precision:
0.0
0.5555555555555556
0.5555555555555556
0.34782608695652173

Average recall:
0.0
0.021367521367521368
0.0641025641025641
0.10256410256410256

Average f1:
0.0
0.0411522633744856
0.11494252873563218
0.15841584158415842

Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00         3
          11    

In [26]:
# Classifying Neutral Sentiments
import warnings
warnings.filterwarnings("ignore")

classify_sentiments(X_train, y_train_neutral, X_test, y_test_neutral)

Accuracy:
0.4591584158415842
0.5086633663366337
0.5173267326732673
0.5024752475247525

Average precision:
0.9877700860079714
0.98950821050861
0.9901690419552449
0.9909255898366606

Average recall:
0.9999787631668365
0.9994478423377506
0.9988956846755012
0.9971967380224261

Average f1:
0.9938369319740812
0.9944531902080319
0.9945132200737913
0.9940512733662171

Classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       789
           1       0.99      1.00      1.00       803
           2       0.95      1.00      0.98       770
           3       1.00      1.00      1.00       805
           4       1.00      1.00      1.00       807
           5       1.00      1.00      1.00       806
           6       0.99      1.00      0.99       797
           7       1.00      1.00      1.00       804
           8       0.99      1.00      1.00       802
           9       0.98      1.00      0.99       789
          10       

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       789
           1       0.99      1.00      1.00       803
           2       0.96      0.99      0.98       770
           3       1.00      1.00      1.00       805
           4       1.00      1.00      1.00       807
           5       1.00      1.00      1.00       806
           6       0.99      1.00      0.99       797
           7       1.00      0.99      0.99       804
           8       0.99      1.00      1.00       802
           9       0.98      1.00      0.99       789
          10       1.00      1.00      1.00       804
          11       1.00      1.00      1.00       806
          12       1.00      1.00      1.00       807
          13       1.00      1.00      1.00       807
          14       1.00      1.00      1.00       804
          15       0.99      1.00      1.00       797
          16       1.00      1.00      1.00       807
          17       0.99    