In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, log_loss

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Notebook-wide pandas display options: column width 0 and no row limit,
# presumably so long review texts and full result tables are not truncated.
pd.set_option('display.max_colwidth', 0, 'display.max_rows', None)

In [2]:
# Column names imposed on df_reviews.csv (the file's own header is not used).
cols = [
    'index',
    'app_id',
    'app_name',
    'review_score',
    'review_votes',
    'num_words',
    'review_text',
    'vader_score',
    'sentiment',
]

# Read the file, keep only the text and its label, and drop rows where either is missing.
# NOTE(review): skiprows = 2 skips the first two physical lines of the file; if the
# CSV has a single header line this also discards the first review - confirm.
df = (
    pd.read_csv('df_reviews.csv', names = cols, skiprows = 2)
    .loc[:, ['review_text', 'sentiment']]
    .dropna()
)

In [3]:
# Shared TfidfVectorizer settings: vocabulary size cap, minimum document count
# (int) and maximum document proportion (float) for a term to be kept.
max_features, min_df, max_df = 10000, 100, 0.95

In [4]:
# Scorer names passed to cross_validate; results come back under 'test_<name>' keys.
scorings = [
    'accuracy',
    'precision_weighted',
    'recall_weighted',
    'f1_weighted',
    'roc_auc_ovr_weighted',
]

# One "<metric> Mean" / "<metric> Std" column pair per scorer, in scorer order.
cols = [
    '{} {}'.format(metric, stat)
    for metric in ('Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC')
    for stat in ('Mean', 'Std')
]

# Empty comparison table; add_cv_scores writes one row per classifier into it.
explore_df = pd.DataFrame(columns = cols)

def add_cv_scores(clf, clf_name):
    """Cross-validate ``clf`` on the review data and record the summary scores.

    Runs 5-fold ``cross_validate`` with the module-level ``scorings`` on
    ``df['review_text']`` (inputs) and ``df['sentiment']`` (targets), then
    stores the mean and standard deviation of each test metric as the row
    ``clf_name`` of the module-level ``explore_df``.

    Parameters
    ----------
    clf : estimator
        Passed to ``cross_validate`` together with the raw review texts, so it
        must accept text input (the callers pass vectorizer + classifier pipelines).
    clf_name : str
        Row label used in ``explore_df``; an existing row with the same label
        is overwritten.

    Raises
    ------
    Any exception raised while fitting or scoring a fold is propagated
    (``error_score = 'raise'``).
    """
    cv_results = cross_validate(
        clf,
        df['review_text'].values,
        df['sentiment'].values,
        scoring = scorings,
        cv = 5,
        error_score = 'raise'
    )

    # Result keys in the same order as the explore_df column pairs.
    metric_keys = (
        'test_accuracy',
        'test_precision_weighted',
        'test_recall_weighted',
        'test_f1_weighted',
        'test_roc_auc_ovr_weighted',
    )

    row = []
    for key in metric_keys:
        fold_scores = cv_results[key]
        row.extend([fold_scores.mean(), fold_scores.std()])

    explore_df.loc[clf_name] = row

In [5]:
%%time

# TF-IDF features feeding a class-balanced logistic regression; scored with add_cv_scores.
LogisticRegression_clf = make_pipeline(
    TfidfVectorizer(max_features = max_features, min_df = min_df, max_df = max_df),
    LogisticRegression(class_weight = 'balanced', random_state = 0),
)
add_cv_scores(LogisticRegression_clf, 'LogisticRegression')

CPU times: user 6min 10s, sys: 4min 22s, total: 10min 33s
Wall time: 4min 7s


In [6]:
%%time

# TF-IDF features feeding a class-balanced LinearSVC. The SVM is wrapped in
# CalibratedClassifierCV, which provides the probability estimates that the
# ROC AUC scorer used by add_cv_scores can work from.
LinearSVC_clf = make_pipeline(
    TfidfVectorizer(max_features = max_features, min_df = min_df, max_df = max_df),
    CalibratedClassifierCV(
        LinearSVC(class_weight = 'balanced', random_state = 0)
    ),
)
add_cv_scores(LinearSVC_clf, 'LinearSVC')

CPU times: user 7min 36s, sys: 0 ns, total: 7min 36s
Wall time: 7min 36s


In [7]:
%%time

# TF-IDF features feeding a multinomial naive Bayes model with default settings.
MultinomialNB_clf = make_pipeline(
    TfidfVectorizer(max_features = max_features, min_df = min_df, max_df = max_df),
    MultinomialNB(),
)
add_cv_scores(MultinomialNB_clf, 'MultinomialNB')

CPU times: user 1min 7s, sys: 0 ns, total: 1min 7s
Wall time: 1min 7s


In [8]:
%%time

# TF-IDF features feeding a class-balanced decision tree capped at depth 30.
DecisionTreeClassifier_clf = make_pipeline(
    TfidfVectorizer(max_features = max_features, min_df = min_df, max_df = max_df),
    DecisionTreeClassifier(class_weight = 'balanced', max_depth = 30, random_state = 0),
)
add_cv_scores(DecisionTreeClassifier_clf, 'DecisionTreeClassifier')

CPU times: user 12min, sys: 0 ns, total: 12min
Wall time: 12min


In [9]:
%%time

# TF-IDF features feeding a class-balanced random forest with trees capped at depth 30.
RandomForestClassifier_clf = make_pipeline(
    TfidfVectorizer(max_features = max_features, min_df = min_df, max_df = max_df),
    RandomForestClassifier(class_weight = 'balanced', max_depth = 30, random_state = 0),
)
add_cv_scores(RandomForestClassifier_clf, 'RandomForestClassifier')

CPU times: user 15min 39s, sys: 0 ns, total: 15min 39s
Wall time: 15min 39s


In [10]:
%%time

# TF-IDF features feeding gradient boosting over 100 depth-1 trees with
# learning rate 1. No class weighting is passed to this model.
GradientBoostingClassifier_clf = make_pipeline(
    TfidfVectorizer(max_features = max_features, min_df = min_df, max_df = max_df),
    GradientBoostingClassifier(
        learning_rate = 1,
        max_depth = 1,
        n_estimators = 100,
        random_state = 0,
    ),
)
add_cv_scores(GradientBoostingClassifier_clf, 'GradientBoostingClassifier')

CPU times: user 25min 1s, sys: 23 s, total: 25min 24s
Wall time: 25min 24s


In [11]:
explore_df

Unnamed: 0,Accuracy Mean,Accuracy Std,Precision Mean,Precision Std,Recall Mean,Recall Std,F1 Mean,F1 Std,ROC AUC Mean,ROC AUC Std
LogisticRegression,0.858827,0.023928,0.892028,0.011713,0.858827,0.023928,0.867609,0.02125,0.95844,0.011654
LinearSVC,0.901155,0.020165,0.901207,0.016626,0.901155,0.020165,0.899385,0.018353,0.957589,0.012711
MultinomialNB,0.752642,0.028355,0.734901,0.034287,0.752642,0.028355,0.69588,0.019638,0.851762,0.035355
DecisionTreeClassifier,0.764467,0.018233,0.824317,0.008215,0.764467,0.018233,0.780744,0.014905,0.841253,0.013505
RandomForestClassifier,0.779836,0.020376,0.837518,0.011789,0.779836,0.020376,0.795185,0.016728,0.897352,0.011098
GradientBoostingClassifier,0.852693,0.016357,0.848308,0.014146,0.852693,0.016357,0.847533,0.014351,0.925532,0.014567


In [12]:
explore_df.round(3)

Unnamed: 0,Accuracy Mean,Accuracy Std,Precision Mean,Precision Std,Recall Mean,Recall Std,F1 Mean,F1 Std,ROC AUC Mean,ROC AUC Std
LogisticRegression,0.859,0.024,0.892,0.012,0.859,0.024,0.868,0.021,0.958,0.012
LinearSVC,0.901,0.02,0.901,0.017,0.901,0.02,0.899,0.018,0.958,0.013
MultinomialNB,0.753,0.028,0.735,0.034,0.753,0.028,0.696,0.02,0.852,0.035
DecisionTreeClassifier,0.764,0.018,0.824,0.008,0.764,0.018,0.781,0.015,0.841,0.014
RandomForestClassifier,0.78,0.02,0.838,0.012,0.78,0.02,0.795,0.017,0.897,0.011
GradientBoostingClassifier,0.853,0.016,0.848,0.014,0.853,0.016,0.848,0.014,0.926,0.015


In [5]:
# Hold-out split of the raw review texts and their labels (default split size, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df['review_text'],
    df['sentiment'],
    random_state = 0,
)

# The vectorizer is fitted on the training texts only; the test texts are
# projected onto that same vocabulary.
vectorizer = TfidfVectorizer(max_features = max_features, min_df = min_df, max_df = max_df)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [6]:
# Regularisation strengths swept for every LinearSVC configuration.
C = [0.01, 0.1, 1, 10, 100]

# Hold-out metrics table; fit_score_predict writes one row per fitted model.
cols = [
    'Accuracy',
    'Precision',
    'Recall',
    'F1 Score',
    'ROC AUC Score',
    'Log Loss',
]
svc_df = pd.DataFrame(columns = cols)

In [7]:
def fit_score_predict(clf, clf_name):
    """Fit ``clf`` on the TF-IDF training split and record its hold-out metrics.

    The classifier is trained on the module-level ``X_train_tfidf`` / ``y_train``
    and evaluated on ``X_test_tfidf`` / ``y_test``. Accuracy, weighted
    precision / recall / F1, weighted one-vs-rest ROC AUC and log loss are
    written to the row ``clf_name`` of the module-level ``svc_df``.

    Parameters
    ----------
    clf : estimator
        Must provide ``fit``, ``predict`` and ``predict_proba``.
    clf_name : str
        Row label used in ``svc_df``; an existing row with the same label is
        overwritten.
    """
    clf.fit(X_train_tfidf, y_train)
    predicted_labels = clf.predict(X_test_tfidf)
    predicted_probas = clf.predict_proba(X_test_tfidf)

    # Optional per-class diagnostics, left disabled:
    # print(confusion_matrix(y_test, predicted_labels))
    # print(classification_report(y_test, predicted_labels))

    weighted = {'average': 'weighted'}
    svc_df.loc[clf_name] = [
        accuracy_score(y_test, predicted_labels),
        precision_score(y_test, predicted_labels, **weighted),
        recall_score(y_test, predicted_labels, **weighted),
        f1_score(y_test, predicted_labels, **weighted),
        roc_auc_score(y_test, predicted_probas, multi_class = 'ovr', **weighted),
        log_loss(y_test, predicted_probas),
    ]

In [16]:
%%time

# Sweep C for LinearSVC with L2 penalty, squared hinge loss and no class weighting.
# The calibration wrapper supplies the predict_proba that fit_score_predict calls.
for c in C:
    LinearSVC_clf = CalibratedClassifierCV(
        LinearSVC(
            C = c,
            penalty = 'l2',
            loss = 'squared_hinge',
            class_weight = None,
            random_state = 0,
        )
    )
    fit_score_predict(LinearSVC_clf, 'L2 Squared {}'.format(c))

CPU times: user 20min 39s, sys: 928 ms, total: 20min 40s
Wall time: 20min 40s


In [17]:
%%time

# Sweep C for LinearSVC with L2 penalty, squared hinge loss and balanced class weights.
# The calibration wrapper supplies the predict_proba that fit_score_predict calls.
for c in C:
    LinearSVC_clf = CalibratedClassifierCV(
        LinearSVC(
            C = c,
            penalty = 'l2',
            loss = 'squared_hinge',
            class_weight = 'balanced',
            random_state = 0,
        )
    )
    fit_score_predict(LinearSVC_clf, 'L2 Squared Balanced {}'.format(c))

CPU times: user 25min 19s, sys: 1.14 s, total: 25min 20s
Wall time: 25min 20s


In [18]:
%%time

# Sweep C for LinearSVC with L2 penalty, plain hinge loss and no class weighting.
# The calibration wrapper supplies the predict_proba that fit_score_predict calls.
for c in C:
    LinearSVC_clf = CalibratedClassifierCV(
        LinearSVC(
            C = c,
            penalty = 'l2',
            loss = 'hinge',
            class_weight = None,
            random_state = 0,
        )
    )
    fit_score_predict(LinearSVC_clf, 'L2 Hinge {}'.format(c))

CPU times: user 7min 9s, sys: 1.24 s, total: 7min 10s
Wall time: 7min 10s


In [19]:
%%time

# Sweep C for LinearSVC with L2 penalty, plain hinge loss and balanced class weights.
# The calibration wrapper supplies the predict_proba that fit_score_predict calls.
for c in C:
    LinearSVC_clf = CalibratedClassifierCV(
        LinearSVC(
            C = c,
            penalty = 'l2',
            loss = 'hinge',
            class_weight = 'balanced',
            random_state = 0,
        )
    )
    fit_score_predict(LinearSVC_clf, 'L2 Hinge Balanced {}'.format(c))

CPU times: user 9min 49s, sys: 1.42 s, total: 9min 51s
Wall time: 9min 51s


In [20]:
%%time

# Sweep C for LinearSVC with L1 penalty, squared hinge loss (dual = False) and
# no class weighting.
# The calibration wrapper supplies the predict_proba that fit_score_predict calls.
for c in C:
    LinearSVC_clf = CalibratedClassifierCV(
        LinearSVC(
            C = c,
            penalty = 'l1',
            loss = 'squared_hinge',
            dual = False,
            class_weight = None,
            random_state = 0,
        )
    )
    fit_score_predict(LinearSVC_clf, 'L1 Squared {}'.format(c))

CPU times: user 6min 43s, sys: 4.49 s, total: 6min 47s
Wall time: 6min 47s


In [21]:
%%time

# Sweep C for LinearSVC with L1 penalty, squared hinge loss (dual = False) and
# balanced class weights.
# The calibration wrapper supplies the predict_proba that fit_score_predict calls.
for c in C:
    LinearSVC_clf = CalibratedClassifierCV(
        LinearSVC(
            C = c,
            penalty = 'l1',
            loss = 'squared_hinge',
            dual = False,
            class_weight = 'balanced',
            random_state = 0,
        )
    )
    fit_score_predict(LinearSVC_clf, 'L1 Squared Balanced {}'.format(c))

CPU times: user 6min 56s, sys: 4.67 s, total: 7min 1s
Wall time: 7min 1s


In [22]:
svc_df.sort_values(by = ['F1 Score'], ascending=False)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Log Loss
L2 Hinge 10,0.942156,0.941141,0.942156,0.941224,0.966317,0.28712
L2 Hinge 100,0.942156,0.941139,0.942156,0.941217,0.966209,0.287827
L2 Hinge 1,0.941396,0.940298,0.941396,0.940394,0.966154,0.286668
L2 Hinge Balanced 100,0.936127,0.935032,0.936127,0.9354,0.970227,0.283781
L2 Hinge Balanced 10,0.935623,0.93449,0.935623,0.934867,0.970254,0.283839
L2 Hinge Balanced 1,0.935321,0.934148,0.935321,0.934519,0.970238,0.283289
L1 Squared 1,0.934318,0.932852,0.934318,0.933105,0.970257,0.273963
L2 Squared 1,0.933873,0.932379,0.933873,0.932623,0.970127,0.274619
L1 Squared 10,0.933434,0.931928,0.933434,0.93219,0.969964,0.2753
L2 Squared 10,0.933401,0.931894,0.933401,0.932154,0.969961,0.275308


In [23]:
svc_df.sort_values(by = ['F1 Score'], ascending=False).round(3)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Log Loss
L2 Hinge 10,0.942,0.941,0.942,0.941,0.966,0.287
L2 Hinge 100,0.942,0.941,0.942,0.941,0.966,0.288
L2 Hinge 1,0.941,0.94,0.941,0.94,0.966,0.287
L2 Hinge Balanced 100,0.936,0.935,0.936,0.935,0.97,0.284
L2 Hinge Balanced 10,0.936,0.934,0.936,0.935,0.97,0.284
L2 Hinge Balanced 1,0.935,0.934,0.935,0.935,0.97,0.283
L1 Squared 1,0.934,0.933,0.934,0.933,0.97,0.274
L2 Squared 1,0.934,0.932,0.934,0.933,0.97,0.275
L1 Squared 10,0.933,0.932,0.933,0.932,0.97,0.275
L2 Squared 10,0.933,0.932,0.933,0.932,0.97,0.275


In [8]:
%%time

# Fit an uncalibrated LinearSVC (L2 penalty, hinge loss, C = 10, no class
# weighting) on the training split; its coef_ is inspected afterwards.
final_clf = LinearSVC(
    C = 10,
    penalty = 'l2',
    loss = 'hinge',
    class_weight = None,
    random_state = 0,
)
final_clf.fit(X_train_tfidf, y_train)

In [9]:
# Vocabulary terms in TF-IDF column order, so vocab[j] names column j of coefs.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions.
# Either way vocab is kept as a plain list of str.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

# Weight matrix of the fitted linear SVM.
coefs = final_clf.coef_

print(len(vocab))

4110

In [26]:
# For every class, print the `size` most positive and most negative feature weights.
cols = ['Largest', 'Weight (L)', 'Smallest', 'Weight (S)']
size = 10

# coef_ holds one row per class, except for a binary problem where the single
# row belongs to classes_[1]. Pair labels with rows rather than assuming
# exactly three classes.
class_labels = final_clf.classes_[1:] if len(coefs) == 1 else final_clf.classes_

for label, weights in zip(class_labels, coefs):
    print(label)

    # Sort once (ascending) and read both ends of the ordering.
    order = np.argsort(weights)
    small = order[:size]
    large = order[::-1][:size]

    # Build the table in one go: DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic.
    coef_df = pd.DataFrame({
        'Largest': [vocab[j] for j in large],
        'Weight (L)': weights[large],
        'Smallest': [vocab[j] for j in small],
        'Weight (S)': weights[small],
    }, columns = cols)

    print(coef_df.round(2))
    print()

Negative
     Largest  Weight (L)   Smallest  Weight (S)
0  kill       11.82       great     -12.99     
1  hell       9.30        love      -10.68     
2  bad        8.68        amazing   -9.97      
3  murder     8.31        fun       -9.60      
4  die        7.93        awesome   -9.18      
5  dead       7.68        good      -9.16      
6  cancer     7.32        friend    -7.08      
7  terrorist  7.16        win       -6.84      
8  war        7.10        perfectly -6.75      
9  ruin       7.01        beautiful -6.68      

Neutral
          Largest  Weight (L)   Smallest  Weight (S)
0  overlook        2.47        good      -10.98     
1  manner          2.19        play      -10.70     
2  topic           2.10        great     -9.68      
3  accuracy        2.05        fun       -9.30      
4  accomplishment  1.91        love      -8.98      
5  vibe            1.91        like      -8.36      
6  instance        1.84        amazing   -7.62      
7  royale          1.78       

In [10]:
def merge_with_original(clf):
    """Return the hold-out reviews that ``clf`` labels incorrectly.

    Joins the module-level ``X_test`` and ``y_test`` column-wise, adds a
    ``predicted`` column with ``clf.predict(X_test_tfidf)``, and keeps only
    the rows whose ``sentiment`` differs from ``predicted``.

    Parameters
    ----------
    clf : fitted estimator
        Must provide ``predict`` and accept ``X_test_tfidf``.

    Returns
    -------
    pandas.DataFrame
        The misclassified rows, with the text and ``sentiment`` columns plus
        ``predicted``.
    """
    labelled = pd.concat([X_test, y_test], axis = 1)
    labelled['predicted'] = clf.predict(X_test_tfidf)

    is_wrong = labelled['sentiment'] != labelled['predicted']
    return labelled.loc[is_wrong]

In [11]:
# Collect every test review the final model gets wrong, then show a few of them.
incorrects = merge_with_original(final_clf)

# Seeded so the displayed sample is the same on every run, and capped at the
# number of available rows so a small error set does not raise.
incorrects.sample(n = min(10, len(incorrects)), random_state = 0)

Unnamed: 0,review_text,sentiment,predicted
533496,decide remember good ending exact opposite flower want ll thank later bad pun send pun contest em unfortunately pun intendid,Negative,Positive
230356,opinion game simulator derangement course know machine strong game offensivei play battlefield frostbite high parameter play insurgencysource engine low parametersvalue game discountfor whatsorry terrible english,Positive,Negative
526594,play bit not wait play great game,Negative,Positive
60882,play duty game hour learn invaluable lesson life gun not kill people lag,Positive,Negative
408067,like game not know not log rocket league server,Positive,Neutral
12821,arma like arma apex buy expansion not play game lucky find stratis altis server not fps improvement mp server mod run fps low play game high setting fps sp,Negative,Positive
188217,not wrong,Positive,Negative
49834,smooth gameplay pair story shake twist bend mind solid,Neutral,Positive
557683,play tutorial preclude review rule not care bloody boring battle system dull good feature ability flee attempt humor pathetic laugh play papyrus intro summarise boring intro combat unfunny joke yes understand papyrussans thing lead refund game hour,Positive,Negative
321270,game logic wacke buy cause not worth crappn dlcs steamwhy,Negative,Positive


In [19]:
# Index labels of three misclassified reviews to inspect by hand.
ids = [186915, 327265, 143740]

# Build the lookup once: set membership is O(1), while testing against the
# vocabulary list scans it for every word.
vocab_set = set(vocab)

for i in ids:
    # One .loc call with row and column, instead of chained indexing.
    sen = incorrects.loc[i, 'review_text']
    print(sen)

    # Words of the review that survived into the TF-IDF vocabulary, in order.
    exists = [word for word in sen.split() if word in vocab_set]
    print(exists)
    print()

love game hate update mac hater not hate time gmod decide update not install basically half time log play end able play
['love', 'game', 'hate', 'update', 'mac', 'hater', 'not', 'hate', 'time', 'gmod', 'decide', 'update', 'not', 'install', 'basically', 'half', 'time', 'log', 'play', 'end', 'able', 'play']

not walk bank like normal person anymore
['not', 'walk', 'bank', 'like', 'normal', 'person', 'anymore']

despite rage game well nicholas cage
['despite', 'rage', 'game', 'well']

