In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
import markovify
import pickle
import itertools

In [2]:
amazon_raw = pd.read_csv('train.csv', header=None)
amazon = amazon_raw.iloc[0:100000,:]
amazon.columns = ['polarity', 'title', 'text']
amazon.polarity.replace({1:0, 2:1}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [3]:
amazon.isna().sum()

polarity    0
title       3
text        0
dtype: int64

In [4]:
ecom_raw = pd.read_csv('ecom_reviews.csv')
ecom = ecom_raw.iloc[:,4:6]
ecom.columns = ['text', 'rating']
#ecom['polarity'] = ecom.rating.map(lambda x: 0 if x < 3 else 1 if x > 3 else 3)
ecom['polarity'] = [0 if x < 3 else 1 if x > 3 else 3 for x in ecom.rating]
ecom_unfiltered = ecom

In [5]:
ecom = ecom[ecom['text'].notna()]
ecom = ecom.iloc[:, [2,0]]
ecom = ecom[ecom.polarity.isin([0,1])]
ecom.polarity.value_counts()

1    17448
0     2370
Name: polarity, dtype: int64

In [6]:
ecom.head(10)

Unnamed: 0,polarity,text
0,1,Absolutely wonderful - silky and sexy and comf...
1,1,Love this dress! it's sooo pretty. i happene...
3,1,"I love, love, love this jumpsuit. it's fun, fl..."
4,1,This shirt is very flattering to all due to th...
5,0,"I love tracy reese dresses, but this one is no..."
6,1,I aded this in my basket at hte last mintue to...
7,1,"I ordered this in carbon for store pick up, an..."
8,1,I love this dress. i usually get an xs but it ...
9,1,"I'm 5""5' and 125 lbs. i ordered the s petite t..."
11,1,This dress is perfection! so pretty and flatte...


In [7]:
def naive_bayes(training_data = amazon, test_data = ecom):
    # Data
    X_train = training_data['text']
    y_train = training_data['polarity']
    X_test = test_data['text']  
    
    # Pipeline
    pipe = Pipeline([('cv', CountVectorizer()),
                     ('clf', MultinomialNB())], verbose = True)
    # Fit training data
    pipe.fit(X_train, y_train)
        
    # Predict labels for test data
    y_pred = pipe.predict(X_test)
    
    return y_pred

# Print the result
y_test = ecom['polarity']
print(classification_report(y_test, naive_bayes()))

[Pipeline] ................ (step 1 of 2) Processing cv, total=   4.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
              precision    recall  f1-score   support

           0       0.33      0.90      0.48      2370
           1       0.98      0.75      0.85     17448

    accuracy                           0.77     19818
   macro avg       0.65      0.82      0.66     19818
weighted avg       0.90      0.77      0.80     19818



In [8]:
17448/ecom.shape[0]

0.8804117468967605

In [9]:
print(classification_report(y_test, naive_bayes(amazon[0:1000])))

[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
              precision    recall  f1-score   support

           0       0.30      0.76      0.44      2370
           1       0.96      0.76      0.85     17448

    accuracy                           0.76     19818
   macro avg       0.63      0.76      0.64     19818
weighted avg       0.88      0.76      0.80     19818



In [10]:
#amazon_text
amazon_rated0 = amazon[amazon.polarity == 0]['text']
text_r0 = '\n'.join([i for i in amazon_rated0.apply(str)])


In [11]:
amazon_rated1 = amazon[amazon.polarity == 1]['text']
text_r1 = '\n'.join([i for i in amazon_rated1.apply(str)])


In [12]:
amazon_r0 = markovify.NewlineText(text_r0)
amazon_r0.compile(inplace = True)

<markovify.text.NewlineText at 0x2668f3a87c0>

In [13]:
amazon_r1 = markovify.NewlineText(text_r1)
amazon_r1.compile(inplace = True)

<markovify.text.NewlineText at 0x2668f3a8b80>

In [114]:
# pickle.dump(amazon_r0, open('amazon_r0.pkl', 'wb'))
# pickle.dump(amazon_r1, open('amazon_r1.pkl', 'wb'))

In [14]:
amazon_r0_synth = [amazon_r0.make_sentence() for i in range(1000)]

In [15]:
amazon_r1_synth = [amazon_r1.make_sentence() for i in range(1000)]

In [16]:
amazon_r0_synth_df = pd.DataFrame({'polarity' : itertools.repeat(0, 1000), 'text' : amazon_r0_synth})
amazon_r1_synth_df = pd.DataFrame({'polarity' : itertools.repeat(1, 1000), 'text' : amazon_r1_synth})

In [17]:
amazon_synth = pd.concat([amazon_r0_synth_df, amazon_r1_synth_df])

In [18]:
amazon_synth.shape

(2000, 2)

In [19]:
y_test = ecom['polarity']
print(classification_report(y_test, naive_bayes(training_data = amazon_synth)))

[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
              precision    recall  f1-score   support

           0       0.30      0.86      0.44      2370
           1       0.97      0.73      0.83     17448

    accuracy                           0.74     19818
   macro avg       0.64      0.79      0.64     19818
weighted avg       0.89      0.74      0.79     19818



In [20]:
from sklearn import svm

# params = {
#     'cv__binary' : (False, True),
#     'cv__ngram_range' : ((1, 1), (1, 2)),
#     'clf__C' : (1, 0.1),
#     'clf__gamma' : ('scale', 'auto')
# }

def classify(classifier, training_data = amazon_synth, test_data = ecom):
    # Data
    X_train = training_data['text']
    y_train = training_data['polarity']
    X_test = test_data['text']  
    
    # Pipeline
    pipe = Pipeline([('cv', CountVectorizer()),
                     ('clf', classifier)], verbose = True)
    # Fit training data
    pipe.fit(X_train, y_train)
        
    # Predict labels for test data
    y_pred = pipe.predict(X_test)
    
    return y_pred

In [20]:
svm_synth = svm.SVC()
y_pred_svm = classify(svm_synth)
print(classification_report(y_test, y_pred_svm))

[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   1.4s
              precision    recall  f1-score   support

           0       0.25      0.76      0.38      2370
           1       0.95      0.69      0.80     17448

    accuracy                           0.70     19818
   macro avg       0.60      0.72      0.59     19818
weighted avg       0.87      0.70      0.75     19818



In [None]:
svm_org = svm.SVC()
y_pred_svm = classify(classifier = svm_org, training_data = amazon[0:50000])
print(classification_report(y_test, y_pred_svm))

[Pipeline] ................ (step 1 of 2) Processing cv, total=   2.7s


In [30]:
def classify(classifier, params, training_data=amazon_synth, test_data=ecom):
    
    X_train = training_data['text']
    y_train = training_data['polarity']
    X_test = test_data['text']  
        
    pipe = Pipeline(
        [
            ('cv', CountVectorizer()),
            ('clf', classifier)
        ]
    )

    grid_search = GridSearchCV(pipe, params, cv = 3, verbose = 3)
    grid_search.fit(X_train, y_train)
    
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    return grid_search.predict(X_test)

In [36]:
# neural net
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

params = {
    'cv__binary' : (True,),
    'cv__ngram_range' : ((1, 1),),
    'clf__activation' : ('tanh',),
    'clf__hidden_layer_sizes' : ((200, 200),),
    'clf__learning_rate_init' : (0.002,)
}

mlp = MLPClassifier(max_iter = 800)
y_pred_mlp = classify(mlp, params, training_data = amazon[0:5000])
print(classification_report(y_test, y_pred_mlp))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END clf__activation=tanh, clf__hidden_layer_sizes=(200, 200), clf__learning_rate_init=0.002, cv__binary=True, cv__ngram_range=(1, 1);, score=0.793 total time=  37.9s
[CV 2/3] END clf__activation=tanh, clf__hidden_layer_sizes=(200, 200), clf__learning_rate_init=0.002, cv__binary=True, cv__ngram_range=(1, 1);, score=0.785 total time=  38.4s
[CV 3/3] END clf__activation=tanh, clf__hidden_layer_sizes=(200, 200), clf__learning_rate_init=0.002, cv__binary=True, cv__ngram_range=(1, 1);, score=0.791 total time=  41.0s
	clf__activation: 'tanh'
	clf__hidden_layer_sizes: (200, 200)
	clf__learning_rate_init: 0.002
	cv__binary: True
	cv__ngram_range: (1, 1)
              precision    recall  f1-score   support

           0       0.27      0.79      0.40      2370
           1       0.96      0.70      0.81     17448

    accuracy                           0.72     19818
   macro avg       0.61      0.75      0.61     19818
weighte

In [34]:
params = {
    'cv__binary' : (True, False,),
    'cv__ngram_range' : ((1, 1),),
    'clf__activation' : ('tanh',),
    'clf__hidden_layer_sizes' : ((200, 200),),
    'clf__learning_rate_init' : (0.002,)
}

mlp_synth = MLPClassifier(max_iter = 500)
y_pred_mlp_synth = classify(mlp, params)
print(classification_report(y_test, y_pred_mlp_synth))

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3] END clf__activation=tanh, clf__hidden_layer_sizes=(200, 200), clf__learning_rate_init=0.002, cv__binary=True, cv__ngram_range=(1, 1);, score=0.820 total time=  11.3s
[CV 2/3] END clf__activation=tanh, clf__hidden_layer_sizes=(200, 200), clf__learning_rate_init=0.002, cv__binary=True, cv__ngram_range=(1, 1);, score=0.786 total time=  11.4s
[CV 3/3] END clf__activation=tanh, clf__hidden_layer_sizes=(200, 200), clf__learning_rate_init=0.002, cv__binary=True, cv__ngram_range=(1, 1);, score=0.791 total time=  11.0s
[CV 1/3] END clf__activation=tanh, clf__hidden_layer_sizes=(200, 200), clf__learning_rate_init=0.002, cv__binary=False, cv__ngram_range=(1, 1);, score=0.784 total time=  11.2s
[CV 2/3] END clf__activation=tanh, clf__hidden_layer_sizes=(200, 200), clf__learning_rate_init=0.002, cv__binary=False, cv__ngram_range=(1, 1);, score=0.814 total time=  13.5s
[CV 3/3] END clf__activation=tanh, clf__hidden_layer_sizes=(200,