In [2]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer


In [3]:
import os
import numpy as np

import warnings

import sklearn.linear_model
import sklearn.metrics

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn') # pretty matplotlib plots

x_train_df = pd.read_csv('../data/data_reviews/x_train.csv')
y_train_df = pd.read_csv('../data/data_reviews/y_train.csv')
x_test_df = pd.read_csv('../data/data_reviews/x_test.csv')

In [4]:
x_train = x_train_df['text'] 
x_test = x_test_df['text']
x_train

0       Oh and I forgot to also mention the weird colo...
1                            THAT one didn't work either.
2                                      Waste of 13 bucks.
3       Product is useless, since it does not have eno...
4       None of the three sizes they sent with the hea...
                              ...                        
2395    The sweet potato fries were very good and seas...
2396    I could eat their bruschetta all day it is dev...
2397                                 Ambience is perfect.
2398    We ordered the duck rare and it was pink and t...
2399         Service was good and the company was better!
Name: text, Length: 2400, dtype: object

In [5]:
amazon_i = np.where(x_train_df['website_name']=='amazon')
imdb_i = np.where(x_train_df['website_name']=='imdb')
yelp_i =  np.where(x_train_df['website_name']=='yelp')

## Preprocess

In [6]:
import pandas as pd
import numpy as np
import sklearn.neighbors

from collections import OrderedDict

word_embeddings = pd.read_csv('../data/pretrained_word_embeddings/glove.6B.50d.txt.zip',
                               header=None, sep=' ', index_col=0,
                               nrows=100000, compression='zip', encoding='utf-8', quoting=3)
# Build a dict that will map from string word to 50-dim vector
word_list = word_embeddings.index.values.tolist()
word2vec = OrderedDict(zip(word_list, word_embeddings.values))

## Show some examples


n_words = len(word2vec.keys())
# print("word2vec['happy'] = ")
# print(word2vec['happy'])

# print("word2vec['good'] = ")
# print(word2vec['good'])

## Try some analogies
def analogy_lookup(a1, a2, b1):
    target_vec = word2vec[a2] - word2vec[a1] + word2vec[b1]
    knn = sklearn.neighbors.NearestNeighbors(n_neighbors=7, metric='euclidean', algorithm='brute')
    knn.fit(word_embeddings.values)
    dists, indices = knn.kneighbors(target_vec[np.newaxis,:])
    print("Query: %s:%s -> %s:____" % (a1, a2, b1))
    for ii, vv in enumerate(indices[0]):
        print("   %20s  at dist %.3f" % (word_list[vv], dists[0,ii]))

# analogy_lookup('england', 'london', 'france')
# analogy_lookup('england', 'london', 'germany')
# analogy_lookup('england', 'london', 'japan')
# analogy_lookup('england', 'london', 'indonesia')

# analogy_lookup('swim', 'swimming', 'run')


analogy_lookup('movie', 'film', 'theater')

Query: movie:film -> theater:____
                theatre  at dist 1.977
                theater  at dist 2.158
                 cinema  at dist 3.639
                  opera  at dist 3.654
                 ballet  at dist 3.729
               ensemble  at dist 3.817
                 studio  at dist 3.967


In [7]:
import re

def simple_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z]", " ", str_input).lower().split()   
    return words

In [8]:
words = [simple_tokenizer(x_train[i]) for i in range(len(x_train))]

flat_list = sum(words, [])
len(flat_list)

28937

## Taking the Average

In [9]:
keys = word2vec.keys()

In [19]:
x_tr = []

for x in range(len(words)):
    for i in range(len(words[x])):
        filtered = []
        if (words[x][i] in keys):
            filtered.append(words[x][i])
        t = [word2vec[w] for w in filtered]
    x_tr.append(np.average(t, axis=1))
x = np.array(x_tr, dtype=object)

AxisError: axis 1 is out of bounds for array of dimension 1

**Note: tense, persons**

# Logistic Regression #

In [16]:
X = x
y = y_train_df['is_positive_sentiment'].to_numpy()
feat_num = X.shape

X.shape

(2400,)

In [27]:
import sklearn.metrics

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score

### CV without best C

In [29]:
n_runs = 16
tr_classifierLOG_SGD = list()

for i in range(n_runs):
    start_time_sec = time.time()
    mlp_sgd = MLPClassifier(
        hidden_layer_sizes=[2],
        activation='logistic',
        alpha=0.0001,
        max_iter=400, tol=1e-8,
        random_state=i,
        solver='sgd', batch_size=10,
        learning_rate='adaptive', learning_rate_init=0.1, momentum=0.0,
        )
    with warnings.catch_warnings(record=True) as warn_list:
        clf = mlp_sgd.fit(X, y)
    mlp_sgd.did_converge = True if len(warn_list) == 0 else False
    elapsed_time_sec = time.time() - start_time_sec
    print('finished SGD run %2d/%d after %6.1f sec | %3d epochs | %s | loss %.3f' % (
        i+1, n_runs, elapsed_time_sec,
        len(mlp_sgd.loss_curve_),
            'converged    ' if mlp_sgd.did_converge else 'NOT converged',
            mlp_sgd.loss_))

    tr_classifierLOG_SGD.append(clf)

finished SGD run  1/16 after   30.0 sec | 400 epochs | NOT converged | loss 0.023
finished SGD run  2/16 after   30.9 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run  3/16 after   32.0 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run  4/16 after   32.5 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run  5/16 after   30.5 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run  6/16 after   32.1 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run  7/16 after   31.3 sec | 400 epochs | NOT converged | loss 0.021
finished SGD run  8/16 after   31.6 sec | 400 epochs | NOT converged | loss 0.023
finished SGD run  9/16 after   30.8 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run 10/16 after   30.7 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run 11/16 after   31.0 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run 12/16 after   32.4 sec | 400 epochs | NOT converged | loss 0.023
finished SGD run

In [None]:
print("-----------------\nClassify with base data, 5 folds\n-----------------")

k = 3
kfold = KFold(n_splits=k)
train_scores = []
test_scores = []
# model = LogisticRegression()

for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X[train_idx,:], X[test_idx,:]
    y_train, y_test = y[train_idx], y[test_idx]
    
    
    mlp_sgd = MLPClassifier(
            hidden_layer_sizes=[2],
            activation='logistic',
            alpha=0.0001,
            max_iter=400, tol=1e-8,
            random_state=i,
            solver='sgd', batch_size=10,
            learning_rate='adaptive', learning_rate_init=0.1, momentum=0.0,
            )
        with warnings.catch_warnings(record=True) as warn_list:
            clf = mlp_sgd.fit(X_train, y_train)
            pred_train = clf.predict_proba(X_train)
            pred_test = clf.predict_proba(X_test)
            
            score_train = clf.score(X_train, y_train)
            score_test = clf.score(X_test, y_test)
        mlp_sgd.did_converge = True if len(warn_list) == 0 else False
        elapsed_time_sec = time.time() - start_time_sec
        print('finished SGD run after %6.1f sec | %3d epochs | %s | loss %.3f' % (
              elapsed_time_sec,
            len(mlp_sgd.loss_curve_),
                'converged    ' if mlp_sgd.did_converge else 'NOT converged',
                mlp_sgd.loss_))

        tr_classifierLOG_SGD.append(clf)
    

 
    
    train_scores.append(score_train)
    test_scores.append(score_test)
    
print("\nAverage train accuracy: ", np.average(score_train))
print("Average test accuracy: ", np.average(score_test))


    n_runs = 16
    tr_classifierLOG_SGD = list()

    for i in range(n_runs):
        start_time_sec = time.time()
        mlp_sgd = MLPClassifier(
            hidden_layer_sizes=[2],
            activation='logistic',
            alpha=0.0001,
            max_iter=400, tol=1e-8,
            random_state=i,
            solver='sgd', batch_size=10,
            learning_rate='adaptive', learning_rate_init=0.1, momentum=0.0,
            )
        with warnings.catch_warnings(record=True) as warn_list:
            clf = mlp_sgd.fit(X, y)
        mlp_sgd.did_converge = True if len(warn_list) == 0 else False
        elapsed_time_sec = time.time() - start_time_sec
        print('finished SGD run %2d/%d after %6.1f sec | %3d epochs | %s | loss %.3f' % (
            i+1, n_runs, elapsed_time_sec,
            len(mlp_sgd.loss_curve_),
                'converged    ' if mlp_sgd.did_converge else 'NOT converged',
                mlp_sgd.loss_))

        tr_classifierLOG_SGD.append(clf)

### CV with the best C

In [None]:
C_grid = np.logspace(-9, 6, 31)
model_list = []
aver_train_score = []
aver_test_score = []
aver_train_loss = []
aver_test_loss = []


for C in C_grid:
    k = 3
    kfold = KFold(n_splits=k)
    
    train_scores = []
    test_scores = []
    train_loss = []
    test_loss = []

    model = sklearn.linear_model.LogisticRegression(C=C,solver='liblinear')
    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx,:], X[test_idx,:]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        pred_train = model.predict_proba(X_train)
        pred_test = model.predict_proba(X_test)

        # Score
        score_train = model.score(X_train, y_train)
        score_test = model.score(X_test, y_test)
#         print("Train score: ", score_train)
#         print("Test score: ", score_test)
        train_scores.append(score_train)
        test_scores.append(score_test)
        
        # Log loss
        log_loss_train = sklearn.metrics.log_loss(y_train,pred_train)
        log_loss_test = sklearn.metrics.log_loss(y_test,pred_test)
#         print("Train loss: ", log_loss_train)
#         print("Test loss: ", log_loss_test)
        train_loss.append(log_loss_train)
        test_loss.append(log_loss_test)
        
        
    print("\nFor C value : ", C)
    print("\nAverage train accuracy: ", np.average(score_train))
    print("Average test accuracy: ", np.average(score_test))
    print("\nAverage train loss: ", np.average(train_loss))
    print("Average test loss: ", np.average(test_loss))
    
    print('------------------------------------------------\n')
    
    model_list.append(model)
    aver_train_score.append(np.average(score_train))
    aver_test_score.append(np.average(score_test))
    aver_train_loss.append(np.average(train_loss))
    aver_test_loss.append(np.average(test_loss))

In [None]:
# print out the results in clear tabular format
pd.DataFrame(np.transpose([aver_train_score, aver_test_score, aver_train_loss, aver_test_loss]), columns=['train accuracy', 'test accuracy', 'train loss', 'test loss'])

#### Best Log loss

In [None]:
min_loss = min(aver_test_loss)
index_N2 = aver_test_loss.index(min_loss)
best_C =  C_grid[index_N2]
best_model = model_list[index_N2]

#### stability across Kfolds

In [None]:
print("-----------------\nClassify with base data, 5 folds\n-----------------")

K = [2,3,4,5,6,7,8,9,10,11,12]
K_train_loss = []
K_test_loss = []
for k in K:
    kfold = KFold(n_splits=k)
    train_scores = []
    test_scores = []
    train_loss = []
    test_loss = []
    
    for train_idx, test_idx in kfold.split(X):
        shuffler = np.random.permutation(len(X))
        X_shuffled = X[shuffler]
        y_shuffled = y[shuffler]
        X_train, X_test = X_shuffled[train_idx,:], X_shuffled[test_idx,:]
        y_train, y_test = y_shuffled[train_idx], y_shuffled[test_idx]

        model.fit(X_train, y_train)
        pred_train = best_model.predict_proba(X_train)
        pred_test = best_model.predict_proba(X_test)

        score_train = best_model.score(X_train, y_train)
        score_test = best_model.score(X_test, y_test)
        train_scores.append(score_train)
        test_scores.append(score_test)
        
        log_loss_train = sklearn.metrics.log_loss(y_train,pred_train)
        log_loss_test = sklearn.metrics.log_loss(y_test,pred_test)
        
        train_loss.append(log_loss_train)
        test_loss.append(log_loss_test)

    print("\nAverage train accuracy: ", np.average(score_train))
    print("Average test accuracy: ", np.average(score_test))
    print("Average train loss: ", np.average(train_loss))
    print("Average test loss: ", np.average(test_loss))
    
    K_train_loss.append(np.average(train_loss))
    K_test_loss.append(np.average(test_loss))

In [None]:
plt.xlabel('K from 2 to 12');
plt.ylabel('logistic loss');

sns.lineplot(x = K, y = K_train_loss, label = "Train Loss", color = "red", marker='o')
sns.lineplot(x = K, y = K_test_loss, label = "Test Loss", color = "blue", marker='o')

# show a legend on the plot 
plt.legend() 
plt.title('Log loss across K values')
plt.show()


print('standard deviation for training set: %.3f  ' %np.std(K_train_loss))
print('standard deviation for testing set: %.3f  ' %np.std(K_test_loss))

In [None]:
plt.xlabel('log10(C)');
plt.ylabel('logistic loss');
plt.ylim([0.0, 1]);

sns.lineplot(x = np.log10(C_grid), y = aver_train_loss, label = "Train Loss", color = "red", marker='o')
sns.lineplot(x = np.log10(C_grid), y = aver_test_loss,label = "Test Loss", color = "blue", marker='o')

# show a legend on the plot 
plt.legend() 
plt.title('Logistic loss on C-grid')
plt.show()

print("Best C-value for LR: %.3f" % best_C) 
print("Test set log-loss at best C-value: %.4f" % min_loss)

#### Result

In [None]:
yproba1_test = best_model.predict_proba(x_te)[:, 1] 
np.savetxt('yproba1_test.txt', yproba1_test)

# Neuronetwork

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
n_runs = 16
tr_classifierLBFS = list()

for i in range(n_runs):
    start_time_sec = time.time()
    mlp_lbfgs = MLPClassifier(
        hidden_layer_sizes=[2],
        activation='relu',
        alpha=0.0001,
        max_iter=200, tol=1e-6,
        random_state=i,
        )
    with warnings.catch_warnings(record=True) as warn_list:
        clf = mlp_lbfgs.fit(x_tr_N2, y_tr_N)
    elapsed_time_sec = time.time() - start_time_sec
    print('finished LBFGS run %2d/%d after %6.1f sec | %3d iters | %s | loss %.3f' % (
        i+1, n_runs, elapsed_time_sec,
        len(mlp_lbfgs.loss_curve_),
        'converged   ' if mlp_lbfgs.did_converge else 'NOT converged',
        mlp_lbfgs.loss_))
        
    tr_classifierLBFS.append(clf)