In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer


In [2]:
import os
import numpy as np

import warnings

import sklearn.linear_model
import sklearn.metrics

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn') # pretty matplotlib plots

x_train_df = pd.read_csv('../data/data_reviews/x_train.csv')
y_train_df = pd.read_csv('../data/data_reviews/y_train.csv')
x_test_df = pd.read_csv('../data/data_reviews/x_test.csv')

In [3]:
x_train = x_train_df['text'] 
x_test = x_test_df['text']
x_test

0      Technically, the film is well made with impres...
1      !....THE OWNERS REALLY REALLY need to quit bei...
2                                  what a disappointment
3              The movie is terribly boring in places.  
4      One of the best mexican movies ever!, and one ...
                             ...                        
595      This is a great restaurant at the Mandalay Bay.
596    I could care less... The interior is just beau...
597    The only consistent thread holding the series ...
598    My side Greek salad with the Greek dressing wa...
599    However, my recent experience at this particul...
Name: text, Length: 600, dtype: object

In [4]:
amazon_i = np.where(x_train_df['website_name']=='amazon')
imdb_i = np.where(x_train_df['website_name']=='imdb')
yelp_i =  np.where(x_train_df['website_name']=='yelp')

## Preprocess

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

#### simple processing

In [None]:
count_vectorizer = CountVectorizer()
x = count_vectorizer.fit_transform(x_train)
print('Num of feat: ', len(x.toarray()[0]))
print(count_vectorizer.get_feature_names())
print(x.toarray())
pd.DataFrame(x.toarray(), columns=count_vectorizer.get_feature_names())

### Build your own tokenizer ### 

In [8]:
import re

def simple_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z]", " ", str_input).lower().split()
    
#     def prune_food(w):
#         if w == 'bones' or w == 'bone' or w == 'fish' or w == 'worms' or w == 'worm':
#             w = 'food'
#         return w
    
#     words = [prune_food(word) for word in words]
    
    return words

In [None]:
count_vectorizer = CountVectorizer(stop_words='english',tokenizer=simple_tokenizer)
x = count_vectorizer.fit_transform(x_train)
print('Num of feat: ', len(x.toarray()[0]))
print(count_vectorizer.get_feature_names())
print(x.toarray())
pd.DataFrame(x.toarray(), columns=count_vectorizer.get_feature_names())

In [13]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [14]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
porterstemmer = PorterStemmer()
wnl = WordNetLemmatizer()

In [15]:
import nltk
nltk.download()
# nltk.tag.pos_tag("I am named John Doe".split())

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


KeyboardInterrupt: 

In [16]:
# same stem 
print(porterstemmer.stem('business'))
print(porterstemmer.stem('bus'))
print(porterstemmer.stem('businesses'))
print(porterstemmer.stem('busy'))

busi
bu
busi
busi


# STEMMING

In [17]:
# from nltk.corpus import stopwords
import re
def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z\-]", " ", str_input).lower().split()

    # prune words
#     def prune_food(w):
#         if w == 'bones' or w == 'bone' or w == 'fish' or w == 'worms' or w == 'worm':
#             w = 'food'
#         return w
    
#     words = [prune_food(word) for word in words]
    
    # stemming 
    porter_stemmer = PorterStemmer()
    words = [porter_stemmer.stem(word) for word in words]
    
    #remove non important words
    non_important = ['film', 'movie','apple', 'juice' ]
    words = [w for w in words if w not in non_important]
    
    #     stops = stopwords.words('english')
    #     words = [w for w in words if w not in stops]
    
    return words

**Note: tense, persons**

In [18]:
count_vectorizer = CountVectorizer(tokenizer=stemming_tokenizer)
x = count_vectorizer.fit_transform(x_train)
x_te = count_vectorizer.transform(x_test)
features = count_vectorizer.get_feature_names()

print('train set shape', x.shape)
print('test set shape', x_te.shape)


train set shape (2400, 3620)
test set shape (600, 3620)


In [19]:
print(count_vectorizer.get_feature_names())

['-', '--', '-drink', '-good', '-mi', '-year', 'a', 'abandon', 'abhor', 'abil', 'abl', 'abound', 'about', 'abov', 'abroad', 'absolut', 'absolutel', 'absolutley', 'abstrus', 'abysm', 'ac', 'academi', 'accent', 'accept', 'access', 'accessoryon', 'accid', 'accident', 'acclaim', 'accolad', 'accomod', 'accompani', 'accur', 'accus', 'ach', 'achiev', 'achil', 'ackerman', 'acknowledg', 'act', 'acting--even', 'acting-wis', 'action', 'activ', 'actor', 'actress', 'actual', 'ad', 'adapt', 'add', 'addit', 'adhes', 'admin', 'admit', 'ador', 'adrift', 'adventur', 'advertis', 'advis', 'aerial', 'aesthet', 'affect', 'affleck', 'afford', 'afraid', 'africa', 'after', 'afternoon', 'again', 'against', 'age', 'ago', 'agre', 'ahead', 'aimless', 'air', 'airlin', 'akin', 'ala', 'alarm', 'albondiga', 'alexand', 'alik', 'all', 'all-star', 'allot', 'allow', 'almond', 'almost', 'alon', 'along', 'alongsid', 'alot', 'alreadi', 'also', 'although', 'aluminum', 'alway', 'am', 'amateurish', 'amaz', 'amazingli', 'amazon'

In [20]:
pd.DataFrame(x.toarray(), columns=count_vectorizer.get_feature_names())

Unnamed: 0,-,--,-drink,-good,-mi,-year,a,abandon,abhor,abil,...,youtub,yucki,yukon,yum,yummi,yun,z,zero,zillion,zombi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2398,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TF/IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
tf_vectorizer = TfidfVectorizer(tokenizer=stemming_tokenizer, use_idf = True)
x = tf_vectorizer.fit_transform(x_train)
x_te = tf_vectorizer.transform(x_test)
features = tf_vectorizer.get_feature_names()
len(features)

3620

# Neuronetwork #

In [25]:
import time
from sklearn.neural_network import MLPClassifier

from matplotlib import pyplot as plt
import seaborn as sns

# from MLPClassifierWithSolverLBFGS import MLPClassifierLBFGS

# from viz_tools_for_binary_classifier import plot_pretty_probabilities_for_clf

%matplotlib inline

In [26]:
X = x.toarray()
y = y_train_df['is_positive_sentiment'].to_numpy()
feat_num = X.shape[1]

In [27]:
import sklearn.metrics

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score

### CV without best C

In [29]:
n_runs = 16
tr_classifierLOG_SGD = list()

for i in range(n_runs):
    start_time_sec = time.time()
    mlp_sgd = MLPClassifier(
        hidden_layer_sizes=[2],
        activation='logistic',
        alpha=0.0001,
        max_iter=400, tol=1e-8,
        random_state=i,
        solver='sgd', batch_size=10,
        learning_rate='adaptive', learning_rate_init=0.1, momentum=0.0,
        )
    with warnings.catch_warnings(record=True) as warn_list:
        clf = mlp_sgd.fit(X, y)
    mlp_sgd.did_converge = True if len(warn_list) == 0 else False
    elapsed_time_sec = time.time() - start_time_sec
    print('finished SGD run %2d/%d after %6.1f sec | %3d epochs | %s | loss %.3f' % (
        i+1, n_runs, elapsed_time_sec,
        len(mlp_sgd.loss_curve_),
            'converged    ' if mlp_sgd.did_converge else 'NOT converged',
            mlp_sgd.loss_))

    tr_classifierLOG_SGD.append(clf)

finished SGD run  1/16 after   30.0 sec | 400 epochs | NOT converged | loss 0.023
finished SGD run  2/16 after   30.9 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run  3/16 after   32.0 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run  4/16 after   32.5 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run  5/16 after   30.5 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run  6/16 after   32.1 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run  7/16 after   31.3 sec | 400 epochs | NOT converged | loss 0.021
finished SGD run  8/16 after   31.6 sec | 400 epochs | NOT converged | loss 0.023
finished SGD run  9/16 after   30.8 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run 10/16 after   30.7 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run 11/16 after   31.0 sec | 400 epochs | NOT converged | loss 0.022
finished SGD run 12/16 after   32.4 sec | 400 epochs | NOT converged | loss 0.023
finished SGD run

In [None]:
print("-----------------\nClassify with base data, 5 folds\n-----------------")

k = 3
kfold = KFold(n_splits=k)
train_scores = []
test_scores = []
# model = LogisticRegression()

for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X[train_idx,:], X[test_idx,:]
    y_train, y_test = y[train_idx], y[test_idx]
    
    
    mlp_sgd = MLPClassifier(
            hidden_layer_sizes=[2],
            activation='logistic',
            alpha=0.0001,
            max_iter=400, tol=1e-8,
            random_state=i,
            solver='sgd', batch_size=10,
            learning_rate='adaptive', learning_rate_init=0.1, momentum=0.0,
            )
        with warnings.catch_warnings(record=True) as warn_list:
            clf = mlp_sgd.fit(X_train, y_train)
            pred_train = clf.predict_proba(X_train)
            pred_test = clf.predict_proba(X_test)
            
            score_train = clf.score(X_train, y_train)
            score_test = clf.score(X_test, y_test)
        mlp_sgd.did_converge = True if len(warn_list) == 0 else False
        elapsed_time_sec = time.time() - start_time_sec
        print('finished SGD run after %6.1f sec | %3d epochs | %s | loss %.3f' % (
              elapsed_time_sec,
            len(mlp_sgd.loss_curve_),
                'converged    ' if mlp_sgd.did_converge else 'NOT converged',
                mlp_sgd.loss_))

        tr_classifierLOG_SGD.append(clf)
    

 
    
    train_scores.append(score_train)
    test_scores.append(score_test)
    
print("\nAverage train accuracy: ", np.average(score_train))
print("Average test accuracy: ", np.average(score_test))


    n_runs = 16
    tr_classifierLOG_SGD = list()

    for i in range(n_runs):
        start_time_sec = time.time()
        mlp_sgd = MLPClassifier(
            hidden_layer_sizes=[2],
            activation='logistic',
            alpha=0.0001,
            max_iter=400, tol=1e-8,
            random_state=i,
            solver='sgd', batch_size=10,
            learning_rate='adaptive', learning_rate_init=0.1, momentum=0.0,
            )
        with warnings.catch_warnings(record=True) as warn_list:
            clf = mlp_sgd.fit(X, y)
        mlp_sgd.did_converge = True if len(warn_list) == 0 else False
        elapsed_time_sec = time.time() - start_time_sec
        print('finished SGD run %2d/%d after %6.1f sec | %3d epochs | %s | loss %.3f' % (
            i+1, n_runs, elapsed_time_sec,
            len(mlp_sgd.loss_curve_),
                'converged    ' if mlp_sgd.did_converge else 'NOT converged',
                mlp_sgd.loss_))

        tr_classifierLOG_SGD.append(clf)

### CV with the best C

In [None]:
C_grid = np.logspace(-9, 6, 31)
model_list = []
aver_train_score = []
aver_test_score = []
aver_train_loss = []
aver_test_loss = []


for C in C_grid:
    k = 3
    kfold = KFold(n_splits=k)
    
    train_scores = []
    test_scores = []
    train_loss = []
    test_loss = []

    model = sklearn.linear_model.LogisticRegression(C=C,solver='liblinear')
    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx,:], X[test_idx,:]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        pred_train = model.predict_proba(X_train)
        pred_test = model.predict_proba(X_test)

        # Score
        score_train = model.score(X_train, y_train)
        score_test = model.score(X_test, y_test)
#         print("Train score: ", score_train)
#         print("Test score: ", score_test)
        train_scores.append(score_train)
        test_scores.append(score_test)
        
        # Log loss
        log_loss_train = sklearn.metrics.log_loss(y_train,pred_train)
        log_loss_test = sklearn.metrics.log_loss(y_test,pred_test)
#         print("Train loss: ", log_loss_train)
#         print("Test loss: ", log_loss_test)
        train_loss.append(log_loss_train)
        test_loss.append(log_loss_test)
        
        
    print("\nFor C value : ", C)
    print("\nAverage train accuracy: ", np.average(score_train))
    print("Average test accuracy: ", np.average(score_test))
    print("\nAverage train loss: ", np.average(train_loss))
    print("Average test loss: ", np.average(test_loss))
    
    print('------------------------------------------------\n')
    
    model_list.append(model)
    aver_train_score.append(np.average(score_train))
    aver_test_score.append(np.average(score_test))
    aver_train_loss.append(np.average(train_loss))
    aver_test_loss.append(np.average(test_loss))

In [None]:
# print out the results in clear tabular format
pd.DataFrame(np.transpose([aver_train_score, aver_test_score, aver_train_loss, aver_test_loss]), columns=['train accuracy', 'test accuracy', 'train loss', 'test loss'])

#### Best Log loss

In [None]:
min_loss = min(aver_test_loss)
index_N2 = aver_test_loss.index(min_loss)
best_C =  C_grid[index_N2]
best_model = model_list[index_N2]

#### stability across Kfolds

In [None]:
print("-----------------\nClassify with base data, 5 folds\n-----------------")

K = [2,3,4,5,6,7,8,9,10,11,12]
K_train_loss = []
K_test_loss = []
for k in K:
    kfold = KFold(n_splits=k)
    train_scores = []
    test_scores = []
    train_loss = []
    test_loss = []
    
    for train_idx, test_idx in kfold.split(X):
        shuffler = np.random.permutation(len(X))
        X_shuffled = X[shuffler]
        y_shuffled = y[shuffler]
        X_train, X_test = X_shuffled[train_idx,:], X_shuffled[test_idx,:]
        y_train, y_test = y_shuffled[train_idx], y_shuffled[test_idx]

        model.fit(X_train, y_train)
        pred_train = best_model.predict_proba(X_train)
        pred_test = best_model.predict_proba(X_test)

        score_train = best_model.score(X_train, y_train)
        score_test = best_model.score(X_test, y_test)
        train_scores.append(score_train)
        test_scores.append(score_test)
        
        log_loss_train = sklearn.metrics.log_loss(y_train,pred_train)
        log_loss_test = sklearn.metrics.log_loss(y_test,pred_test)
        
        train_loss.append(log_loss_train)
        test_loss.append(log_loss_test)

    print("\nAverage train accuracy: ", np.average(score_train))
    print("Average test accuracy: ", np.average(score_test))
    print("Average train loss: ", np.average(train_loss))
    print("Average test loss: ", np.average(test_loss))
    
    K_train_loss.append(np.average(train_loss))
    K_test_loss.append(np.average(test_loss))

In [None]:
plt.xlabel('K from 2 to 12');
plt.ylabel('logistic loss');

sns.lineplot(x = K, y = K_train_loss, label = "Train Loss", color = "red", marker='o')
sns.lineplot(x = K, y = K_test_loss, label = "Test Loss", color = "blue", marker='o')

# show a legend on the plot 
plt.legend() 
plt.title('Log loss across K values')
plt.show()


print('standard deviation for training set: %.3f  ' %np.std(K_train_loss))
print('standard deviation for testing set: %.3f  ' %np.std(K_test_loss))

In [None]:
plt.xlabel('log10(C)');
plt.ylabel('logistic loss');
plt.ylim([0.0, 1]);

sns.lineplot(x = np.log10(C_grid), y = aver_train_loss, label = "Train Loss", color = "red", marker='o')
sns.lineplot(x = np.log10(C_grid), y = aver_test_loss,label = "Test Loss", color = "blue", marker='o')

# show a legend on the plot 
plt.legend() 
plt.title('Logistic loss on C-grid')
plt.show()

print("Best C-value for LR: %.3f" % best_C) 
print("Test set log-loss at best C-value: %.4f" % min_loss)

#### Result

In [None]:
yproba1_test = best_model.predict_proba(x_te)[:, 1] 
np.savetxt('yproba1_test.txt', yproba1_test)

# Neuronetwork

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
n_runs = 16
tr_classifierLBFS = list()

for i in range(n_runs):
    start_time_sec = time.time()
    mlp_lbfgs = MLPClassifier(
        hidden_layer_sizes=[2],
        activation='relu',
        alpha=0.0001,
        max_iter=200, tol=1e-6,
        random_state=i,
        )
    with warnings.catch_warnings(record=True) as warn_list:
        clf = mlp_lbfgs.fit(x_tr_N2, y_tr_N)
    elapsed_time_sec = time.time() - start_time_sec
    print('finished LBFGS run %2d/%d after %6.1f sec | %3d iters | %s | loss %.3f' % (
        i+1, n_runs, elapsed_time_sec,
        len(mlp_lbfgs.loss_curve_),
        'converged   ' if mlp_lbfgs.did_converge else 'NOT converged',
        mlp_lbfgs.loss_))
        
    tr_classifierLBFS.append(clf)