In [226]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import  LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB

In [114]:
%%time
df = pd.read_csv('train.csv')

CPU times: user 15.5 s, sys: 4.1 s, total: 19.6 s
Wall time: 21.5 s


In [115]:
df.head(1)

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,0,5019281,ADZPIG9QOCDG5,0,0,4.0,1203984000,good version of a classic,This is a charming version of the classic Dick...


# Exploratory Data Analyses

In [116]:
df.dropna(axis = 0, inplace= True)

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1397461 entries, 0 to 1697532
Data columns (total 9 columns):
Id                        1397461 non-null int64
ProductId                 1397461 non-null object
UserId                    1397461 non-null object
HelpfulnessNumerator      1397461 non-null int64
HelpfulnessDenominator    1397461 non-null int64
Score                     1397461 non-null float64
Time                      1397461 non-null int64
Summary                   1397461 non-null object
Text                      1397461 non-null object
dtypes: float64(1), int64(4), object(4)
memory usage: 106.6+ MB


In [118]:
df.describe()

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,1397461.0,1397461.0,1397461.0,1397461.0,1397461.0
mean,849007.0,3.574046,5.305333,4.111471,1262467000.0
std,490194.9,17.77227,20.79026,1.196803,128947100.0
min,0.0,0.0,0.0,1.0,879379200.0
25%,424614.0,0.0,0.0,4.0,1164499000.0
50%,849193.0,1.0,1.0,5.0,1307750000.0
75%,1273606.0,3.0,5.0,5.0,1373242000.0
max,1697532.0,6084.0,6510.0,5.0,1406074000.0


In [119]:
df.nunique()

Id                        1397461
ProductId                   50050
UserId                     123958
HelpfulnessNumerator          599
HelpfulnessDenominator        654
Score                           5
Time                         5788
Summary                   1078989
Text                      1396636
dtype: int64

In [120]:
#ID is the same as index so we can safely drop this column
df.drop('Id', inplace= True, axis = 1)

In [121]:
df.head(1)

Unnamed: 0,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,5019281,ADZPIG9QOCDG5,0,0,4.0,1203984000,good version of a classic,This is a charming version of the classic Dick...


In [245]:
def feature_extraction_pipeline(df_train,df_test):
    
    def feature_extraction(df_train,df_test,text_feature):

        #Models and Transformations
        vectorizer = TfidfVectorizer(stop_words= 'english')
        model_svd = TruncatedSVD(n_components= 100)
        model_lr = LogisticRegression(solver= 'saga', multi_class= 'multinomial')

        #For training
        X = df_train[text_feature]
        X = vectorizer.fit_transform(X)
        
        #Reducing dimensions with SVD
        principalComponents = model_svd.fit_transform(X)
        df_svd = pd.DataFrame(principalComponents)

        X = df_svd.values
        y = df_train['Score'].values.reshape(-1)
        
        #Training model
        #Calculating probabilitiesfrom for Summary & Text
        model_lr.fit(X, y)
        predictions_probs = model_lr.predict_proba(X)
        p_df = pd.DataFrame(predictions_probs, columns=[f'p_{text_feature}_{x}' for x in range(1,6)])

        #For testing
        X_test = df_test[text_feature]
        X_test = vectorizer.transform(X_test)
        principalComponents_test = model_svd.fit_transform(X_test)
        df_svd_test = pd.DataFrame(principalComponents_test)

        X_test = df_svd_test.values

        #Predict Probability on test from train
        predictions_probs_test = model_lr.predict_proba(X_test)

        p_df_test = pd.DataFrame(predictions_probs_test, columns=[f'p_{text_feature}_{x}' for x in range(1,6)])

        return p_df, p_df_test

#     def fit_classifier(train_df):

#         y = train_df['Score'].values.ravel()
#         X = train_df.drop('Score', axis = 1)
        
#         clf = LogisticRegression(solver= 'saga', multi_class= 'multinomial')
#         clf.fit(X,y)
    
#         return clf

    train_p_s, test_p_s = feature_extraction(df_train,df_test,'Summary')
    train_p_t, test_p_t = feature_extraction(df_train,df_test,'Text')
    
    print('Completed Feature Extraction')
    
    df_train.reset_index(drop = True, inplace = True)
    train_df = pd.concat([df_train, train_p_s,train_p_t], axis = 1)
    train_df = train_df.drop(['ProductId','UserId','Time' , 'Summary' , 'Text'], axis = 1)
    
    df_test.reset_index(drop = True, inplace = True)
    test_df = pd.concat([df_test, test_p_s,test_p_t], axis = 1)
    test_df = test_df.drop(['ProductId','UserId', 'Time' , 'Summary' , 'Text'], axis = 1)
    
    print('Completing Selecting Relevant Features')
    
    scaler = MinMaxScaler()
    
    train_df[['HelpfulnessNumerator', 'HelpfulnessDenominator']] = scaler.fit_transform(train_df[['HelpfulnessNumerator','HelpfulnessDenominator']])
    test_df[['HelpfulnessNumerator', 'HelpfulnessDenominator']] = scaler.fit_transform(test_df[['HelpfulnessNumerator','HelpfulnessDenominator']])
    
    print('Completed Normalizing Features')
    
    #clf = fit_classifier(train_df, test_df)
    
    print('Completed Fitting Model')
    
    return train_df, test_df
    
  
    

In [208]:
sample = df.sample(frac=0.7, random_state=1)

In [209]:
sample_test = df.sample(frac=0.3, random_state=2)

In [199]:
%%time
train_df, test_df = feature_extraction_pipeline(sample,sample_test)

Completed Feature Extraction
Completing Selecting Relevant Features
Completed Normalizing Features
CPU times: user 4min 33s, sys: 17 s, total: 4min 50s
Wall time: 3min 29s


In [201]:
X = train_df.drop('Score', axis = 1)
y = train_df['Score']

In [202]:
model_blah = LogisticRegression(multi_class= 'multinomial', solver= 'saga')

In [203]:
model_blah.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [204]:
pred_blah =  model_blah.predict(test_df.drop('Score', axis = 1))

In [206]:
accuracy_score(sample_test['Score'], pred_blah)

0.5041718546505803

In [212]:
%%time
train_df, test_df = feature_extraction_pipeline(sample,sample_test)

Completed Feature Extraction
Completing Selecting Relevant Features
Completed Normalizing Features
Completed Fitting Model
CPU times: user 11min 2s, sys: 59.2 s, total: 12min 1s
Wall time: 9min 44s


In [None]:
%%time
pred_blah_2 =  model_blah.predict(test_df.drop('Score', axis = 1))

In [213]:
train_df.shape

(978223, 13)

In [217]:
train_df.head(1)

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Score,p_Summary_1,p_Summary_2,p_Summary_3,p_Summary_4,p_Summary_5,p_Text_1,p_Text_2,p_Text_3,p_Text_4,p_Text_5
0,0.0,0.0,5.0,0.004008,0.018084,0.070328,0.214824,0.692756,0.002506,0.00485,0.016805,0.122925,0.852914


In [223]:
test_df.head(1)

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Score,p_Summary_1,p_Summary_2,p_Summary_3,p_Summary_4,p_Summary_5,p_Text_1,p_Text_2,p_Text_3,p_Text_4,p_Text_5
0,0.000646,0.000641,3.0,0.083122,0.080659,0.132423,0.224176,0.479621,0.028225,0.043527,0.159619,0.234061,0.534566


In [214]:
test_df.shape

(419238, 13)

In [216]:
model_nb = MultinomialNB()

In [220]:
model_nb.fit(train_df.drop('Score', axis=1), train_df['Score'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [224]:
predictions_nb = model_nb.predict(test_df.drop('Score',axis = 1))

In [225]:
accuracy_score(test_df['Score'], predictions_nb)

0.5334487808834123

In [227]:
## ADAboost
model_ada = AdaBoostClassifier()

In [228]:
model_ada.fit(train_df.drop('Score', axis=1), train_df['Score'])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [231]:
predictions_ada = model_ada.predict(test_df.drop('Score',axis = 1))

In [232]:
accuracy_score(test_df['Score'], predictions_ada)

0.5508684804335485

In [233]:
hyper =  {'n_estimators': [50, 100],'learning_rate' : [0.01,0.05,0.1,0.3,1]}

In [234]:
model_gs = GridSearchCV(model_ada, param_grid=hyper, cv = 3, scoring= 'accuracy')

In [235]:
model_gs.fit(train_df.drop('Score', axis=1), train_df['Score'])

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='warn', n_jobs=None,
             param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.3, 1],
                         'n_estimators': [50, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [236]:
model_gs.best_estimator_

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
                   n_estimators=100, random_state=None)

In [238]:
model_gs.best_score_

0.6153269755464756

In [237]:
pd.DataFrame(model_gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,72.210898,0.336992,1.767452,0.03234,0.01,50,"{'learning_rate': 0.01, 'n_estimators': 50}",0.53409,0.534093,0.534095,0.534093,2e-06,9
1,142.330389,1.686906,3.486975,0.076474,0.01,100,"{'learning_rate': 0.01, 'n_estimators': 100}",0.53409,0.534093,0.534095,0.534093,2e-06,9
2,72.231627,0.51462,1.75718,0.011791,0.05,50,"{'learning_rate': 0.05, 'n_estimators': 50}",0.550313,0.550602,0.551349,0.550755,0.000436,8
3,135.57635,2.645688,3.626596,0.105392,0.05,100,"{'learning_rate': 0.05, 'n_estimators': 100}",0.576117,0.575688,0.575834,0.57588,0.000178,7
4,69.051101,0.921068,1.846062,0.058767,0.1,50,"{'learning_rate': 0.1, 'n_estimators': 50}",0.577457,0.576271,0.580419,0.578049,0.001744,6
5,139.456295,1.571639,3.541262,0.155054,0.1,100,"{'learning_rate': 0.1, 'n_estimators': 100}",0.603258,0.602676,0.60215,0.602695,0.000452,5
6,68.269967,1.872594,1.99206,0.021521,0.3,50,"{'learning_rate': 0.3, 'n_estimators': 50}",0.607349,0.609264,0.606784,0.607799,0.001061,4
7,136.824897,2.020004,3.448603,0.028958,0.3,100,"{'learning_rate': 0.3, 'n_estimators': 100}",0.61279,0.613695,0.611059,0.612515,0.001093,2
8,70.20323,2.718723,1.773666,0.027519,1.0,50,"{'learning_rate': 1, 'n_estimators': 50}",0.610692,0.612064,0.609572,0.610776,0.001019,3
9,138.215528,0.975401,3.539376,0.083573,1.0,100,"{'learning_rate': 1, 'n_estimators': 100}",0.615525,0.61583,0.614626,0.615327,0.000511,1


In [241]:
hyper2 =  {'n_estimators': [100, 150],'learning_rate' : [1, 1.5, 2]}

In [242]:
model_gs2 = GridSearchCV(model_ada, param_grid=hyper2, cv = 3, scoring= 'accuracy')

In [243]:
%%time
model_gs2.fit(train_df.drop('Score', axis=1), train_df['Score'])

CPU times: user 1h 23min 51s, sys: 1min 37s, total: 1h 25min 28s
Wall time: 57min 17s


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='warn', n_jobs=None,
             param_grid={'learning_rate': [1, 1.5, 2],
                         'n_estimators': [100, 150]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [244]:
pd.DataFrame(model_gs2.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,130.637822,1.379437,3.857869,0.018148,1.0,100,"{'learning_rate': 1, 'n_estimators': 100}",0.615525,0.61583,0.614626,0.615327,0.000511,3
1,201.376422,5.802723,6.169917,0.32886,1.0,150,"{'learning_rate': 1, 'n_estimators': 150}",0.616795,0.617354,0.616129,0.616759,0.000501,1
2,133.98542,2.545582,3.876928,0.098934,1.5,100,"{'learning_rate': 1.5, 'n_estimators': 100}",0.614709,0.614928,0.612525,0.614054,0.001085,4
3,196.042651,2.29528,5.605462,0.123512,1.5,150,"{'learning_rate': 1.5, 'n_estimators': 150}",0.616826,0.617627,0.614642,0.616365,0.001262,2
4,131.213131,1.826281,3.784638,0.002712,2.0,100,"{'learning_rate': 2, 'n_estimators': 100}",0.424662,0.397787,0.401241,0.407897,0.011938,5
5,203.764175,4.972128,5.486059,0.161598,2.0,150,"{'learning_rate': 2, 'n_estimators': 150}",0.423702,0.397787,0.399174,0.406888,0.011903,6
