<h2>Importing libraries</h2>

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

<H2>Feature Selection</H2>

In [2]:
y_train=pd.read_csv("review_meta_train.csv")["rating"]
vote_funny=pd.read_csv("review_meta_train.csv")["vote_funny"]
vote_cool=pd.read_csv("review_meta_train.csv")["vote_cool"]
vote_useful=pd.read_csv("review_meta_train.csv")["vote_useful"]

In [3]:
'''
Calculate the average accuracies of 3 different feature representations: with 50, 100, and 200 features using 3 different
classifiers. Choose the data file with highest average accuracy to be used in the next steps
'''

list_of_filenames = ["review_text_train_doc2vec50.csv", 
                   "review_text_train_doc2vec100.csv",
                   "review_text_train_doc2vec200.csv"]
list_of_Xs = [pd.read_csv(i, header=None) for i in list_of_filenames]
list_of_clfs = [RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=2),
              DummyClassifier(strategy="most_frequent"),
              svm.LinearSVC(max_iter=10000)]
data_objects = [train_test_split(X, y_train, test_size=0.2, random_state=1) for X in list_of_Xs]
list_of_X_train = [i[0] for i in data_objects]
list_of_X_test = [i[1] for i in data_objects]
list_of_y_train = [i[2] for i in data_objects]
list_of_y_test = [i[3] for i in data_objects]

In [4]:
for data_file_number in range(len(list_of_filenames)):
    print("Data file number:", data_file_number)
    current_total_accuracy = 0
    for clf in list_of_clfs:
        clf.fit(list_of_X_train[data_file_number], list_of_y_train[data_file_number])
        predictions = clf.predict(list_of_X_test[data_file_number])
        accuracy = accuracy_score(predictions, list_of_y_test[0])
        print(clf, "Accuracy:", accuracy)
        current_total_accuracy += accuracy        
    average_accuracy = current_total_accuracy/len(list_of_clfs)
    print("File:", list_of_filenames[data_file_number])
    print("Average accuracy:", average_accuracy)

Data file number: 0
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=2, verbose=0,
                       warm_start=False) Accuracy: 0.7703954399714998
DummyClassifier(constant=None, random_state=None, strategy='most_frequent') Accuracy: 0.6841824011400072




LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) Accuracy: 0.8138582116138225
File: review_text_train_doc2vec50.csv
Average accuracy: 0.7561453509084433
Data file number: 1
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=2, verbose=0,
                       warm_start=False) Accuracy: 0.7452796579978624
DummyClassifier(constant=None, random_state=None, strategy='most_frequent'



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) Accuracy: 0.8192019950124688
File: review_text_train_doc2vec100.csv
Average accuracy: 0.7495546847167796
Data file number: 2
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=2, verbose=0,
                       warm_start=False) Accuracy: 0.7249732810830067
DummyClassifier(constant=None, random_state=None, strategy='most_frequent



We now know that the doc2vec representation with 50 features is better than the ones with 100 and 200 features, at least on preliminary classifiers which have not been carefully chosen or tuned. Now, let's see if adding the metadata (vote_funny, vote_cool, and vote_useful) helps the prediction.

In [5]:
df_with_metadata=pd.read_csv("review_text_train_doc2vec50.csv", header=None)
df_with_metadata["vote_funny"]=vote_funny
df_with_metadata["vote_cool"]=vote_cool
df_with_metadata["vote_useful"]=vote_useful

X=df_with_metadata

X_train, X_test, y_train, y_test=train_test_split(X, y_train, test_size=0.2, random_state=1)

list_of_clfs = [RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=2),
                DummyClassifier(strategy="most_frequent"),
              svm.LinearSVC(max_iter=10000)]

current_total_accuracy = 0
for clf in list_of_clfs:
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    accuracy = accuracy_score(predictions, y_test)
    print(clf, "Accuracy:", accuracy)
    current_total_accuracy += accuracy        
average_accuracy = current_total_accuracy/len(list_of_clfs)
print("Average accuracy:", average_accuracy)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=2, verbose=0,
                       warm_start=False) Accuracy: 0.7682579266120413
DummyClassifier(constant=None, random_state=None, strategy='most_frequent') Accuracy: 0.6841824011400072
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) Accuracy: 0.8163519771998575
Average accuracy: 0.7562641016506353




The difference is very small, but adding these features indeed result in improved performance. So we will use those features as well.

<H2>Use the Doc2Vec representation with 50 features and vote information</H2>

<H3>Work with training set and validation set</H3>

In [6]:
#Compile necessary elements and separate the class label
df=pd.read_csv("review_text_train_doc2vec50.csv", header=None)
y_train=pd.read_csv("review_meta_train.csv")["rating"]
vote_funny=pd.read_csv("review_meta_train.csv")["vote_funny"]
vote_cool=pd.read_csv("review_meta_train.csv")["vote_cool"]
vote_useful=pd.read_csv("review_meta_train.csv")["vote_useful"]

df["class_label"] = y_train
df["vote_funny"]=vote_funny
df["vote_cool"]=vote_cool
df["vote_useful"]=vote_useful

X = df.drop(columns=["class_label"])
y = df["class_label"]


"""
Split the dataset into train and validation set
"""
df_train = df.sample(frac=0.8, replace=False, random_state=1)
X_train = df_train.drop(columns=["class_label"])
y_train = df_train["class_label"]

df_val = df.drop(labels=df_train.index, axis='index')
X_val = df_val.drop(columns=["class_label"])
y_val = df_val["class_label"]

<H4>Accuracy scores of different classifiers</H4>

In [7]:
def train_predict(classifier, X_train, y_train, X_val, y_val):
    """
    Fit a model using X_train and y_train
    Predictions based on X_val
    Returns the predictions
    """
    
    classifier.fit(X_train, y_train)
    return classifier.predict(X_val)

<h4>Perform Grid Search to find the best parameters for Random Forest </h4>

In [8]:
parameters={"n_estimators": [50, 100, 200, 300],
            "max_depth": [2, 5, 10, 25, 50], 
            "criterion": ["entropy", "gini"],
            "random_state": [2]}
clf=GridSearchCV(estimator=RandomForestClassifier(), param_grid=parameters, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)
clf.best_params_

{'criterion': 'gini', 'max_depth': 50, 'n_estimators': 300, 'random_state': 2}

<h4>Find validation set and training set accuracy</h4>

In [9]:
clf1 = RandomForestClassifier(n_estimators=300, max_depth=25, n_jobs=-1, random_state=2)  #GINI is already by default
RF_predictions= train_predict(clf1, X_train, y_train, X_val, y_val)
RF_Acc = accuracy_score(RF_predictions, y_val)
RF_Acc    #Validation set accuracy

0.7760954755967224

In [10]:
accuracy_score(clf1.predict(X_train), y_train)    #Training set accuracy

1.0

<h4>Find 0-R performance on the training and testing sets</h4>

In [11]:
clf3=DummyClassifier(strategy="most_frequent")
clf3.fit(X_train, y_train)
accuracy_score(clf3.predict(X_train), y_train)   #Training accuracy

0.6886968914224637

In [12]:
accuracy_score(clf3.predict(X_val), y_val)    #Validation accuracy

0.6811542572141076

<h4>Perform Grid Search to find the best parameters for SVM </h4>

In [13]:
parameters={"C": [0.1, 1, 10, 100],
           "max_iter": [100, 1000, 10000]}
clf=GridSearchCV(estimator=svm.SVC(), param_grid=parameters, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)



GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100],
                         'max_iter': [100, 1000, 10000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [14]:
clf.best_params_

{'C': 1, 'max_iter': 10000}

<h4>Find validation set and training set accuracy</h4>

In [15]:
clf4 = svm.LinearSVC(C=1, max_iter=10000)
SVC_predictions = train_predict(clf4, X_train, y_train, X_val, y_val)
SVC_Acc = accuracy_score(SVC_predictions, y_val)
SVC_Acc    #Validation set accuracy



0.8101175632347702

In [16]:
accuracy_score(clf4.predict(X_train), y_train)    #Training accuracy

0.8194976396187762

<H2>Stacking (2 Fold)</H2>

In [17]:
"""
Split the train set into halves
"""

df_train_first_half=df_train.sample(frac=0.5)
df_train_second_half=df_train.drop(labels=df_train_first_half.index, axis='index')

X_train_first_half = df_train_first_half.drop(columns=["class_label"])
y_train_first_half = df_train_first_half["class_label"]

X_train_second_half = df_train_second_half.drop(columns=["class_label"])
y_train_second_half = df_train_second_half["class_label"]

<H4>Using different base classifiers, fit the model using one half of the train set, and predict using the other half of the train set. The results of the prediction will become the training set for the stacker</H4>

Random Forest (using parameters found via grid search)

In [18]:
df_for_stacker = pd.DataFrame()

base_clf1 = RandomForestClassifier(n_estimators=300, max_depth=25, n_jobs=-1, random_state=2)
base_clf1.fit(X_train_first_half, y_train_first_half)
rf_base_predictions = pd.DataFrame(base_clf1.predict_proba(X_train_second_half))
df_for_stacker["RF_1Star"] = rf_base_predictions[0]
df_for_stacker["RF_3Star"] = rf_base_predictions[1]
df_for_stacker["RF_5Star"] = rf_base_predictions[2]

In [19]:
df_for_stacker2 = pd.DataFrame()

base_clf1 = RandomForestClassifier(n_estimators=300, max_depth=25, n_jobs=-1, random_state=2)
base_clf1.fit(X_train_second_half, y_train_second_half)
rf_base_predictions = pd.DataFrame(base_clf1.predict_proba(X_train_first_half))
df_for_stacker2["RF_1Star"] = rf_base_predictions[0]
df_for_stacker2["RF_3Star"] = rf_base_predictions[1]
df_for_stacker2["RF_5Star"] = rf_base_predictions[2]

Naive Bayes and Neural Network. No hyperparameter tuning is done here as we are not going to be analysing these systems; these are simply minor components of the stacking classifier. For the neural network, we simply guessed what might be effective hyperparameter values.

In [20]:
base_clf2 = GaussianNB()
base_clf2.fit(X_train_first_half, y_train_first_half)
gnb_base_predictions = pd.DataFrame(base_clf2.predict_proba(X_train_second_half))
df_for_stacker["GNB_1Star"] = gnb_base_predictions[0]
df_for_stacker["GNB_3Star"] = gnb_base_predictions[1]
df_for_stacker["GNB_5Star"] = gnb_base_predictions[2]

In [21]:
base_clf2 = GaussianNB()
base_clf2.fit(X_train_second_half, y_train_second_half)
gnb_base_predictions = pd.DataFrame(base_clf2.predict_proba(X_train_first_half))
df_for_stacker2["GNB_1Star"] = gnb_base_predictions[0]
df_for_stacker2["GNB_3Star"] = gnb_base_predictions[1]
df_for_stacker2["GNB_5Star"] = gnb_base_predictions[2]

In [22]:
base_clf3 = MLPClassifier(hidden_layer_sizes=(100, 10), max_iter=1000)
base_clf3.fit(X_train_first_half, y_train_first_half)
nn_base_predictions = pd.DataFrame(base_clf3.predict_proba(X_train_second_half))
df_for_stacker["NN_1Star"] = nn_base_predictions[0]
df_for_stacker["NN_3Star"] = nn_base_predictions[1]
df_for_stacker["NN_5Star"] = nn_base_predictions[2]

In [23]:
base_clf3 = MLPClassifier(hidden_layer_sizes=(100, 10), max_iter=1000)
base_clf3.fit(X_train_second_half, y_train_second_half)
nn_base_predictions = pd.DataFrame(base_clf3.predict_proba(X_train_first_half))
df_for_stacker2["NN_1Star"] = nn_base_predictions[0]
df_for_stacker2["NN_3Star"] = nn_base_predictions[1]
df_for_stacker2["NN_5Star"] = nn_base_predictions[2]

SVM (using parameters found via grid search)

In [24]:
base_clf4 = svm.LinearSVC(C=1, max_iter=10000)
base_clf4.fit(X_train_first_half, y_train_first_half)
svm_base_predictions = pd.DataFrame(base_clf4.decision_function(X_train_second_half))
df_for_stacker["SVM_1Star"] = svm_base_predictions[0]
df_for_stacker["SVM_3Star"] = svm_base_predictions[1]
df_for_stacker["SVM_5Star"] = svm_base_predictions[2]



In [25]:
base_clf4 = svm.LinearSVC(C=1, max_iter=10000)
base_clf4.fit(X_train_second_half, y_train_second_half)
svm_base_predictions = pd.DataFrame(base_clf4.decision_function(X_train_first_half))
df_for_stacker2["SVM_1Star"] = svm_base_predictions[0]
df_for_stacker2["SVM_3Star"] = svm_base_predictions[1]
df_for_stacker2["SVM_5Star"] = svm_base_predictions[2]



<H4>Meta classifier</H4>
<H5>Grid Search</H5>

In [26]:
parameters={"C": [0.1, 1, 10, 20],
           "max_iter": [100, 500, 1000, 10000],
            "penalty": ['l1', 'l2'],
           "random_state": [1]}
clf=GridSearchCV(estimator=LogisticRegression(), param_grid=parameters, n_jobs=-1, cv=5)
clf.fit(pd.concat([df_for_stacker, df_for_stacker2]), pd.concat([y_train_second_half, y_train_first_half]))
clf.best_params_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'C': 1, 'max_iter': 100, 'penalty': 'l2', 'random_state': 1}

Finding training accuracy

In [28]:
meta_clf = LogisticRegression(C=1, penalty='l2', max_iter=100, random_state=1)
meta_clf.fit(pd.concat([df_for_stacker, df_for_stacker2]), pd.concat([y_train_second_half, y_train_first_half]))
meta_prediction = meta_clf.predict(pd.concat([df_for_stacker, df_for_stacker2]))
accuracy_score(meta_prediction, pd.concat([y_train_second_half, y_train_first_half]))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8251981829518126

<H4>Generate validation set for the meta classifier from base classifiers predictions</H4>

In [29]:
df_val_for_stacker = pd.DataFrame()

rf_base_predictions_val = pd.DataFrame(base_clf1.predict_proba(X_val))
df_val_for_stacker["RF_1Star"] = rf_base_predictions_val[0]
df_val_for_stacker["RF_3Star"] = rf_base_predictions_val[1]
df_val_for_stacker["RF_5Star"] = rf_base_predictions_val[2]

gnb_base_predictions_val = pd.DataFrame(base_clf2.predict_proba(X_val))
df_val_for_stacker["GNB_1Star"] = gnb_base_predictions_val[0]
df_val_for_stacker["GNB_3Star"] = gnb_base_predictions_val[1]
df_val_for_stacker["GNB_5Star"] = gnb_base_predictions_val[2]

nn_base_predictions_val = pd.DataFrame(base_clf3.predict_proba(X_val))
df_val_for_stacker["NN_1Star"] = nn_base_predictions_val[0]
df_val_for_stacker["NN_3Star"] = nn_base_predictions_val[1]
df_val_for_stacker["NN_5Star"] = nn_base_predictions_val[2]

svm_base_predictions_val = pd.DataFrame(base_clf4.decision_function(X_val))
df_val_for_stacker["SVM_1Star"] = svm_base_predictions_val[0]
df_val_for_stacker["SVM_3Star"] = svm_base_predictions_val[1]
df_val_for_stacker["SVM_5Star"] = svm_base_predictions_val[2]

<H4>Meta classifier performance on validation set</H4>

In [53]:
meta_predictions = meta_clf.predict(df_val_for_stacker)
accuracy_score(meta_predictions, y_val)   #Validation accuracy

0.8170644816530104

<H2>Repeat the process to get the predictions for the test set</H2>

<H3>Work with (complete) train set and test set, without validation set</H3>

In [31]:
X_test = pd.read_csv("review_text_test_doc2vec50.csv", header=None)

vote_funny=pd.read_csv("review_meta_train.csv")["vote_funny"]
vote_cool=pd.read_csv("review_meta_train.csv")["vote_cool"]
vote_useful=pd.read_csv("review_meta_train.csv")["vote_useful"]
y_train=pd.read_csv("review_meta_train.csv")["rating"]

df["class_label"] = y_train
df["vote_funny"]=vote_funny
df["vote_cool"]=vote_cool
df["vote_useful"]=vote_useful

vote_funny_test=pd.read_csv("review_meta_test.csv")["vote_funny"]
vote_cool_test=pd.read_csv("review_meta_test.csv")["vote_cool"]
vote_useful_test=pd.read_csv("review_meta_test.csv")["vote_useful"]

X_test["vote_funny"]=vote_funny_test
X_test["vote_cool"]=vote_cool_test
X_test["vote_useful"]=vote_useful_test

X = df.drop(columns=["class_label"])
y = df["class_label"]

In [32]:
"""
Split the train set into 2 halves
"""
df_first_half = df.sample(frac=0.5)
X_train_first_half = df_first_half.drop(columns=["class_label"])
y_train_first_half = df_first_half["class_label"]

df_second_half = df.drop(labels=df_first_half.index, axis='index')
X_train_second_half = df_second_half.drop(columns=["class_label"])
y_train_second_half = df_second_half["class_label"]

<H4>Create predictions using different classifiers</H4>

In [33]:
clf1 = RandomForestClassifier(n_estimators=300, max_depth=25, n_jobs=-1, random_state=2)
clf1.fit(X, y)
test_predictions = clf1.predict(X_test)
RF = pd.DataFrame(test_predictions)
RF.index=RF.index+1
RF=RF.reset_index()
RF.columns=["Instance_id", "rating"]
RF.to_csv("Random_Forest_predictions.csv", index=False)

In [34]:
clf2 = DummyClassifier(strategy="most_frequent")
clf2.fit(X, y)
test_predictions = clf2.predict(X_test)
zeroR = pd.DataFrame(test_predictions)
zeroR.index=zeroR.index+1
zeroR=zeroR.reset_index()
zeroR.columns=["Instance_id", "rating"]
zeroR.to_csv("Zero_R_predictions.csv", index=False)

In [35]:
clf3 = svm.LinearSVC(C=1, max_iter=10000)
clf3.fit(X, y)
test_predictions = clf3.predict(X_test)
svc = pd.DataFrame(test_predictions)
svc.index=svc.index+1
svc=svc.reset_index()
svc.columns=["Instance_id", "rating"]
svc.to_csv("Linear_SVC_predictions.csv", index=False)



<H4>Using different base classifiers, fit the model with one half of the train set, and predict using the other half of the train set. Then repeat, switching the two halves, thus generating the train set for the stacking classifier</H4>

Random Forest and SVM used the hyperparameters found via grid search as before, but Naive Bayes and neural network will not be tuned (and will instead just use hyperparameters that seem reasonable) as we will not analyse these models in our report

In [36]:
df_train_for_stacker = pd.DataFrame()
clf1 = RandomForestClassifier(n_estimators=300, max_depth=25, n_jobs=-1, random_state=2)
clf1.fit(X_train_first_half, y_train_first_half)
rf_base_predictions=pd.DataFrame(clf1.predict_proba(X_train_second_half))
df_train_for_stacker["RF_1Star"] = rf_base_predictions[0]
df_train_for_stacker["RF_3Star"] = rf_base_predictions[1]
df_train_for_stacker["RF_5Star"] = rf_base_predictions[2]

In [37]:
df_train_for_stacker2 = pd.DataFrame()
clf1 = RandomForestClassifier(n_estimators=300, max_depth=25, n_jobs=-1, random_state=2)
clf1.fit(X_train_second_half, y_train_second_half)
rf_base_predictions=pd.DataFrame(clf1.predict_proba(X_train_first_half))
df_train_for_stacker2["RF_1Star"] = rf_base_predictions[0]
df_train_for_stacker2["RF_3Star"] = rf_base_predictions[1]
df_train_for_stacker2["RF_5Star"] = rf_base_predictions[2]

In [38]:
clf2 = GaussianNB()
clf2.fit(X_train_first_half, y_train_first_half)
gnb_base_predictions = pd.DataFrame(clf2.predict_proba(X_train_second_half))
df_train_for_stacker["GNB_1Star"] = gnb_base_predictions[0]
df_train_for_stacker["GNB_3Star"] = gnb_base_predictions[1]
df_train_for_stacker["GNB_5Star"] = gnb_base_predictions[2]

In [39]:
clf2 = GaussianNB()
clf2.fit(X_train_second_half, y_train_second_half)
gnb_base_predictions = pd.DataFrame(clf2.predict_proba(X_train_first_half))
df_train_for_stacker2["GNB_1Star"] = gnb_base_predictions[0]
df_train_for_stacker2["GNB_3Star"] = gnb_base_predictions[1]
df_train_for_stacker2["GNB_5Star"] = gnb_base_predictions[2]

In [40]:
clf3 = MLPClassifier(hidden_layer_sizes=(100, 10), max_iter=1000)
clf3.fit(X_train_first_half, y_train_first_half)
nn_base_predictions = pd.DataFrame(clf3.predict_proba(X_train_second_half))
df_train_for_stacker["NN_1Star"] = nn_base_predictions[0]
df_train_for_stacker["NN_3Star"] = nn_base_predictions[1]
df_train_for_stacker["NN_5Star"] = nn_base_predictions[2]

In [41]:
clf3 = MLPClassifier(hidden_layer_sizes=(100, 10), max_iter=1000)
clf3.fit(X_train_second_half, y_train_second_half)
nn_base_predictions = pd.DataFrame(clf3.predict_proba(X_train_first_half))
df_train_for_stacker2["NN_1Star"] = nn_base_predictions[0]
df_train_for_stacker2["NN_3Star"] = nn_base_predictions[1]
df_train_for_stacker2["NN_5Star"] = nn_base_predictions[2]

In [42]:
clf4 = svm.LinearSVC(C=1, max_iter=10000)
clf4.fit(X_train_first_half, y_train_first_half)
svm_base_predictions = pd.DataFrame(clf4.decision_function(X_train_second_half))
df_train_for_stacker["SVM_1Star"] = svm_base_predictions[0]
df_train_for_stacker["SVM_3Star"] = svm_base_predictions[1]
df_train_for_stacker["SVM_5Star"] = svm_base_predictions[2]



In [43]:
clf4 = svm.LinearSVC(C=1, max_iter=10000)
clf4.fit(X_train_second_half, y_train_second_half)
svm_base_predictions = pd.DataFrame(clf4.decision_function(X_train_first_half))
df_train_for_stacker2["SVM_1Star"] = svm_base_predictions[0]
df_train_for_stacker2["SVM_3Star"] = svm_base_predictions[1]
df_train_for_stacker2["SVM_5Star"] = svm_base_predictions[2]



<H4>Generate attributes for the meta classifier from base classifier predictions using the test set</H4>

In [44]:
df_test_for_stacker = pd.DataFrame()

rf_base_predictions_test = pd.DataFrame(clf1.predict_proba(X_test))
df_test_for_stacker["RF_1Star"] = rf_base_predictions_test[0]
df_test_for_stacker["RF_3Star"] = rf_base_predictions_test[1]
df_test_for_stacker["RF_5Star"] = rf_base_predictions_test[2]

gnb_base_predictions_test = pd.DataFrame(clf2.predict_proba(X_test))
df_test_for_stacker["GNB_1Star"] = gnb_base_predictions_test[0]
df_test_for_stacker["GNB_3Star"] = gnb_base_predictions_test[1]
df_test_for_stacker["GNB_5Star"] = gnb_base_predictions_test[2]

nn_base_predictions_test = pd.DataFrame(clf3.predict_proba(X_test))
df_test_for_stacker["NN_1Star"] = nn_base_predictions_test[0]
df_test_for_stacker["NN_3Star"] = nn_base_predictions_test[1]
df_test_for_stacker["NN_5Star"] = nn_base_predictions_test[2]

svm_base_predictions_test = pd.DataFrame(clf4.decision_function(X_test))
df_test_for_stacker["SVM_1Star"] = svm_base_predictions_test[0]
df_test_for_stacker["SVM_3Star"] = svm_base_predictions_test[1]
df_test_for_stacker["SVM_5Star"] = svm_base_predictions_test[2]

<H4>Meta classifier</H4>
<H5>On test set</H5>

In [46]:
meta_clf_for_test = LogisticRegression(C=1, penalty='l2', max_iter=100, random_state=1)
meta_clf_for_test.fit(pd.concat([df_train_for_stacker, df_train_for_stacker2]), pd.concat([y_train_second_half, y_train_first_half]))
meta_prediction = meta_clf_for_test.predict(df_test_for_stacker)
meta = pd.DataFrame(meta_prediction)
meta.index=meta.index+1
meta=meta.reset_index()
meta.columns=["Instance_id", "rating"]
meta.to_csv("meta_clf_for_test_predictions.csv", index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


<h2>Error Analysis</h2>

In [54]:
confusion_matrix(y_val, meta_predictions)
#Will change between runs

array([[ 235,  143,   97],
       [  56,  769,  490],
       [  23,  218, 3583]], dtype=int64)

In [55]:
confusion_matrix(y_val, RF_predictions)
#Will change between runs

array([[ 127,  108,  240],
       [  17,  466,  832],
       [   5,   55, 3764]], dtype=int64)

In [56]:
confusion_matrix(y_val, SVC_predictions)
#Will change between runs

array([[ 204,  133,  138],
       [  53,  708,  554],
       [  21,  167, 3636]], dtype=int64)

In [57]:
Counter(y_val)

Counter({3: 1315, 5: 3824, 1: 475})

In [58]:
Counter(meta_prediction)
#bias toward 5 star

Counter({5: 5119, 1: 476, 3: 1423})

In [59]:
Counter(RF_predictions)
#bias toward 5 star

Counter({5: 4836, 3: 629, 1: 149})

In [60]:
Counter(SVC_predictions)
#bias toward 5 star

Counter({5: 4328, 3: 1008, 1: 278})