In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , VotingClassifier , GradientBoostingClassifier , AdaBoostClassifier

In [None]:
def _samples_as_rows(frame):
    """Transpose ``frame`` so every sample becomes a row, using the first row as header.

    After the transpose the first row holds the values of the original first
    column (presumably ``ID_REF``, the feature identifiers); those values become
    the column labels and the row itself is dropped from the data.
    """
    transposed = frame.T
    transposed.columns = transposed.iloc[0]
    return transposed[1:]


def _load_training_set(path, n_label_rows):
    """Read a training CSV whose last ``n_label_rows`` rows hold the target labels.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    n_label_rows : int
        Number of trailing rows that contain labels rather than features.

    Returns
    -------
    features : pandas.DataFrame
        The feature rows, transposed so that samples are rows.
    labels : list of numpy.ndarray
        One array of shape ``(1, n_samples)`` per label row, in file order.
    """
    raw = pd.read_csv(path)
    # .drop() returns a new frame, so the label rows are never mutated through
    # a slice of the raw data (which is what .pop() on a slice would do).
    label_rows = raw.iloc[-n_label_rows:].drop(columns='ID_REF')
    labels = [np.array(label_rows.iloc[row:row + 1]) for row in range(n_label_rows)]
    features = _samples_as_rows(raw.iloc[:-n_label_rows])
    return features, labels


# Dataset 1: the last two rows of the training file are the targets co1 and co2.
train_data_1, (co1, co2) = _load_training_set("../input/datacontest1/Dataset_1_Training.csv", 2)
test_data_1 = _samples_as_rows(pd.read_csv("../input/datacontest1/Dataset_1_Testing.csv"))

# Dataset 2: the last four rows of the training file are the targets co3 .. co6.
train_data_2, (co3, co4, co5, co6) = _load_training_set("../input/datacontest1/Dataset_2_Training.csv", 4)
# The test set must come from the *Testing* file: loading the training file here
# would keep its label rows as extra features and mismatch the training columns.
# NOTE(review): file name assumed by analogy with Dataset_1_Testing.csv — confirm it exists.
test_data_2 = _samples_as_rows(pd.read_csv("../input/datacontest1/Dataset_2_Testing.csv"))

In [None]:
# Hold out 20% of each dataset for validation.
# All label vectors of a dataset go through one call, so they are split with
# the same shuffled indices as the feature frame (fixed by random_state=10).
X_ds1_train, X_ds1_test, co1_train, co1_test, co2_train, co2_test = train_test_split(
    train_data_1, co1[0], co2[0],
    test_size=0.2, random_state=10,
)

(
    X_ds2_train, X_ds2_test,
    co3_train, co3_test,
    co4_train, co4_test,
    co5_train, co5_test,
    co6_train, co6_test,
) = train_test_split(
    train_data_2, co3[0], co4[0], co5[0], co6[0],
    test_size=0.2, random_state=10,
)

In [None]:
# Convert from integer to float
X_ds1_train_fl = X_ds1_train.astype(float, 64)
X_ds1_test_fl = X_ds1_test.astype(float, 64)
test_data_1_f1 = test_data_1.astype(float, 64)

# Applying the same scaling to both the training and test data
scalar = StandardScaler()
X_ds1_train_scl = scalar.fit_transform(X_ds1_train_fl)
X_ds1_test_scl = scalar.transform(X_ds1_test_fl)
test_data_1_scl = scalar.transform(test_data_1_f1)

# Convert from integer to float
X_ds2_train_fl = X_ds2_train.astype(float, 64)
X_ds2_test_fl = X_ds2_test.astype(float, 64)
test_data_2_f1 = test_data_2.astype(float, 64)

# Applying the same scaling to both the training and test data
scalar = StandardScaler()
X_ds2_train_scl = scalar.fit_transform(X_ds2_train_fl)
X_ds2_test_scl = scalar.transform(X_ds2_test_fl)
test_data_2_scl = scalar.transform(test_data_2_f1)

In [None]:
# Dataset 1: fit a full PCA once to see how the variance is spread over components.
pca = PCA()
pca.fit(X_ds1_train_scl)

# k = smallest number of leading components whose cumulative variance reaches 90%.
cumulative_variance = np.cumsum(pca.explained_variance_)
variance_share = cumulative_variance / cumulative_variance[-1]
k = int(np.argmax(variance_share >= 0.90)) + 1

# Report the real input width instead of a hard-coded feature count.
n_features = X_ds1_train_scl.shape[1]
print(k, " features explain around 90% of the variance. From ", n_features, " features to ", k, ", not too bad.", sep='')

# Refit keeping only k components and project every split with the same fitted PCA.
pca = PCA(n_components=k)
pca.fit(X_ds1_train_scl)
X_ds1_train_pca = pca.transform(X_ds1_train_scl)
X_ds1_test_pca = pca.transform(X_ds1_test_scl)
test_data_1_pca = pca.transform(test_data_1_scl)

# Cumulative explained variance (in percent) of the retained components.
var_exp = pca.explained_variance_ratio_.cumsum() * 100
fig, ax = plt.subplots()
ax.bar(range(k), var_exp)
ax.set_xlabel('Principal component')
ax.set_ylabel('Cumulative explained variance (%)')
ax.set_title('Dataset 1: PCA explained variance');

In [None]:
# Apply PCA on training data
pca = PCA()
pca.fit_transform(X_ds2_train_scl)

total = sum(pca.explained_variance_)
k = 0
current_variance = 0
while current_variance/total < 0.90:
    current_variance += pca.explained_variance_[k]
    k = k + 1
    
print(k, " features explain around 90% of the variance. From 54675 features to ", k, ", not too bad.", sep='')

pca = PCA(n_components=k)
X_ds2_train.pca = pca.fit(X_ds2_train_scl)
X_ds2_train_pca = pca.transform(X_ds2_train_scl)
X_ds2_test_pca = pca.transform(X_ds2_test_scl)
test_data_2_pca = pca.transform(test_data_2_scl)

var_exp = pca.explained_variance_ratio_.cumsum()
var_exp = var_exp*100
plt.bar(range(k), var_exp);

In [None]:
def setSeed():
    """Reset NumPy's global random number generator to the fixed seed 0."""
    np.random.seed(seed=0)


# Seed once up front; the model-building functions call setSeed() again themselves.
setSeed()

In [None]:
def getAdaBoostClassifierModellr(X_train, X_test, y_train, y_test):
    """Grid-search an AdaBoost ensemble of logistic-regression learners and evaluate it.

    The number of boosting rounds is tuned with a 3-fold grid search scored on
    accuracy. The tuned model is then scored on the hold-out split and its
    confusion matrix is drawn as a heatmap.

    Parameters
    ----------
    X_train, y_train : features and labels used for the grid search.
    X_test, y_test : hold-out features and labels used for the reported accuracy.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV object, hold-out accuracy rounded to 3 decimals)``.
    """
    setSeed()
    abc_param_grid = {
        'n_estimators': [10, 20, 30],
    }

    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # later removed, while the first positional slot works on both.
    abc_estimator = AdaBoostClassifier(LogisticRegression(random_state=0), random_state=0)

    adaBoost_model = GridSearchCV(abc_estimator, abc_param_grid, cv=3, scoring='accuracy')

    # Train the AdaBoost classifier (fit returns the search object itself)
    adaBoost_model = adaBoost_model.fit(X_train, y_train)

    print("Best Parameters:\n", adaBoost_model.best_params_)

    # Predict the response for the hold-out split with the refitted best model
    y_pred = adaBoost_model.predict(X_test)

    accuracy = round(accuracy_score(y_test, y_pred), 3)
    print('AdaBoostClassifier model accuracy:', accuracy)

    cm_ab = confusion_matrix(y_test, y_pred)

    # A fresh figure per call, so repeated calls never draw onto the same axes.
    _, ax = plt.subplots()
    sns.heatmap(cm_ab, annot=True, ax=ax, fmt='g', cmap='Greens')

    # labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('AdaBoost classifier Confusion Matrix')
    labels = [0, 1]  # assumes a binary 0/1 target — TODO confirm
    ax.xaxis.set_ticklabels(labels)
    ax.yaxis.set_ticklabels(labels, rotation=360)

    return adaBoost_model, accuracy

In [None]:
# Target co3 (dataset 2): AdaBoost over logistic regression on the PCA-reduced features.
co3_model_9, _ = getAdaBoostClassifierModellr(
    X_train=X_ds2_train_pca,
    X_test=X_ds2_test_pca,
    y_train=co3_train,
    y_test=co3_test,
)

In [None]:
# Predict co3 for the PCA-projected dataset-2 test samples and display the result.
# NOTE(review): the variable is named *_ds1_pred although its input is test_data_2_pca;
# the name is left unchanged because the submission cell reads it.
co3_ds1_pred = co3_model_9.predict(test_data_2_pca)
co3_ds1_pred

In [None]:
def getLogisticRegressionModel(X_train, X_test, y_train, y_test, plotConfusionMatrix=True):
    """Tune a liblinear logistic regression and score it on the hold-out split.

    A 3-fold grid search (scored on accuracy) explores the regularisation
    strength ``C`` and the ``l1``/``l2`` penalty. The best estimator is
    evaluated on ``X_test``/``y_test``; optionally its confusion matrix is
    drawn as a heatmap.

    Parameters
    ----------
    X_train, y_train : features and labels used for the grid search.
    X_test, y_test : hold-out features and labels used for the reported accuracy.
    plotConfusionMatrix : bool, default True
        Draw the confusion-matrix heatmap when true.

    Returns
    -------
    tuple
        ``(best fitted LogisticRegression, hold-out accuracy rounded to 3 decimals)``.
    """
    setSeed()

    search = GridSearchCV(
        estimator=LogisticRegression(solver='liblinear', random_state=0),
        param_grid={
            'C': [1e-03, 1e-2, 1e-1, 1, 10],
            'penalty': ['l1', 'l2'],
        },
        cv=3,
        scoring='accuracy',
    )
    search.fit(X_train, y_train)
    print("Best Parameters:\n", search.best_params_)

    # Evaluate the refitted best estimator on the hold-out split
    best_log = search.best_estimator_
    predictions = best_log.predict(X_test)
    accuracy = round(accuracy_score(y_test, predictions), 3)
    print('Logistic Regression accuracy:', accuracy)

    if not plotConfusionMatrix:
        return best_log, accuracy

    matrix = confusion_matrix(y_test, predictions)
    ax = plt.subplot()
    sns.heatmap(matrix, annot=True, ax=ax, fmt='g', cmap='Greens')
    ax.set(
        xlabel='Predicted labels',
        ylabel='True labels',
        title='Logistic Regression Confusion Matrix',
    )
    tick_labels = [0, 1]
    ax.xaxis.set_ticklabels(tick_labels)
    ax.yaxis.set_ticklabels(tick_labels, rotation=360)

    return best_log, accuracy

In [None]:
def getSVMModel(X_train_pca, X_test_pca, y_train, y_test, plotConfusionMatrix=True):
    """Tune a support-vector classifier and score it on the hold-out split.

    A 3-fold grid search (default scorer) explores ``C``, ``gamma``, the kernel
    and the decision-function shape. The best estimator is evaluated on
    ``X_test_pca``/``y_test``; optionally its confusion matrix is drawn as a
    heatmap.

    Parameters
    ----------
    X_train_pca, y_train : features and labels used for the grid search.
    X_test_pca, y_test : hold-out features and labels used for the reported accuracy.
    plotConfusionMatrix : bool, default True
        Draw the confusion-matrix heatmap when true.

    Returns
    -------
    tuple
        ``(best fitted SVC, hold-out accuracy rounded to 3 decimals)``.
    """
    setSeed()

    search_space = {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001, 0.00001, 10],
        "kernel": ["linear", "rbf", "poly"],
        "decision_function_shape": ["ovo", "ovr"],
    }
    svm_search = GridSearchCV(SVC(random_state=0), search_space, cv=3)
    svm_search.fit(X_train_pca, y_train)
    print("Best Parameters:\n", svm_search.best_params_)

    # Evaluate the refitted best estimator on the hold-out split
    best_svc = svm_search.best_estimator_
    predictions = best_svc.predict(X_test_pca)
    accuracy = round(accuracy_score(y_test, predictions), 3)
    print('SVM accuracy:', accuracy)

    if not plotConfusionMatrix:
        return best_svc, accuracy

    matrix = confusion_matrix(y_test, predictions)
    ax = plt.subplot()
    sns.heatmap(matrix, annot=True, ax=ax, fmt='g', cmap='Greens')
    ax.set(
        xlabel='Predicted labels',
        ylabel='True labels',
        title='SVM Confusion Matrix',
    )
    tick_labels = [0, 1]
    ax.xaxis.set_ticklabels(tick_labels)
    ax.yaxis.set_ticklabels(tick_labels, rotation=360)

    return best_svc, accuracy

In [None]:
def rfModel(X_train, X_test, y_train, y_test, plotConfusionMatrix=True):
    """Tune a random forest and score it on the hold-out split.

    A 3-fold grid search (scored on accuracy) explores bootstrapping, the leaf
    and split size limits and the number of trees. The best estimator is
    evaluated on ``X_test``/``y_test``; optionally its confusion matrix is
    drawn as a heatmap.

    Parameters
    ----------
    X_train, y_train : features and labels used for the grid search.
    X_test, y_test : hold-out features and labels used for the reported accuracy.
    plotConfusionMatrix : bool, default True
        Draw the confusion-matrix heatmap when true.

    Returns
    -------
    tuple
        ``(best fitted RandomForestClassifier, hold-out accuracy rounded to 3 decimals)``.
    """
    setSeed()

    search_space = {
        'bootstrap': [True, False],
        'min_samples_leaf': [8, 10, 12, 14],
        'min_samples_split': [3, 5, 7],
        'n_estimators': [50, 80, 100],
    }
    forest_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=0),
        param_grid=search_space,
        cv=3,
        scoring='accuracy',
    )
    forest_search.fit(X_train, y_train)
    print("Best Parameters:\n", forest_search.best_params_)

    # Evaluate the refitted best estimator on the hold-out split
    rf_model_best = forest_search.best_estimator_
    predictions = rf_model_best.predict(X_test)
    accuracy = round(accuracy_score(y_test, predictions), 3)
    print('Random Forest accuracy:', accuracy)

    if not plotConfusionMatrix:
        return rf_model_best, accuracy

    matrix = confusion_matrix(y_test, predictions)
    ax = plt.subplot()
    sns.heatmap(matrix, annot=True, ax=ax, fmt='g', cmap='Greens')
    ax.set(
        xlabel='Predicted labels',
        ylabel='True labels',
        title='Random Forest Confusion Matrix',
    )
    tick_labels = [0, 1]
    ax.xaxis.set_ticklabels(tick_labels)
    ax.yaxis.set_ticklabels(tick_labels, rotation=360)

    return rf_model_best, accuracy

In [None]:
def votingModel(X_train, X_test, y_train, y_test):
    """Build a hard-voting ensemble from tuned LR, SVC and random-forest models.

    Each member is first tuned by its own helper (confusion-matrix plots
    suppressed). The three best estimators are then combined in a
    ``VotingClassifier`` with hard voting, which is fitted on the training
    split, scored on the hold-out split, and its confusion matrix is drawn.

    Parameters
    ----------
    X_train, y_train : features and labels used for tuning and fitting.
    X_test, y_test : hold-out features and labels used for the reported accuracy.

    Returns
    -------
    tuple
        ``(fitted VotingClassifier, hold-out accuracy rounded to 3 decimals)``.
    """
    # Tune each ensemble member; only the estimator is kept, its accuracy is discarded.
    best_log, _ = getLogisticRegressionModel(X_train, X_test, y_train, y_test, plotConfusionMatrix=False)
    best_svc, _ = getSVMModel(X_train, X_test, y_train, y_test, plotConfusionMatrix=False)
    best_rf, _ = rfModel(X_train, X_test, y_train, y_test, plotConfusionMatrix=False)

    # Named members of the ensemble. Note: the key 'NBC' holds the random forest.
    members = [
        ('LR', best_log),
        ('SVC', best_svc),
        ('NBC', best_rf),
    ]

    # Hard voting: the ensemble is built and fitted after re-seeding
    setSeed()
    vot_Model = VotingClassifier(estimators=members, voting='hard')
    vot_Model.fit(X_train, y_train)
    predictions = vot_Model.predict(X_test)

    accuracy = round(accuracy_score(y_test, predictions), 3)
    print('Voting model accuracy:', accuracy)

    matrix = confusion_matrix(y_test, predictions)
    ax = plt.subplot()
    sns.heatmap(matrix, annot=True, ax=ax, fmt='g', cmap='Greens')
    ax.set(
        xlabel='Predicted labels',
        ylabel='True labels',
        title='Voting classifier Confusion Matrix',
    )
    tick_labels = [0, 1]
    ax.xaxis.set_ticklabels(tick_labels)
    ax.yaxis.set_ticklabels(tick_labels, rotation=360)

    return vot_Model, accuracy


In [None]:
def getAdaBoostClassifierModeldt(X_train, X_test, y_train, y_test):
    """Grid-search an AdaBoost ensemble of depth-1 decision trees and evaluate it.

    The number of boosting rounds is tuned with a 3-fold grid search scored on
    accuracy. The tuned model is then scored on the hold-out split and its
    confusion matrix is drawn as a heatmap.

    Parameters
    ----------
    X_train, y_train : features and labels used for the grid search.
    X_test, y_test : hold-out features and labels used for the reported accuracy.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV object, hold-out accuracy rounded to 3 decimals)``.
    """
    setSeed()

    # Decision stump as weak learner; no random_state is given to the booster itself.
    stump = DecisionTreeClassifier(random_state=0, max_depth=1)
    booster = AdaBoostClassifier(stump)
    rounds_grid = {'n_estimators': [10, 20, 30]}

    search = GridSearchCV(booster, rounds_grid, cv=3, scoring='accuracy')
    adaBoost_model = search.fit(X_train, y_train)  # fit returns the search object
    print("Best Parameters:\n", adaBoost_model.best_params_)

    # Score the refitted best model on the hold-out split
    predictions = adaBoost_model.predict(X_test)
    accuracy = round(accuracy_score(y_test, predictions), 3)
    print('AdaBoostClassifier model accuracy:', accuracy)

    matrix = confusion_matrix(y_test, predictions)
    ax = plt.subplot()
    sns.heatmap(matrix, annot=True, ax=ax, fmt='g', cmap='Greens')
    ax.set(
        xlabel='Predicted labels',
        ylabel='True labels',
        title='AdaBoost classifier Confusion Matrix',
    )
    tick_labels = [0, 1]
    ax.xaxis.set_ticklabels(tick_labels)
    ax.yaxis.set_ticklabels(tick_labels, rotation=360)

    return adaBoost_model, accuracy

In [None]:
def getAdaBoostClassifierModelsv(X_train, X_test, y_train, y_test):
    """Grid-search an AdaBoost ensemble of RBF-kernel SVCs and evaluate it.

    The number of boosting rounds is tuned with a 3-fold grid search scored on
    accuracy. The tuned model is then scored on the hold-out split and its
    confusion matrix is drawn as a heatmap.

    Parameters
    ----------
    X_train, y_train : features and labels used for the grid search.
    X_test, y_test : hold-out features and labels used for the reported accuracy.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV object, hold-out accuracy rounded to 3 decimals)``.
    """
    setSeed()
    abc_param_grid = {
        'n_estimators': [10, 20, 30],
    }

    # `algorithm` is left at the library default: explicitly requesting
    # 'SAMME.R' is deprecated in scikit-learn 1.4 and rejected from 1.6, while
    # on older releases the default is 'SAMME.R' anyway. probability=True is
    # kept so the base learner still offers predict_proba where that is needed.
    # No fixed n_estimators is set here because the grid below overrides it.
    abc_estimator = AdaBoostClassifier(SVC(kernel='rbf', probability=True), learning_rate=1.0)

    adaBoost_model = GridSearchCV(abc_estimator, abc_param_grid, cv=3, scoring='accuracy')

    # Train the AdaBoost classifier (fit returns the search object itself)
    adaBoost_model = adaBoost_model.fit(X_train, y_train)

    print("Best Parameters:\n", adaBoost_model.best_params_)

    # Predict the response for the hold-out split with the refitted best model
    y_pred = adaBoost_model.predict(X_test)

    accuracy = round(accuracy_score(y_test, y_pred), 3)
    print('AdaBoostClassifier model accuracy:', accuracy)

    cm_ab = confusion_matrix(y_test, y_pred)

    # A fresh figure per call, so repeated calls never draw onto the same axes.
    _, ax = plt.subplots()
    sns.heatmap(cm_ab, annot=True, ax=ax, fmt='g', cmap='Greens')

    # labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('AdaBoost classifier Confusion Matrix')
    labels = [0, 1]  # assumes a binary 0/1 target — TODO confirm
    ax.xaxis.set_ticklabels(labels)
    ax.yaxis.set_ticklabels(labels, rotation=360)

    return adaBoost_model, accuracy

In [None]:
def getAdaBoostClassifierModelrf(X_train, X_test, y_train, y_test):
    """Grid-search an AdaBoost ensemble of random forests and evaluate it.

    The number of boosting rounds is tuned with a 3-fold grid search scored on
    accuracy. The tuned model is then scored on the hold-out split and its
    confusion matrix is drawn as a heatmap.

    Parameters
    ----------
    X_train, y_train : features and labels used for the grid search.
    X_test, y_test : hold-out features and labels used for the reported accuracy.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV object, hold-out accuracy rounded to 3 decimals)``.
    """
    setSeed()
    abc_param_grid = {
        'n_estimators': [10, 20, 30],
    }

    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # later removed, while the first positional slot works on both.
    abc_estimator = AdaBoostClassifier(RandomForestClassifier(random_state=0))

    adaBoost_model = GridSearchCV(abc_estimator, abc_param_grid, cv=3, scoring='accuracy')

    # Train the AdaBoost classifier (fit returns the search object itself)
    adaBoost_model = adaBoost_model.fit(X_train, y_train)

    print("Best Parameters:\n", adaBoost_model.best_params_)

    # Predict the response for the hold-out split with the refitted best model
    y_pred = adaBoost_model.predict(X_test)

    accuracy = round(accuracy_score(y_test, y_pred), 3)
    print('AdaBoostClassifier model accuracy:', accuracy)

    cm_ab = confusion_matrix(y_test, y_pred)

    # A fresh figure per call, so repeated calls never draw onto the same axes.
    _, ax = plt.subplots()
    sns.heatmap(cm_ab, annot=True, ax=ax, fmt='g', cmap='Greens')

    # labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('AdaBoost classifier Confusion Matrix')
    labels = [0, 1]  # assumes a binary 0/1 target — TODO confirm
    ax.xaxis.set_ticklabels(labels)
    ax.yaxis.set_ticklabels(labels, rotation=360)

    return adaBoost_model, accuracy

In [None]:
# Target co1 (dataset 1): hard-voting ensemble on the PCA-reduced features.
co1_model_5, _ = votingModel(
    X_train=X_ds1_train_pca,
    X_test=X_ds1_test_pca,
    y_train=co1_train,
    y_test=co1_test,
)

In [None]:
# Predict co1 for the PCA-projected dataset-1 test samples and display the result.
co1_ds1_pred = co1_model_5.predict(test_data_1_pca)
co1_ds1_pred

In [None]:
# Target co2 (dataset 1): logistic regression fitted on the raw split
# (X_ds1_train, i.e. neither standardised nor PCA-projected).
# NOTE(review): presumably a deliberate choice after comparing models — confirm.
co2_model_1, _ = getLogisticRegressionModel(
    X_train=X_ds1_train,
    X_test=X_ds1_test,
    y_train=co2_train,
    y_test=co2_test,
)

In [None]:
# Predict co2 for the dataset-1 test samples and display the result.
# The raw test frame is used, matching the raw features co2_model_1 was fitted on.
co2_ds1_pred = co2_model_1.predict(test_data_1)
co2_ds1_pred

In [None]:
# Display the co3 predictions again; they were computed in an earlier cell
# (presumably repeated here so the targets read in order co1 .. co6).
co3_ds1_pred

In [None]:
# Target co4 (dataset 2): logistic regression on the PCA-reduced features.
co4_model_2, _ = getLogisticRegressionModel(
    X_train=X_ds2_train_pca,
    X_test=X_ds2_test_pca,
    y_train=co4_train,
    y_test=co4_test,
)

In [None]:
# Predict co4 for the PCA-projected dataset-2 test samples and display the result.
# NOTE(review): the variable is named *_ds1_pred although its input is test_data_2_pca.
co4_ds1_pred = co4_model_2.predict(test_data_2_pca)
co4_ds1_pred

In [None]:
# Target co5 (dataset 2): logistic regression fitted on the raw split
# (X_ds2_train, i.e. neither standardised nor PCA-projected).
# NOTE(review): presumably a deliberate choice after comparing models — confirm.
co5_model_1, _ = getLogisticRegressionModel(
    X_train=X_ds2_train,
    X_test=X_ds2_test,
    y_train=co5_train,
    y_test=co5_test,
)

In [None]:
# Predict co5 for the dataset-2 test samples and display the result.
# The raw test frame is used, matching the raw features co5_model_1 was fitted on.
# NOTE(review): the variable is named *_ds1_pred although its input is test_data_2.
co5_ds1_pred = co5_model_1.predict(test_data_2)
co5_ds1_pred

In [None]:
# Target co6 (dataset 2): AdaBoost over decision stumps fitted on the raw split
# (X_ds2_train, i.e. neither standardised nor PCA-projected).
co6_model_6, _ = getAdaBoostClassifierModeldt(
    X_train=X_ds2_train,
    X_test=X_ds2_test,
    y_train=co6_train,
    y_test=co6_test,
)

In [None]:
# Predict co6 for the dataset-2 test samples and display the result.
# The raw test frame is used, matching the raw features co6_model_6 was fitted on.
# NOTE(review): the variable is named *_ds1_pred although its input is test_data_2.
co6_ds1_pred = co6_model_6.predict(test_data_2)
co6_ds1_pred

In [None]:
# Write every target's predictions into one submission file, in the order
# co1 .. co6, with a single running Id across all targets.
prediction_sets = [
    co1_ds1_pred,
    co2_ds1_pred,
    co3_ds1_pred,
    co4_ds1_pred,
    co5_ds1_pred,
    co6_ds1_pred,
]

# The loop names are deliberately distinct from co1 .. co6: reusing those names
# here would overwrite the label arrays and break a re-run of the earlier cells.
i = 0
with open('/kaggle/working/predictions.csv', 'w') as f:
    f.write('Id,Predicted\n')
    for target_predictions in prediction_sets:
        for predicted_label in target_predictions:
            f.write('{},{}\n'.format(i, int(predicted_label)))
            i += 1