In [3]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt

In [4]:
def load_in_data():
    x_tr = pd.read_csv('x_train.csv')
    y_tr = pd.read_csv('y_train.csv')
    x_cv = pd.read_csv('x_val.csv')
    y_cv = pd.read_csv('y_val.csv')
    x_te = pd.read_csv('x_test.csv')
    y_te = pd.read_csv('y_test.csv')
    return x_tr, y_tr, x_cv, y_cv, x_te, y_te

In [5]:
x_tr, y_tr, x_cv, y_cv, x_te, y_te = load_in_data()

In [6]:
x_tr.fillna(x_tr.mean(), inplace=True)
x_cv.fillna(x_cv.mean(), inplace=True)
x_te.fillna(x_te.mean(), inplace=True)

In [7]:
def plot_learning_curve(estimator, x_tr, y_tr):
    train_sizes, train_scores, test_scores = \
        learning_curve(estimator, x_tr, y_tr, train_sizes = np.linspace(0.2, 1, 20))
    
    plt.plot(train_sizes, -train_scores.mean(1), 'o-', label = str(estimator.__class__.__name__) + ' train')
    plt.plot(train_sizes, -test_scores.mean(1), 'o-', label = str(estimator.__class__.__name__) + ' test')
    plt.legend(loc="best")

In [8]:
def do_stuff(model, x_tr, y_tr, x_cv, y_cv):
    clf = OneVsRestClassifier(model).fit(x_tr, y_tr)
    predict_tr = clf.predict(x_tr).reshape((-1, 1))
    predict_cv = clf.predict(x_cv).reshape((-1, 1))
    print(np.mean(predict_tr == y_tr))
    print(np.mean(predict_cv == y_cv))
    #plot_learning_curve(clf, x_tr, y_tr)
    return clf

In [9]:
def logistic_model(x_tr, y_tr, x_cv, y_cv):
    log = linear_model.LogisticRegression(max_iter = 10000, penalty = 'l2')
    return do_stuff(log, x_tr, y_tr, x_cv, y_cv)

def svm_model(x_tr, y_tr, x_cv, y_cv):
    svc = svm.SVC(kernel = 'poly', max_iter = 10000)
    return do_stuff(svc, x_tr, y_tr, x_cv, y_cv)
    
def rf_model(x_tr, y_tr, x_cv, y_cv):
    rf = ensemble.RandomForestClassifier(max_depth=2, random_state=0)
    return do_stuff(rf, x_tr, y_tr, x_cv, y_cv)

In [10]:
logistic_model(x_tr, y_tr, x_cv, y_cv)

0    0.560871
dtype: float64
0    0.510549
dtype: float64


OneVsRestClassifier(estimator=LogisticRegression(max_iter=10000))

In [None]:
svm_model(x_tr, y_tr, x_cv, y_cv)

In [12]:
rf_model(x_tr, y_tr, x_cv, y_cv)

0    0.483047
dtype: float64
0    0.486076
dtype: float64


OneVsRestClassifier(estimator=RandomForestClassifier(max_depth=2,
                                                     random_state=0))

In [11]:
clf = logistic_model(x_tr, y_tr, x_cv, y_cv)
predict_te = clf.predict(x_te).reshape((-1, 1))
np.mean(predict_te == y_te)

0    0.560871
dtype: float64
0    0.510549
dtype: float64


0    0.547679
dtype: float64