In [34]:
# Try out basic techniques for the A

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV 
from sklearn.utils import shuffle

# Training CSV: split_data_set() takes its 'label' column as the target and
# every other column as a feature.
INPUT_PATH = "../inputs/train.csv"
# Test CSV: split_data_set() drops its first column and uses the remaining
# columns as features for prediction.
TEST_PATH = "../inputs/test.csv"


def read_data(path, header=1):
    """Load the CSV file at ``path`` into a pandas DataFrame.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        header: Accepted for call compatibility only. It is NOT forwarded to
            ``pd.read_csv``, so the file is always parsed with pandas'
            default header handling.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path)
    return frame


def split_data_set(random_state=None):
    """Load the train and test CSVs and return them as NumPy arrays.

    The training rows are shuffled and all of them are used for training;
    no validation rows are held out here (cross-validation is left to the
    caller).

    Args:
        random_state: Optional seed forwarded to ``sklearn.utils.shuffle``.
            The default ``None`` keeps the original unseeded shuffle; pass an
            int to make the row order reproducible.

    Returns:
        A ``(train_X, test_X, train_y)`` tuple. Note the order: the test
        features come second and the training labels last.
        ``train_X`` holds every training column except ``'label'``,
        ``train_y`` holds the ``'label'`` column, and ``test_X`` holds the
        test file without its first column.
    """
    data_set = read_data(INPUT_PATH)
    test_data = read_data(TEST_PATH)
    data_set = shuffle(data_set, random_state=random_state)

    # Use the shuffled frame directly. The previous np.split(data_set, [0], 0)
    # produced an empty first piece and the full frame as the second, and it
    # only worked on a DataFrame through the deprecated DataFrame.swapaxes.
    train_y = np.asarray(data_set['label'])
    train_X = np.asarray(data_set.loc[:, data_set.columns != 'label'])

    # The first test column is dropped -- presumably an id column; confirm
    # against the test file's schema.
    test_X = np.asarray(test_data)[:, 1:]

    return train_X, test_X, train_y


def _write_predictions(predictions, path='out.csv'):
    """Write ``predictions`` to ``path`` as an ``Id,Prediction`` CSV (ids start at 1)."""
    rows = np.c_[np.arange(1, len(predictions) + 1), predictions]
    # Passing a path lets savetxt open and close the file itself. The header=
    # argument with comments='' writes the plain "Id,Prediction" first line.
    np.savetxt(path, rows, fmt='%d', delimiter=',',
               header='Id,Prediction', comments='')


def try_classifier(clf, params={}, search='grid'):
    """Tune ``clf`` with a hyper-parameter search and write test predictions.

    Loads the data via ``split_data_set()``, fits a ``GridSearchCV`` or
    ``RandomizedSearchCV`` around ``clf`` on the training data, prints the
    best cross-validation score and parameters, and writes the predictions
    for the test set to ``out.csv`` (overwriting any existing file).

    Args:
        clf: The estimator to tune.
        params: Parameter grid (``search='grid'``) or parameter distributions
            (``search='random'``). The default empty dict is passed to the
            search unchanged. It is only read, never mutated, so the mutable
            default is kept to preserve the signature. ``None`` makes the
            call a no-op.
        search: ``'grid'`` or ``'random'``. Any other value makes the call a
            no-op, as before.

    Returns:
        None.
    """
    # Preserve the original silent no-op paths, but make them explicit.
    if params is None or search not in ('grid', 'random'):
        return

    train_X, test_X, train_y = split_data_set()

    if search == 'grid':
        title, tag = "Grid", "GridSearch"
    else:
        title, tag = "Random", "RandomSearch"

    print("\n\n%s Search Performed on %s" % (title, type(clf).__name__))
    print("%s Search params are: %s" % (title, params))

    if search == 'grid':
        searcher = GridSearchCV(clf, param_grid=params)
    else:
        searcher = RandomizedSearchCV(clf, param_distributions=params)
    searcher.fit(train_X, train_y)

    print("Cross Validation Score: ", searcher.best_score_)
    print("%s Best Params: " % tag, searcher.best_params_)

    _write_predictions(searcher.predict(test_X))


In [35]:
from sklearn.linear_model import LogisticRegression

# No parameter optimization: try_classifier's default params ({}) gives the
# search an empty grid, so only the estimator's default settings are
# cross-validated. `clf` is reused by the grid-search cell that follows.
clf = LogisticRegression()
try_classifier(clf)



Grid Search Performed on LogisticRegression
Grid Search params are: {}
Cross Validation Score:  0.504
GridSearch Best Params:  {}


In [37]:
# Grid search over C = 1e-7, 1e-6, 1e-5. Each penalty is paired only with the
# solvers listed alongside it.
c_values = np.power(10.0, np.arange(-7, -4))
l2_grid = {
    'C': c_values,
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'penalty': ['l2'],
}
l1_grid = {
    'C': c_values,
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1'],
}
grid = [l2_grid, l1_grid]
try_classifier(clf, grid)



Grid Search Performed on LogisticRegression
Grid Search params are: [{'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'penalty': ['l2'], 'C': array([  1.00000000e-07,   1.00000000e-06,   1.00000000e-05])}, {'solver': ['liblinear', 'saga'], 'penalty': ['l1'], 'C': array([  1.00000000e-07,   1.00000000e-06,   1.00000000e-05])}]
Cross Validation Score:  0.631
GridSearch Best Params:  {'solver': 'newton-cg', 'penalty': 'l2', 'C': 1.0000000000000001e-05}


In [41]:
from sklearn.gaussian_process import GaussianProcessClassifier
# Gaussian process classifier with its default settings. try_classifier's
# default empty grid means no hyper-parameters are tuned here.
clf = GaussianProcessClassifier()
try_classifier(clf)



Grid Search Performed on GaussianProcessClassifier
Grid Search params are: {}
Cross Validation Score:  0.1
GridSearch Best Params:  {}
