In [1]:
# Try out basic techniques for the A

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV 
from sklearn.utils import shuffle

INPUT_PATH = "../inputs/train.csv"
TEST_PATH = "../inputs/test.csv"


def read_data(path, header=1):
    """Read the CSV file at ``path`` into a pandas DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pd.read_csv``.
    header : int, optional
        Accepted for interface compatibility only. It is NOT forwarded to
        ``pd.read_csv``, so the file is always parsed with pandas' default
        header handling regardless of the value given here.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_csv(path)
    return frame


def split_data_set(random_state=None):
    """Load the training and test CSVs and return them as NumPy arrays.

    The training rows are shuffled on every call. With the default
    ``random_state=None`` each call produces a different ordering, which is
    why repeated runs of the same search can report different
    cross-validation scores.

    Parameters
    ----------
    random_state : int, RandomState instance or None, optional
        Forwarded to ``sklearn.utils.shuffle``. Pass an int to make the
        shuffle (and therefore the downstream CV folds) reproducible.

    Returns
    -------
    train_X : numpy.ndarray
        Every column of the training file except ``'label'``.
    test_X : numpy.ndarray
        The test file with its first column removed.
    train_y : numpy.ndarray
        The ``'label'`` column of the training file.
    """
    train_frame = read_data(INPUT_PATH)
    test_frame = read_data(TEST_PATH)
    train_frame = shuffle(train_frame, random_state=random_state)

    # The previous implementation ran np.split(..., [0], 0) and then used only
    # the second part, i.e. the full frame; the shuffled frame is used
    # directly instead.
    train_y = np.asarray(train_frame['label'])
    train_X = np.asarray(train_frame.loc[:, train_frame.columns != 'label'])

    # Drop the first test column (presumably a row id, not a feature --
    # confirm against test.csv).
    test_X = np.asarray(test_frame)[:, 1:]

    return train_X, test_X, train_y


def try_classifier(clf, params={}, search='grid', out_path='out.csv'):
    """Tune ``clf`` with a cross-validated search and write test predictions.

    The data is loaded via ``split_data_set()``, the chosen search is fitted
    on the training set, the best CV score and parameters are printed, and
    the predictions for the test set are written to ``out_path`` as a CSV
    with the header ``Id,Prediction`` (ids start at 1). Any existing file at
    ``out_path`` is overwritten.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn estimator to tune.
    params : dict or list of dict, optional
        Parameter grid (``search='grid'``) or parameter distributions
        (``search='random'``). The default empty dict is never mutated here.
        ``None`` makes the call a no-op, as before.
    search : {'grid', 'random'}, optional
        ``'grid'`` uses ``GridSearchCV``; ``'random'`` uses
        ``RandomizedSearchCV``.
    out_path : str, optional
        Destination of the prediction CSV. Defaults to ``'out.csv'``.

    Raises
    ------
    ValueError
        If ``search`` is not ``'grid'`` or ``'random'`` (previously such a
        call silently did nothing).
    """
    if params is None:
        # Preserved behaviour: without a parameter spec nothing is searched.
        return

    # label used in the printed messages, search class, keyword for the params
    strategies = {
        'grid': ('Grid', GridSearchCV, 'param_grid'),
        'random': ('Random', RandomizedSearchCV, 'param_distributions'),
    }
    if search not in strategies:
        raise ValueError(
            "search must be 'grid' or 'random', got %r" % (search,))
    label, search_cls, params_kwarg = strategies[search]

    train_X, test_X, train_y = split_data_set()

    print("\n\n%s Search Performed on %s" % (label, type(clf).__name__))
    print("%s Search params are: %s" % (label, params))
    searcher = search_cls(clf, **{params_kwarg: params})
    searcher.fit(train_X, train_y)
    print("Cross Validation Score: ", searcher.best_score_)
    print("%sSearch Best Params: " % label, searcher.best_params_)

    # Given a path, np.savetxt opens and closes the file itself, so no handle
    # is leaked; comments='' keeps the header line free of a '# ' prefix.
    ids = np.arange(1, len(test_X) + 1)
    rows = np.c_[ids, searcher.predict(test_X)]
    np.savetxt(out_path, rows, fmt='%d', delimiter=',',
               header='Id,Prediction', comments='')


In [2]:
from sklearn.linear_model import LogisticRegression

# Baseline: no parameter optimization (try_classifier falls back to its empty
# default grid). Results received from an earlier run, before re-running:
# Grid Search Performed on LogisticRegression
# Grid Search params are: {}
# Cross Validation Score:  0.504
# GridSearch Best Params:  {}

clf = LogisticRegression()
try_classifier(clf)



Grid Search Performed on LogisticRegression
Grid Search params are: {}
('Cross Validation Score: ', 0.47999999999999998)
('GridSearch Best Params: ', {})


In [3]:
# Build the estimator in this cell rather than relying on whatever `clf`
# an earlier cell happened to leave in the kernel.
clf = LogisticRegression()

# Both penalty families are searched over the same regularisation strengths.
c_values = np.power(10.0, np.arange(-7, -4))

grid = [
    # l2 penalty: tried with all five solvers.
    {'C': c_values,
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
     'penalty': ['l2']},
    # l1 penalty: restricted to liblinear and saga (per the scikit-learn
    # LogisticRegression docs, the other solvers listed above do not accept l1).
    {'C': c_values,
     'solver': ['liblinear', 'saga'],
     'penalty': ['l1']},
]
try_classifier(clf, grid)

# Optimized before and received the following results
# Grid Search Performed on LogisticRegression
# Grid Search params are: [{'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'penalty': ['l2'], 'C': array([  1.00000000e-07,   1.00000000e-06,   1.00000000e-05])}, {'solver': ['liblinear', 'saga'], 'penalty': ['l1'], 'C': array([  1.00000000e-07,   1.00000000e-06,   1.00000000e-05])}]
# Cross Validation Score:  0.631
# GridSearch Best Params:  {'solver': 'newton-cg', 'penalty': 'l2', 'C': 1.0000000000000001e-05}



Grid Search Performed on LogisticRegression
Grid Search params are: [{'penalty': ['l2'], 'C': array([  1.00000000e-07,   1.00000000e-06,   1.00000000e-05]), 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}, {'penalty': ['l1'], 'C': array([  1.00000000e-07,   1.00000000e-06,   1.00000000e-05]), 'solver': ['liblinear', 'saga']}]
('Cross Validation Score: ', 0.623)
('GridSearch Best Params: ', {'penalty': 'l2', 'C': 9.9999999999999995e-07, 'solver': 'newton-cg'})


In [4]:
from sklearn.svm import LinearSVC, SVC
# Baseline: LinearSVC with its default hyper-parameters, evaluated through
# try_classifier's empty default grid (no tuning).
clf = LinearSVC()
try_classifier(clf)



Grid Search Performed on LinearSVC
Grid Search params are: {}
('Cross Validation Score: ', 0.51200000000000001)
('GridSearch Best Params: ', {})


In [5]:
# Fine-grained sweep of C for LinearSVC: 100 log-spaced values between
# 1e-7 and 1e-6, with both the dual and primal formulations.
clf = LinearSVC()
grid = dict(
    C=np.logspace(-7, -6, 100),
    loss=['squared_hinge'],
    penalty=['l2'],
    dual=[True, False],
)
try_classifier(clf, grid)



Grid Search Performed on LinearSVC
Grid Search params are: {'penalty': ['l2'], 'loss': ['squared_hinge'], 'C': array([  1.00000000e-07,   1.02353102e-07,   1.04761575e-07,
         1.07226722e-07,   1.09749877e-07,   1.12332403e-07,
         1.14975700e-07,   1.17681195e-07,   1.20450354e-07,
         1.23284674e-07,   1.26185688e-07,   1.29154967e-07,
         1.32194115e-07,   1.35304777e-07,   1.38488637e-07,
         1.41747416e-07,   1.45082878e-07,   1.48496826e-07,
         1.51991108e-07,   1.55567614e-07,   1.59228279e-07,
         1.62975083e-07,   1.66810054e-07,   1.70735265e-07,
         1.74752840e-07,   1.78864953e-07,   1.83073828e-07,
         1.87381742e-07,   1.91791026e-07,   1.96304065e-07,
         2.00923300e-07,   2.05651231e-07,   2.10490414e-07,
         2.15443469e-07,   2.20513074e-07,   2.25701972e-07,
         2.31012970e-07,   2.36448941e-07,   2.42012826e-07,
         2.47707636e-07,   2.53536449e-07,   2.59502421e-07,
         2.65608778e-07,   2.7185

In [6]:
# Baseline: SVC with its default hyper-parameters and no tuning.
# NOTE(review): the recorded output for this cell shows a much lower CV score
# (0.105) than the linear-kernel search in the next cell (0.604) -- presumably
# the default kernel is a poor fit for this data; confirm.
clf = SVC()
try_classifier(clf)



Grid Search Performed on SVC
Grid Search params are: {}
('Cross Validation Score: ', 0.105)
('GridSearch Best Params: ', {})


In [7]:
# Build the estimator in this cell rather than relying on the `clf` left
# behind by the previous cell.
clf = SVC()

# Dense linear sweep (400 points) of C for a linear-kernel SVC. The bounds
# are hard-coded; presumably they were narrowed down by earlier searches.
grid = {'C': np.linspace(0.00000080133703895174385, 0.00000256399, 400),
    'kernel': ['linear'],
    'shrinking': [True]}
try_classifier(clf, grid)



Grid Search Performed on SVC
Grid Search params are: {'kernel': ['linear'], 'C': array([  8.01337039e-07,   8.05754716e-07,   8.10172392e-07,
         8.14590069e-07,   8.19007745e-07,   8.23425422e-07,
         8.27843099e-07,   8.32260775e-07,   8.36678452e-07,
         8.41096128e-07,   8.45513805e-07,   8.49931481e-07,
         8.54349158e-07,   8.58766835e-07,   8.63184511e-07,
         8.67602188e-07,   8.72019864e-07,   8.76437541e-07,
         8.80855218e-07,   8.85272894e-07,   8.89690571e-07,
         8.94108247e-07,   8.98525924e-07,   9.02943601e-07,
         9.07361277e-07,   9.11778954e-07,   9.16196630e-07,
         9.20614307e-07,   9.25031984e-07,   9.29449660e-07,
         9.33867337e-07,   9.38285013e-07,   9.42702690e-07,
         9.47120367e-07,   9.51538043e-07,   9.55955720e-07,
         9.60373396e-07,   9.64791073e-07,   9.69208750e-07,
         9.73626426e-07,   9.78044103e-07,   9.82461779e-07,
         9.86879456e-07,   9.91297132e-07,   9.95714809e-07,
  

('Cross Validation Score: ', 0.60399999999999998)
('GridSearch Best Params: ', {'kernel': 'linear', 'C': 1.4419001450971002e-06, 'shrinking': True})


In [8]:
# ('Cross Validation Score: ', 0.624)
# ('GridSearch Best Params: ', SVC{'kernel': 'linear', 'C': 1.967603659795703e-06, 'shrinking': True})
# Yielded 66.250 percent testing score

In [9]:
from sklearn.neural_network import MLPClassifier as MLP

# Baseline: MLPClassifier (imported as MLP) with its default
# hyper-parameters, evaluated through try_classifier's empty default grid.
clf = MLP()
try_classifier(clf)



Grid Search Performed on MLPClassifier
Grid Search params are: {}
('Cross Validation Score: ', 0.498)
('GridSearch Best Params: ', {})


In [10]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# Sweep the MultinomialNB smoothing parameter over 1000 log-spaced values
# from 1e-10 to 1e5, with and without fitted class priors.
grid = dict(
    alpha=np.logspace(-10, 5, 1000),
    fit_prior=[True, False],
)
clf = MultinomialNB()
try_classifier(clf, grid)



Grid Search Performed on MultinomialNB
Grid Search params are: {'alpha': array([  1.00000000e-10,   1.03517796e-10,   1.07159340e-10,
         1.10928986e-10,   1.14831241e-10,   1.18870770e-10,
         1.23052400e-10,   1.27381132e-10,   1.31862140e-10,
         1.36500781e-10,   1.41302599e-10,   1.46273336e-10,
         1.51418933e-10,   1.56745541e-10,   1.62259529e-10,
         1.67967487e-10,   1.73876240e-10,   1.79992851e-10,
         1.86324631e-10,   1.92879151e-10,   1.99664245e-10,
         2.06688025e-10,   2.13958887e-10,   2.21485523e-10,
         2.29276931e-10,   2.37342425e-10,   2.45691646e-10,
         2.54334576e-10,   2.63281547e-10,   2.72543253e-10,
         2.82130768e-10,   2.92055551e-10,   3.02329468e-10,
         3.12964801e-10,   3.23974263e-10,   3.35371015e-10,
         3.47168682e-10,   3.59381366e-10,   3.72023668e-10,
         3.85110700e-10,   3.98658107e-10,   4.12682085e-10,
         4.27199397e-10,   4.42227398e-10,   4.57784054e-10,
         4

('Cross Validation Score: ', 0.59399999999999997)
('GridSearch Best Params: ', {'alpha': 212.48453524988784, 'fit_prior': True})


In [14]:
from sklearn.neighbors import KNeighborsClassifier
# A single fixed KNN configuration, evaluated ten times. try_classifier
# reshuffles the training data on every call, so the repeated runs show how
# much the CV score varies between shuffles.
grid = dict(
    n_neighbors=[10],
    weights=['uniform'],
    algorithm=['auto'],
    p=[1],
    leaf_size=[3],
)
for ii in range(10):
    clf = KNeighborsClassifier()
    try_classifier(clf, grid)



Grid Search Performed on KNeighborsClassifier
Grid Search params are: {'n_neighbors': [10], 'weights': ['uniform'], 'leaf_size': [3], 'algorithm': ['auto'], 'p': [1]}
('Cross Validation Score: ', 0.58899999999999997)
('GridSearch Best Params: ', {'n_neighbors': 10, 'weights': 'uniform', 'leaf_size': 3, 'algorithm': 'auto', 'p': 1})


Grid Search Performed on KNeighborsClassifier
Grid Search params are: {'n_neighbors': [10], 'weights': ['uniform'], 'leaf_size': [3], 'algorithm': ['auto'], 'p': [1]}
('Cross Validation Score: ', 0.58899999999999997)
('GridSearch Best Params: ', {'n_neighbors': 10, 'weights': 'uniform', 'leaf_size': 3, 'algorithm': 'auto', 'p': 1})


Grid Search Performed on KNeighborsClassifier
Grid Search params are: {'n_neighbors': [10], 'weights': ['uniform'], 'leaf_size': [3], 'algorithm': ['auto'], 'p': [1]}
('Cross Validation Score: ', 0.60399999999999998)
('GridSearch Best Params: ', {'n_neighbors': 10, 'weights': 'uniform', 'leaf_size': 3, 'algorithm': 'auto', '

In [52]:
from  sklearn.ensemble import RandomForestClassifier

# Random forest with 1000 trees: search over the feature-subset rule and ten
# log-spaced values (1e-5 .. 1e-1) each for the split and leaf size limits.
grid = dict(
    n_estimators=[1000],
    criterion=['gini'],
    max_features=['sqrt', 'log2', None],
    min_samples_split=np.logspace(-5, -1, 10),
    min_samples_leaf=np.logspace(-5, -1, 10),
)
clf = RandomForestClassifier(n_jobs=-1)
try_classifier(clf, grid)



Grid Search Performed on RandomForestClassifier
Grid Search params are: {'n_estimators': [1000], 'max_features': ['sqrt', 'log2', None], 'min_samples_split': array([  1.00000000e-05,   2.78255940e-05,   7.74263683e-05,
         2.15443469e-04,   5.99484250e-04,   1.66810054e-03,
         4.64158883e-03,   1.29154967e-02,   3.59381366e-02,
         1.00000000e-01]), 'criterion': ['gini'], 'min_samples_leaf': array([  1.00000000e-05,   2.78255940e-05,   7.74263683e-05,
         2.15443469e-04,   5.99484250e-04,   1.66810054e-03,
         4.64158883e-03,   1.29154967e-02,   3.59381366e-02,
         1.00000000e-01])}
('Cross Validation Score: ', 0.629)
('GridSearch Best Params: ', {'max_features': 'sqrt', 'min_samples_split': 7.7426368268112782e-05, 'n_estimators': 1000, 'criterion': 'gini', 'min_samples_leaf': 2.782559402207126e-05})


In [None]:
# Grid Search Performed on RandomForestClassifier
# Grid Search params are: {'n_estimators': [10, 100, 1000]}
# ('Cross Validation Score: ', 0.623)
# ('GridSearch Best Params: ', {'n_estimators': 1000})

In [1]:
from sklearn.ensemble import VotingClassifier
# Imported here as well so this cell also runs on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# Candidate estimators (presumably members for a VotingClassifier -- the
# ensemble itself is not built in this cell). The logistic-regression
# settings match the best parameters recorded in the comments of the
# earlier grid-search cell.
classifiers = [
    # The solver name must be a string: unquoted, `newton-cg` is parsed as
    # the subtraction `newton - cg` and raises NameError.
    LogisticRegression(solver='newton-cg', penalty='l2', C=1.0000000000000001e-05),
]