In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys

import csv
import datetime
import itertools
import numpy as np
import pandas as pd
import random
import re
import sklearn
import time
from collections import defaultdict

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling: white background with grid lines and a 14pt axes title.
sns.set_style("whitegrid")
sns.set_context(rc={"axes.titlesize": 14})
# Set the default figure size through matplotlib directly. Recent seaborn releases
# only honour context-scaling keys in the `rc` argument of set_context(), so a
# "figure.figsize" entry passed there may be dropped silently.
plt.rcParams["figure.figsize"] = (16, 10)

from IPython.display import HTML, Image, display

# Stretch the notebook container to the full browser width.
# A bare HTML(...) expression is only rendered when it is the LAST expression of a
# cell; in the middle of a cell its value is discarded, so display() is required.
display(HTML("<style>.container { width:100% !important; }</style>"))

from os.path import expanduser

# Make the `datsci` checkout under the user's home directory importable by
# inserting its path at position 1 of sys.path, then pull in the helper modules.
sys.path.insert(1, '{}/datsci'.format(expanduser('~')))
from datsci import eda, munge, ml, kaggle as kg

In [2]:
# Full-width notebook CSS; as the cell's final expression the HTML object is rendered.
HTML(
    "<style>"
    ".container { width:100% !important; }"
    "</style>"
)

In [3]:
# Data file locations, relative to the notebook's working directory.
FILE_TRAIN = 'data/train.csv'
FILE_TRAIN_DEDUP = 'data/train.dedup.csv'  # presumably a de-duplicated training set -- not used in this section
FILE_TEST = 'data/test.csv'
FILE_SAMPLE_SUBMIT = 'data/sample_submission.csv'

In [30]:
# Load the test-set CSV, using its 'ID' column as the DataFrame index.
df_test = pd.read_csv(filepath_or_buffer=FILE_TEST, index_col='ID')

## First submission - no processing

In [4]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18; the
# old sklearn.cross_validation module was removed in 0.20. Fall back to the legacy
# location only when the modern one is unavailable.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Read in the training data, indexed by the 'ID' column
df = pd.read_csv(FILE_TRAIN, index_col='ID')

# Separate the target column from the feature columns
target_col = 'TARGET'
feature_cols = [col for col in df.columns if col != target_col]

X_all = df[feature_cols]  # feature values for all rows
y_all = df[target_col]    # target value for all rows

# Hold out 30 percent of the rows for evaluation. `stratify=y_all` keeps the
# class proportions of the target the same in both splits; random_state pins
# the shuffle so the split is reproducible.
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

In [16]:
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier as SGDClf
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.svm import SVC


# Hyper-parameters common to both SGD baselines; only the loss function differs.
# NOTE(review): later scikit-learn releases replaced `n_iter` with `max_iter` --
# confirm against the installed version before re-running.
_sgd_shared_params = dict(
    penalty='l2', alpha=0.0001, l1_ratio=0.15,
    n_iter=5, shuffle=True, n_jobs=1, random_state=0, learning_rate='optimal',
    power_t=0.5, class_weight=None, warm_start=False, average=False,
)

# SGD with linear svm (hinge loss)
sgdclf_svm = SGDClf(loss='hinge', **_sgd_shared_params)

# SGD with logistic regression (log loss)
sgdclf_logistic = SGDClf(loss='log', **_sgd_shared_params)

# (description, classifier) pairs to evaluate. The commented entries are further
# candidates that are currently disabled.
descriptions_clfs = [
    ("SGD linear svm", sgdclf_svm),
    ("SGD logistic", sgdclf_logistic),
    # ("SVC Linear kernel", SVC(C=1.0, kernel='linear', gamma='auto')),
    # ("SVC polynomial deg 2 kernel", SVC(C=1.0, kernel='poly', degree=2, gamma='auto')),
    # ("SVC polynomial deg 3 kernel", SVC(C=1.0, kernel='poly', degree=3, gamma='auto')),
    # ("SVC rbf kernel", SVC(C=1.0, kernel='rbf', gamma='auto')),
    # ("KNeighbors, 3 neighbors", KNC(n_neighbors=3, weights='uniform')),
    # ("RandomForest, 10 estimators", RFC(n_estimators=10, max_depth=None, min_samples_split=2, n_jobs=2)),
    # ("LogisticRegression", LogisticRegression(C=1.0, penalty='l2', random_state=0, multi_class='ovr', n_jobs=4)),
    # ("GradientBoostingClassifier", GBC(loss='deviance', learning_rate=0.1, n_estimators=10, max_depth=None, min_samples_split=2)),
    # ("AdaBoostClassifier w SVC linear kernel", ABC(SVC(C=1.0, kernel='linear', gamma='auto'), n_estimators=10, learning_rate=1.0, algorithm='SAMME')),
]

In [6]:
# Run every (description, classifier) pair through the project helper
# ml.train_predict with F1 as the scoring function. Judging by the result table
# shown further down, it reports train/test scores and timings per classifier.
no_processing_prelim_results = ml.train_predict(
    descriptions_clfs,
    X_train, y_train,
    X_test, y_test,
    scoring=f1_score,
)

In [7]:
# Display the baseline result table (rendered because it is the cell's last expression).
# NOTE(review): the recorded F1 scores are close to zero for both classifiers -- worth
# checking whether the models predict almost only one class.
no_processing_prelim_results

Unnamed: 0,description,score_train,score_test,time_train,time_predict_train,time_predict_test
0,SGD linear svm,0.0,0.002183,0.378594,0.103345,0.047913
1,SGD logistic,0.0,0.002193,0.369095,0.108437,0.028139


In [23]:
# SGD with linear svm (hinge loss). `n_iter` is not set on the estimator here
# because it is the parameter being searched below.
sgdclf_svm = SGDClf(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                    shuffle=True, n_jobs=1, random_state=0, learning_rate='optimal',
                    power_t=0.5, class_weight=None, warm_start=False, average=False)

# Search over the candidate n_iter values with the project helper
# ml.fine_tune_params, scored with F1. The result is stored under SVM-specific
# names so that a later tuning cell cannot silently overwrite it.
best_score_svm, best_model_svm = ml.fine_tune_params(
    sgdclf_svm, X_train, y_train, X_test, y_test, {'n_iter': [10, 100]},
    n_runs=3, n_cv=5, scoring=f1_score, n_jobs=1)

# Keep the generic names bound as well: other cells read `best_score` / `best_model`.
best_score, best_model = best_score_svm, best_model_svm

iteration 0
Each iteration time(secs): 16.376
iteration 1
Each iteration time(secs): 17.869
iteration 2
Each iteration time(secs): 18.013


In [24]:
# Report the best score found and the n_iter value of the corresponding model.
print(best_score, best_model.n_iter)

0.00748362956034 10


In [None]:
# SGD with logistic regression (log loss). `n_iter` is not set on the estimator
# here because it is the parameter being searched below.
sgdclf_logistic = SGDClf(loss='log', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                         shuffle=True, n_jobs=1, random_state=0, learning_rate='optimal',
                         power_t=0.5, class_weight=None, warm_start=False, average=False)

# Search over the candidate n_iter values with the project helper
# ml.fine_tune_params, scored with F1. The result is stored under
# logistic-specific names so it cannot be confused with another tuning run.
best_score_logistic, best_model_logistic = ml.fine_tune_params(
    sgdclf_logistic, X_train, y_train, X_test, y_test, {'n_iter': [100, 500, 1000]},
    n_runs=3, n_cv=5, scoring=f1_score, n_jobs=1)

# Keep the generic names bound as well: other cells read `best_score` / `best_model`.
best_score, best_model = best_score_logistic, best_model_logistic

iteration 0


In [26]:
# Report the best score found and the n_iter value of the corresponding model.
# NOTE(review): `best_score` / `best_model` are assigned by both tuning cells, so this
# prints the result of whichever search ran last.
print(best_score, best_model.n_iter)

0.00427807486631 100


In [26]:
# NOTE(review): this cell repeats the preceding print cell verbatim (same execution
# count and output) -- candidate for removal.
print(best_score, best_model.n_iter)

0.00427807486631 100


In [35]:
# Predict on the test-set frame with `best_model` and hand the predictions to the
# project helper kg.save_submission together with the output path.
# NOTE(review): `best_model` is assigned by both tuning cells, so the model submitted
# here depends on which of them ran last -- confirm before submitting.
# NOTE(review): assumes df_test has the same feature columns as X_train -- TODO confirm.
kg.save_submission(best_model.predict(df_test), 'submissions/unprocessed.sgd.csv')