In [26]:
import numpy as np
import math as m
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from IPython.core.interactiveshell import InteractiveShell

from pandas.plotting import scatter_matrix

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

import graphviz
import lime
import xgboost as xgb
InteractiveShell.ast_node_interactivity = "all"

In [6]:
#############################################################
# Pipeline for train.csv
#############################################################
def pipeline(data):
    # Drop empty columns
    garbage = ['F25', 'F26', 'F27']
    data.drop(garbage, axis=1, inplace=True)
    
    # Drop columns with very low correlation to label
    low_corr = ['F20', 'F23', 'F21', 'F18', 'F1', 'F24', 
            'F11', 'F13', 'F2', 'F15', 'F8', 'F14', 'F22']
    data.drop(low_corr, axis=1, inplace=True)
    data.drop(['id'], axis=1, inplace=True)
    
    # Drop duplicate columns
    dups = ['F9', 'F12']
    data.drop(dups, axis=1, inplace=True)

    # F6
    for i in range(10):
        data_point = data['F6'].idxmax()
        data.drop([data_point], inplace=True)
    data.F6 = np.log(data.F6)

    # F16
    data = data[data['F16'] > 115000]
    data.F16 -= data.F16.min()
    data.F16 /= m.sqrt(data.F16.std())

    # F20
    #data = data[data.F20 != 12]
    
    # F3
    data.F3 += 1
    data.F3 = np.log(data.F3)
    
    # F4
    data = zeroMean(data, 'F4')
    
    # F5
    data = data[data.F5 < 180000]
    data.F5 -= data.F5.min()
    data.F5 /= m.sqrt(data.F5.std())
    
    # F7
    column = 'F7'
    data.loc[data[column] < 75000, column] = 1
    data.loc[(data[column] < 215000) & (data[column] > 2), column] = 2
    data.loc[data[column] > 215000, column] = 3
    
    # F10
    column = 'F10'
    data = data[data[column] < 200000]
    data = data[data[column] > 120000]
    data.F10 -= data.F10.min()
    data.F10 /= m.sqrt(data.F10.std())
    
    # F17
    column = 'F17'
    data.F17 -= data.F17.min()
    data.F17 /= m.sqrt(data[column].std())
    
    # F19
    data = data[data.F19 < 300000]
    data.F19 /= m.sqrt(data.F19.std())
    
    return data

#############################################################
# Pipeline for test.csv
#############################################################
def testPipeline(data):
     # Drop columns with very low correlation to label
    low_corr = ['F20', 'F23', 'F21', 'F18', 'F1', 'F24', 
            'F11', 'F13', 'F2', 'F15', 'F8', 'F14', 'F22']
    data.drop(low_corr, axis=1, inplace=True)
    data.drop(['id'], axis=1, inplace=True)
    
    # Drop duplicate columns
    dups = ['F9', 'F12']
    data.drop(dups, axis=1, inplace=True)

    # F6
    data.F6 = np.log(data.F6)

    # F16
    data.F16 -= data.F16.min()
    data.F16 /= m.sqrt(data.F16.std())

    # F20
    #data = data[data.F20 != 12]
    
    # F3
    data.F3 += 1
    data.F3 = np.log(data.F3)
    
    # F4
    data = zeroMean(data, 'F4')
    
    # F5
    data.F5 -= data.F5.min()
    data.F5 /= m.sqrt(data.F5.std())
    
    # F7
    column = 'F7'
    data.loc[data[column] < 75000, column] = 1
    data.loc[(data[column] < 215000) & (data[column] > 2), column] = 2
    data.loc[data[column] > 215000, column] = 3
    
    # F10
    data.F10 -= data.F10.min()
    data.F10 /= m.sqrt(data.F10.std())
    
    # F17
    column = 'F17'
    data.F17 -= data.F17.min()
    data.F17 /= m.sqrt(data[column].std())
    
    # F19
    data.F19 /= m.sqrt(data.F19.std())
    
    return data

#############################################################
# Writes a file for Kaggle Submission
#############################################################
def makeFile(pred, filename):
    new_index = np.arange(16384,32769,1)
    id_col = pd.DataFrame(new_index, columns=['id'], dtype='int32')
    y_hat = pd.DataFrame(pred, columns=['Y'])
    frames = [id_col, y_hat]
    pred = pd.concat(frames, axis=1)
    pred.to_csv(filename, encoding='utf-8', index=False)

def zeroMean(data, column):
    data[column] -= data[column].mean()
    data[column] /= m.sqrt(data[column].std())
    return data

# Part 2 Bootstrapping
I wanted to try to just bootstrap an enormous data set and see how it was able to fit the data.

In [8]:
filename = 'train.csv'
filepath = ''
data = pd.read_csv(filepath + filename)

In [9]:
new_data = pipeline(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
X_boot = np.zeros((10**6, new_data.shape[1]))

In [21]:
for i in range(X_boot.shape[0]):
    #seed = np.random.randint(0,100)
    rand = np.random.randint(0, new_data.shape[0])
    X_boot[i][:] = new_data.iloc[rand,:]

In [36]:
labels = X_boot[:,0]
features = X_boot[:,1:]

### Logistic Regression

In [30]:
accuracies = [] 
weights_L1 = []
for i in range(10):
    rand = np.random.randint(1, 100)
    X_train, X_test, y_train, y_test = train_test_split(features, labels)
    clf = LogisticRegression(penalty='l1', max_iter=1000, random_state=rand)
    _ = clf.fit(X_train, y_train)
    accuracies.append(clf.score(X_test, y_test))
    weights_L1.append(clf.coef_)

accuracies = np.array(accuracies)
print("Mean for L1 norm is: {}".format(np.mean(accuracies, axis=0)))
print("St Dev for L1 norm is: {}".format(np.std(accuracies, axis=0)))
print('')

Mean for L1 norm is: 0.9414912
St Dev for L1 norm is: 0.00041889683694198987



In [31]:
_ = clf.fit(features, labels)

In [35]:
test_features = getTestFeatures()
preds = clf.predict(test_features)
pred = makeSubmission(preds)
filename = 'prediction_bootstrap_log.csv'
pred.to_csv(filename, encoding='utf-8', index=False)

Logistic Regression doesnt seem to work very well ever on this data set. I'm going to retire it.

### Gradient Boosting

In [35]:
new_data.columns

Index(['Y', 'F3', 'F4', 'F5', 'F6', 'F7', 'F10', 'F16', 'F17', 'F19'], dtype='object')

In [38]:
clf = GradientBoostingClassifier(loss='exponential', learning_rate=0.1, 
                                 n_estimators=500, max_depth=6, subsample=0.5)
_ = clf.fit(features, labels)
test_data = pd.read_csv('test.csv')
test_data = testPipeline(test_data)
pred = clf.predict(test_data)
makeFile(pred, 'prediction-pipelined-bootstrapped-gradboost.csv')