In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # typically done at the start of the script
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso


from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import KFold
# from sklearn.model_selection import cross_val_score
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.decomposition import PCA
# from sklearn.neural_network import MLPClassifier
# import seaborn as sns # for visualiation
# from sklearn.datasets import make_blobs
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import confusion_matrix






In [2]:
## wrangled data
data = pd.read_csv('./data/funding_and_scores.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,State,year,Number.of.districts,Minimum,Maximum,Mean,score
0,1,Alabama,1995,127,5691657.6,405581866.2,41016630.0,0.701875
1,2,Alabama,2000,128,7786867.2,551756732.8,53738210.0,0.695625
2,3,Alabama,2005,131,6607702.2,661439354.4,54638430.0,0.000294
3,4,Alabama,2010,132,6317688.3,631356264.0,62901270.0,0.000282
4,5,Alabama,2014,136,5017000.0,580513000.0,56261820.0,0.000281


In [4]:
# Standardize features
def standardize(df, numeric_only=True):
    numeric = df.select_dtypes(include=['int64', 'float64'])
    
    # subtracy mean and divide by std
    df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()
    
    return df

def pre_process_data(df):
    print("Input shape:\t{}".format(df.shape))
        

    df = standardize(df)
    print("After standardization {}".format(df.shape))
        
    # create dummy variables for categoricals
    df = pd.get_dummies(df)
    print("After converting categoricals:\t{}".format(df.shape))
    

#     # match test set and training set columns
#     if enforce_cols is not None:
#         to_drop = np.setdiff1d(df.columns, enforce_cols)
#         to_add = np.setdiff1d(enforce_cols, df.columns)

#         df.drop(to_drop, axis=1, inplace=True)
#         df = df.assign(**{c: 0 for c in to_add})
    
    df.fillna(0, inplace=True)
    
    return df

In [5]:
## training vs testing data
test_data = data[data['year'] == 2014]
training_data = data[data['year'] != 2014]

In [7]:
train_feature = pre_process_data(data.drop(['State','score'], axis=1))
train_label = np.ravel(data.score)

Input shape:	(260, 6)
After standardization (260, 6)
After converting categoricals:	(260, 188)


In [9]:
train_feature.head()

Unnamed: 0.1,Unnamed: 0,year,Minimum,Maximum,Mean,Number.of.districts_1,"Number.of.districts_1,005","Number.of.districts_1,023","Number.of.districts_1,030","Number.of.districts_1,062",...,Number.of.districts_823,Number.of.districts_847,Number.of.districts_89,Number.of.districts_892,Number.of.districts_916,Number.of.districts_94,Number.of.districts_97,Number.of.districts_98,Number.of.districts_983,Number.of.districts_995
0,-1.722081,-1.439648,-0.148117,-0.362987,-0.202067,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1.708783,-0.705134,-0.141419,-0.328042,-0.161824,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1.695485,0.029381,-0.145189,-0.301822,-0.158976,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1.682187,0.763895,-0.146116,-0.309013,-0.132838,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1.668889,1.351506,-0.150273,-0.321168,-0.153841,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# ## Split training data to training and validation
# train_small_features, validation_features, train_small_outcome, validation_outcome = train_test_split(train_features, train_outcome, test_size = 0.2, random_state = 11)
# train_small_features.head()

In [11]:
clf = linear_model.Lasso(alpha=0.1)

In [12]:
### logistic regression ###
param_grid = {'fit_intercept': np.arange(1,2) }

In [13]:
clf.fit(train_feature, train_label)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [14]:
train_feature.head()

Unnamed: 0.1,Unnamed: 0,year,Minimum,Maximum,Mean,Number.of.districts_1,"Number.of.districts_1,005","Number.of.districts_1,023","Number.of.districts_1,030","Number.of.districts_1,062",...,Number.of.districts_823,Number.of.districts_847,Number.of.districts_89,Number.of.districts_892,Number.of.districts_916,Number.of.districts_94,Number.of.districts_97,Number.of.districts_98,Number.of.districts_983,Number.of.districts_995
0,-1.722081,-1.439648,-0.148117,-0.362987,-0.202067,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1.708783,-0.705134,-0.141419,-0.328042,-0.161824,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1.695485,0.029381,-0.145189,-0.301822,-0.158976,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1.682187,0.763895,-0.146116,-0.309013,-0.132838,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1.668889,1.351506,-0.150273,-0.321168,-0.153841,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
type(train_label)

numpy.ndarray

In [15]:
grid_search = GridSearchCV(Lasso(), param_grid, cv=10, scoring=make_scorer(log_loss))
grid_search.fit(train_feature, train_label)

ValueError: Unknown label type: (array([  7.01875000e-01,   6.95625000e-01,   2.93576389e-04,
         2.81770833e-04,   2.80555556e-04,   6.46250000e-01,
         6.40000000e-01,   2.65104167e-04,   2.62673611e-04,
         2.59375000e-04,   6.53750000e-01,   6.55000000e-01,
         2.70138889e-04,   2.67187500e-04,   2.69444444e-04,
         6.97500000e-01,   6.95000000e-01,   2.96701389e-04,
         2.93750000e-04,   2.93055556e-04,   6.28750000e-01,
         6.34375000e-01,   2.63888889e-04,   2.62673611e-04,
         2.59027778e-04,   6.71250000e-01]),)

In [None]:
# grid_search.fit(train_small_features, train_small_outcome)

In [None]:
knc = KNeighborsClassifier()

param_grid_knc = {'kneighborsclassifier__n_neighbors': np.arange(1,10)}

rfc = RandomForestClassifier()

param_grid_rfc = {'randomforestclassifier__n_estimators': np.arange(1,10)}

dtc = DecisionTreeClassifier()

param_grid_dtc = {'decisiontreeclassifier__max_depth': np.arange(1,10)}

nn = MLPClassifier() 

param_grid_clf = {'mlpclassifier__hidden_layer_sizes': np.arange(1,12),

                  'mlpclassifier__activation': ['identity', 'logistic', 'tanh', 'relu']}

nn.get_params().keys()

In [None]:
def run_model(model, param_grid, xtrain, ytrain, do_pca = False):
    if(do_pca == True):
        pca = PCA(n_components = 10)
        scaler = MinMaxScaler()
        pipe = make_pipeline(pca, model)
        grid = GridSearchCV(pipe,param_grid)
        grid.fit(xtrain, ytrain)
        grid.best_params_
        accuracy = grid.score(xtrain, ytrain)
        print(f"In-sample accuracy: {accuracy:0.2%}")
        return(grid)
    scaler = MinMaxScaler()
    pipe = make_pipeline(model)
    grid = GridSearchCV(pipe,param_grid)
    grid.fit(xtrain, ytrain)
    grid.best_params_
    accuracy = grid.score(xtrain, ytrain)
    print(f"In-sample accuracy: {accuracy:0.2%}")
    return(grid)

In [None]:
reg = linear_model.Lasso()

param_grid_reg = {'lasso__alpha': np.arange(0.05,0.2),
                  'lasso__copy_X': [True, False],
                 'lasso__fit_intercept': [True, False],
                 'lasso__max_iter': np.arange(500, 2000),
                 'lasso__normalize': [True, False],
                 'lasso__positive': [True, False],
                 'lasso__precompute': [True, False],
                 'lasso__selecton': ['cyclic'],
                 'lasso__warm_start': [True, False]}

In [None]:
run_model(reg, param_grid_reg, train_small_features, train_small_outcome, True)

In [None]:
from sklearn.linear_model import ElasticNetCV
regr = ElasticNetCV()
param_grid_regr = {'elasticnetcv__copy_X': [True, False],
                 'elasticnetcv__fit_intercept': [True, False],
                 'elasticnetcv__max_iter': np.arange(500, 2000),
                 'elasticnetcv__normalize': [True, False],
                 'elasticnetcv__cv': np.arange(5,10),
                  'elasticnetcv__random_state': np.arange(0,1),
                 'elasticnetcv__n_jobs': np.arange(1,5),
                 'elasticnetcv__selecton': ['cyclic']}

In [None]:
run_model(regr, param_grid_regr, train_small_features, train_small_outcome, do_pca = False)

In [None]:
est