In [57]:
#Import block:
import nltk, re, pprint
import numpy as np 
import pandas as pd
import csv
import time
from timeit import *
#Metrics and testing:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
#Models:
from sklearn import linear_model 

from scipy import stats
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn import neighbors
import heapq

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import metrics

from sklearn import svm
from sklearn.model_selection import GridSearchCV


In [2]:
#Reading in data:
path = "."
#local paths
csv_path = path + "/background.csv"
train_path = path + "/train.csv"
meta_path = path + "/FFMetadata20180221.csv"
pred_path = path + "/prediction.csv"

In [3]:
background_csv = pd.read_csv(csv_path, low_memory=False)
train_csv = pd.read_csv(train_path)
meta_csv = pd.read_csv(meta_path, low_memory=False)

# Fix date bug
background_csv.cf4fint = ((pd.to_datetime(background_csv.cf4fint) - pd.to_datetime('1960-01-01')) / np.timedelta64(1, 'D')).astype(int)

In [4]:
#Examine Information
print(background_csv.shape)
print background_csv.info(), "\n"
print(train_csv.shape)
print(train_csv.info())
#note that object tends to be any data type mixed with NA/str/etc.

(4242, 13027)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4242 entries, 0 to 4241
Columns: 13027 entries, challengeID to k5f1
dtypes: float64(544), int64(12368), object(115)
memory usage: 421.6+ MB
None 

(2121, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2121 entries, 0 to 2120
Data columns (total 7 columns):
challengeID         2121 non-null int64
gpa                 1165 non-null float64
grit                1418 non-null float64
materialHardship    1459 non-null float64
eviction            1459 non-null float64
layoff              1277 non-null float64
jobTraining         1461 non-null float64
dtypes: float64(6), int64(1)
memory usage: 116.1 KB
None


In [5]:
#remove constant variables
constants = np.genfromtxt(path + "/constantVariables.txt", dtype=str)
variables = background_csv.axes[1]
not_constants = [i for i, v in enumerate(variables) if v not in constants]
background_trim = background_csv.iloc[:,not_constants]
print(background_trim.shape)
background_trim.info()

(4242, 10595)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4242 entries, 0 to 4241
Columns: 10595 entries, challengeID to k5f1
dtypes: float64(541), int64(9940), object(114)
memory usage: 342.9+ MB


In [8]:
na_bool = background_trim[background_trim.axes[1][0]].isna()
print(type(na_bool))
#background_trim[background_trim.axes[1][0]]

<class 'pandas.core.series.Series'>


In [7]:
#Find variables that are 80% or more neg / NA (missing/blank) values
variables = list(background_trim.axes[1])
toRemove = []
count = 0 
for var in variables:
    na_bool = background_trim[var].isna()
    na_count = sum(na_bool)
    
    neg_bool = background_trim[var].lt(0)
    neg_count = neg_bool.sum(skipna=True)
    
    total = len(na_bool)
    if float(na_count + neg_count) / float(total) > 0.80:
        count += 1
        toRemove.append(var)
print(count)

5336


In [8]:
#Remove variables that are 80% or more neg / NA (missing/blank) values
background_trim_2 = background_trim.copy(deep=True)
print(background_trim_2.shape)
background_trim_2.drop(toRemove, axis=1, inplace=True)
print(background_trim_2.shape)

(4242, 10595)
(4242, 5259)


In [9]:
#Filtering for constructed variables (most of which begin with 'c'):
background_c = background_trim_2.filter(regex = '^c', axis = 1)
print(background_c.shape)
background_c.info()

(4242, 484)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4242 entries, 0 to 4241
Columns: 484 entries, challengeID to cf5samp
dtypes: float64(93), int64(384), object(7)
memory usage: 15.7+ MB


In [10]:
#Determine how many na/nan, negative values there are in constructed variable set

counts = background_c.count(1).values
total = background_c.shape[0]*background_c.shape[1] #background_c.size
nan_count = total - sum(counts)

print "Total: ", total
print "NaN: ", nan_count
print "NaN Fraction:", float(nan_count)/float(total)

counts = background_c._get_numeric_data()
#print(counts.shape)
neg_count = counts.lt(0).sum(skipna=True).sum()
print "Negative: ", neg_count
print "Negative Fraction:", float(neg_count)/float(total)

#stuff = background_c.loc[:,'cf5samp'].values
#stuff = [1 for x in stuff if x < 0]
#sum(stuff)

Total:  2053128
NaN:  10402
NaN Fraction: 0.00506641573248
Negative:  638907
Negative Fraction: 0.311187125206


In [11]:
#Collect metadata
meta_csv.info()
meta_csv.axes
meta_type = meta_csv.loc[:,['new_name','type']]
meta_type.iloc[0:10,:]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16782 entries, 0 to 16781
Columns: 134 entries, new_name to label58
dtypes: float64(59), int64(3), object(72)
memory usage: 17.2+ MB


Unnamed: 0,new_name,type
0,idnum,ID Number
1,cf1intmon,uc
2,cf1intyr,cont
3,cf1lenhr,cont
4,cf1lenmin,cont
5,cf1twoc,bin
6,cf1fint,bin
7,cf1natsm,bin
8,f1natwt,cont
9,cf1natsmx,bin


In [12]:
#Check if there are any variables in background.csv that are not in metadata
meta_dict = {}
for i in range(meta_type.shape[0]):
    meta_dict[meta_type['new_name'][i]] = meta_type['type'][i]

count = 0
for var in background_csv.axes[1]:
    if var not in meta_dict:
        count += 1

print(count)

26


In [13]:
#make a deep copy of constructed variables df
background_c_imputed = background_c.copy(deep=True)

In [14]:
#Fill in Missing Data in Constructed Variable Set (isna / lt(0))
variables = list(background_c_imputed.axes[1][1:])
#print(background_c_imputed[variables[0]])

toRemove = []
for i, var in enumerate(variables):
    #if i < 375:
    #    continue
    if i % 25 == 0:
        print "round" + str(i)
    if var in meta_dict:
        if meta_dict[var] == 'bin' or meta_dict[var] == 'uc':
            #print "bin"
            maximum = background_c_imputed.loc[:,var].max()
            _mode = background_c_imputed.loc[background_c_imputed.loc[:,var] >= 0,var].mode()[0]
            background_c_imputed.loc[:,var].fillna(_mode)
            
            #Transform Neg Entries to Unordered Categorical Values
            for i in range(len(list(background_c_imputed.loc[:,var]))):
                if background_c_imputed.loc[i,var] == -7 or background_c_imputed.loc[i,var] == -8:
                    background_c_imputed.loc[i,var] = maximum + 1
                elif background_c_imputed.loc[i,var] == -6:
                    background_c_imputed.loc[i,var] = maximum + 2
                elif background_c_imputed.loc[i,var] == -2:
                    background_c_imputed.loc[i,var] = maximum + 3
                elif background_c_imputed.loc[i,var] == -1:
                    background_c_imputed.loc[i,var] = maximum + 4
                elif background_c_imputed.loc[i,var] < 0:
                    background_c_imputed.loc[i,var] = _mode
                    
        elif meta_dict[var] == 'string':
            #print "string"
            #DIY Label-Encoding
            #print(var)
            
            #treat as a continuous variable if necessary
            #if background_c_imputed.loc[:,var].dtype == float
            
            unique_values = list(background_c_imputed.loc[:,var].unique())
            unique_strings = [x for x in unique_values if type(x) == str or x > 0]
            unique_dict = {}
            for i, x in enumerate(unique_strings):
                unique_dict[x] = i
                
            #DIY mode
            all_values = list(background_c_imputed.loc[:,var].values)
            all_strings = [x for x in all_values if type(x) == str or x > 0]
            _mode = stats.mode(all_strings)[0][0]
                
            if len(unique_strings) < 11:
                maximum = len(unique_strings) - 1
                background_c_imputed.loc[:,var].fillna(_mode)
                
                #Transform Neg/String Entires to Unordered Categorical Values
                for i in range(len(list(background_c_imputed.loc[:,var]))):
                    if background_c_imputed.loc[i,var] == -7 or background_c_imputed.loc[i,var] == -8:
                        background_c_imputed.loc[i,var] = maximum + 1
                    elif background_c_imputed.loc[i,var] == -6:
                        background_c_imputed.loc[i,var] = maximum + 2
                    elif background_c_imputed.loc[i,var] == -2:
                        background_c_imputed.loc[i,var] = maximum + 3
                    elif background_c_imputed.loc[i,var] == -1:
                        background_c_imputed.loc[i,var] = maximum + 4
                    elif background_c_imputed.loc[i,var] < 0:
                        background_c_imputed.loc[i,var] = unique_dict[_mode]
                    else:
                        background_c_imputed.loc[i,var] = unique_dict[background_c_imputed.loc[i,var]]
                
            else:
                toRemove.append(var)            
            
        elif meta_dict[var] == 'oc':
            #print "oc"
            _mode = background_c_imputed.loc[(background_c_imputed.loc[:,var] >= 0) & (background_c_imputed.loc[:,var] < 200), var].mode()[0]
            background_c_imputed.loc[:,var].fillna(_mode)
            
            #Transform Neg Entries to Mode Ordered Categorical Value
            for i in range(len(background_c_imputed[var])):
                if background_c_imputed.loc[i,var] < 0 or background_c_imputed.loc[i,var] >= 200:
                    background_c_imputed.loc[i,var] = _mode
            
            
        elif meta_dict[var] == 'cont':
            #print "cont"
            _mean = background_c_imputed.loc[background_c_imputed.loc[:,var] >= 0, var].mean()
            background_c_imputed.loc[:,var].fillna(_mean)
            
            #Transform Neg Entries to Mean Continuous Value
            for i in range(len(list(background_c_imputed.loc[:,var]))):
                if background_c_imputed.loc[i,var] < 0:
                    background_c_imputed.loc[i,var] = _mean
            
        else:
            print(meta_dict[var])
    else:
        #cannot discern type, set to mode
        print "nope"

round0
round25
round50
round75
round100
round125
round150
round175
round200
round225
round250
round275
round300
round325
round350
round375
round400
round425
round450
round475


Below we print out a brief subset of constructed variable data before imputation, and afterwards
to get a sense of how the data gets cleaner with respect to different varaiable data types.

In [15]:
meta = [meta_dict[var] for var in variables[0:10]]
print meta 
background_c.loc[0:10,variables[0:10]]

['cont', 'cont', 'bin', 'bin', 'cont', 'bin', 'bin', 'cont', 'cont', 'bin']


Unnamed: 0,cf1lenhr,cf1lenmin,cf1fint,cf1citsm,cf1age,cf1marm,cf1cohm,cf1adult,cf1kids,cf1gdad
0,-9,-9,0,-9,-9,-9,-9,-9,-9,-9
1,0,40,1,1,-3,0,0,1,0,0
2,0,45,1,1,24,1,0,5,1,1
3,0,45,1,1,24,0,1,2,0,0
4,-6,50,1,1,19,0,0,5,0,0
5,0,30,1,1,20,0,1,4,2,1
6,0,45,1,1,-3,0,1,3,0,0
7,-3,45,1,1,26,0,0,4,0,1
8,-3,55,1,1,24,1,0,2,1,0
9,0,45,1,1,34,0,1,2,0,0


In [22]:
background_c_imputed.loc[0:10,variables[0:10]]

Unnamed: 0,cf1lenhr,cf1lenmin,cf1fint,cf1citsm,cf1age,cf1marm,cf1cohm,cf1adult,cf1kids,cf1gdad
0,0.178152,36.753973,0,1,27.92677,0,0,2.404432,0.956615,0
1,0.0,40.0,1,1,27.92677,0,0,1.0,0.0,0
2,0.0,45.0,1,1,24.0,1,0,5.0,1.0,1
3,0.0,45.0,1,1,24.0,0,1,2.0,0.0,0
4,0.178152,50.0,1,1,19.0,0,0,5.0,0.0,0
5,0.0,30.0,1,1,20.0,0,1,4.0,2.0,1
6,0.0,45.0,1,1,27.92677,0,1,3.0,0.0,0
7,0.178152,45.0,1,1,26.0,0,0,4.0,0.0,1
8,0.178152,55.0,1,1,24.0,1,0,2.0,1.0,0
9,0.0,45.0,1,1,34.0,0,1,2.0,0.0,0


In [16]:
toRemove

['ch5dspr',
 'ch5ppvtae',
 'ch5ppvtpr',
 'ch5wj9pr',
 'ch5wj9ae',
 'ch5wj10pr',
 'ch5wj10ae',
 'ch5dsae']

In [17]:
#these guys are potentially annoying, they should be labelled continuous but they're strings
#it is difficult to determine if any particular row has this awkward labeling so let's just throw them out  
pattern = re.compile(".*pr$")
for key in meta_dict:
    if pattern.match(key):
        print key

ch5wj9pr
ch5dspr
ch5wj10pr
ch5ppvtpr


In [18]:
print(background_c_imputed.shape)
background_c_imputed.drop(toRemove, axis=1, inplace=True)
print(background_c_imputed.shape)

(4242, 484)
(4242, 476)


In [22]:
#extract unordered categorical variables (uc, bin, string) and cont/oc vars (euclidean)

variables = list(background_c_imputed.axes[1][1:])
uc_columns = [var for var in variables if meta_dict[var] in ['uc','bin','string']]
other_columns = [var for var in variables if meta_dict[var] in ['cont','oc']]
print(len(variables))
print(len(uc_columns))
print(len(other_columns))

#Note we get rid of the challengeID column here as a result of excluding it from variabes 
#list we gerenated previously.  Therefore we go from 476 variables above to 475 total

475
262
213


In [23]:
uc_df = background_c_imputed.filter(items=uc_columns)
print(uc_df.shape)
other_df = background_c_imputed.filter(items=other_columns)
print(other_df.shape)

(4242, 262)
(4242, 213)


In [27]:
print list(uc_df.axes[1][0:10])
print uc_columns[0:10]

['cf1fint', 'cf1citsm', 'cf1marm', 'cf1cohm', 'cf1gdad', 'cf1gmom', 'cf1ethrace', 'cf1hhimp', 'cf1finjail', 'cf1tele']
['cf1fint', 'cf1citsm', 'cf1marm', 'cf1cohm', 'cf1gdad', 'cf1gmom', 'cf1ethrace', 'cf1hhimp', 'cf1finjail', 'cf1tele']


In [24]:
#Look at distribution of spread and magnitude for each cont variable
spread = []
mean = []
minimum = []
maximum = []
sd = []
for var in other_df.axes[1]:
    spread.append(other_df.loc[:,var].max() - other_df.loc[:,var].min())
    mean.append(other_df.loc[:,var].mean())
    minimum.append(other_df.loc[:,var].min())
    maximum.append(other_df.loc[:,var].max())
    sd.append(other_df.loc[:,var].std())
    
#plt.plot([i for i in range(len(spread))],spread, 'ro')
#plt.plot([i for i in range(len(minimum))],minimum, 'bo')
#plt.semilogy([i for i in range(len(maximum))],maximum, 'go')
#plt.show()

In [25]:
#Max Min Scale cont/oc data to [0,1] range for better coeff interpretability
other_mat = other_df.as_matrix()
min_max_scaler = preprocessing.MinMaxScaler()
other_out = min_max_scaler.fit_transform(other_mat)

#Collect UC data, no re-scaling necessary
uc_mat = uc_df.as_matrix()

In [34]:
#Perform one-hot encoding on uc_mat to make uc data type compatible with machine learning classifiers
#will increase number of "features" by some small factor, but worth it for semantic reasons
#IMPORTANT: ASSUMES EACH FEATURE TAKES ON VALUES IN RANGE(0, n_values), SO WHILE MOST OF
#OUR CATEGORICAL VARIABLES DO ADHERE TO THIS SOME DONT START AT 0, RESULT IS A FEW MORE
#

print(uc_df.nunique().sum()) #should be resulting size of one hot matrix

#Also I need to somehow map indexes in expanded one-hot form back to original
#uc_columns index to associate variables with important "one-hot features"
indexToVar = []
for i, var in enumerate(uc_columns):
    for j in range(uc_df[var].nunique()):
        indexToVar.append(i)

print len(indexToVar)
        
enc = preprocessing.OneHotEncoder()
enc.fit(uc_mat)
uc_out = enc.transform(uc_mat)
print uc_out.shape


642
642
(4242, 642)


In [35]:
#Combine both types of data
new_matrix = []

for i in range(other_out.shape[0]):
    temp = []
    for j in range(uc_out.shape[1]):
        temp.append(uc_out[i,j])
    for j in range(other_out.shape[1]):
        temp.append(other_out[i,j])
    new_matrix.append(temp)
    
new_matrix = np.matrix(new_matrix)
print new_matrix.shape

(4242, 855)


In [36]:
#Separate Training and Testing Data

#Dropping rows with only NAs in training data
train_noNARows = train_csv.dropna(thresh=2)
#Grabbing the unique IDs of participants in the training set:
train_IDs = list(train_noNARows.iloc[:,0])

#I'm pretty sure that the range index for this data goes from 1 to 4242 as does 'challengeID'
#However, since we have converted everything to a matrix, we are now indexing
#From 0 to 4241 in the training data
train_IDs = [x - 1 for x in train_IDs] 
print len(train_IDs)

#Grab rows in processed data corresponding to unique IDs of valid training labels
train_data = new_matrix[train_IDs]

#Get all testing data
test_data = new_matrix
print test_data.shape


1466
(4242, 855)


In [37]:
print(list(train_noNARows.iloc[:,0])[0:10])

[6, 7, 9, 10, 13, 14, 16, 18, 20, 23]


In [38]:
train_noNARows.iloc[0:10,:]

Unnamed: 0,challengeID,gpa,grit,materialHardship,eviction,layoff,jobTraining
2,6,,3.5,0.090909,0.0,0.0,0.0
3,7,2.5,3.25,0.0,0.0,0.0,0.0
5,9,2.25,4.0,0.181818,0.0,0.0,0.0
6,10,3.25,3.25,0.090909,0.0,,0.0
7,13,2.75,4.0,0.181818,0.0,0.0,1.0
8,14,3.25,2.75,0.272727,0.0,1.0,0.0
9,16,2.0,3.5,0.090909,0.0,0.0,1.0
10,18,2.25,3.0,0.0,0.0,1.0,0.0
11,20,,3.0,0.0,0.0,0.0,0.0
12,23,2.5,3.25,0.0,0.0,0.0,0.0


In [39]:
#Compute average NA proportion of training labels

count = 0.0
for i in range(6):
    count += float(train_noNARows.shape[0] - train_noNARows.iloc[:,i+1].count()) / float(train_noNARows.shape[0])
count / float(6)

#Even though 6% is tolerable, we should not run label vector with NaN inside our classifiers.

0.06332423829013187

In [40]:
#We will impute training labels with basic mean (cont outcomes) and mode (binary outcomes)
#We might come back to this and do sophisticated KNN based imputation but 
#It is really not worth the work

col_labels=['challengeID','gpa','grit','materialHardship','eviction','layoff','jobTraining']
cont_labels = ['gpa','grit','materialHardship']
bin_labels = ['eviction','layoff','jobTraining']

values = {}

for label in col_labels:
    if label in cont_labels:
        values[label] = train_noNARows.loc[:,label].mean()
    elif label in bin_labels:
        values[label] = int(train_noNARows.loc[:,label].mode())
    else:
        values[label] = 0

print values

{'layoff': 0, 'grit': 3.4275387870239773, 'materialHardship': 0.10374478160633066, 'jobTraining': 0, 'gpa': 2.8667381974248927, 'challengeID': 0, 'eviction': 0}


In [41]:
train_noNARows = train_noNARows.fillna(value=values)
train_noNARows.iloc[0:10,:]

Unnamed: 0,challengeID,gpa,grit,materialHardship,eviction,layoff,jobTraining
2,6,2.866738,3.5,0.090909,0.0,0.0,0.0
3,7,2.5,3.25,0.0,0.0,0.0,0.0
5,9,2.25,4.0,0.181818,0.0,0.0,0.0
6,10,3.25,3.25,0.090909,0.0,0.0,0.0
7,13,2.75,4.0,0.181818,0.0,0.0,1.0
8,14,3.25,2.75,0.272727,0.0,1.0,0.0
9,16,2.0,3.5,0.090909,0.0,0.0,1.0
10,18,2.25,3.0,0.0,0.0,1.0,0.0
11,20,2.866738,3.0,0.0,0.0,0.0,0.0
12,23,2.5,3.25,0.0,0.0,0.0,0.0


In [42]:
#Extract and Separate labels accordingly

train_labels = {}
train_labels['gpa'] = train_noNARows['gpa'].as_matrix()
train_labels['grit'] = train_noNARows['grit'].as_matrix()
train_labels['materialHardship'] = train_noNARows['materialHardship'].as_matrix()
train_labels['eviction']= train_noNARows['eviction'].as_matrix()
train_labels['layoff'] = train_noNARows['layoff'].as_matrix()
train_labels['jobTraining'] = train_noNARows['jobTraining'].as_matrix()

#print(gpa_label[0:10])
#print(len(gpa_label))

In [44]:
#It is useful to see how skewed the binary labels are (most of the labels are 0.0)
#so we should subsample our training data accordingly for a more balanced model

print(np.mean(train_labels['eviction']))
print(np.mean(train_labels['layoff']))
print(np.mean(train_labels['jobTraining']))



0.059345156889495224
0.18212824010914053
0.2339699863574352


In [45]:
# #Let's see what testing on only one hot or only cont data is like
# #cont_data = other_out[train_IDs]
# #hot_data = uc_out[train_IDs]

# #Let's see what balancing out binary labels can do for logistic regression
# def balance_data(label,factor):
    
#     #extract positive vs negative label IDs
#     train_IDs_1 = train_noNARows.loc[train_noNARows[label] == 1.0,'challengeID']
#     train_IDs_1 = [x - 1 for x in train_IDs_1] 
#     train_IDs_0 = train_noNARows.loc[train_noNARows[label] == 0.0,'challengeID']
#     train_IDs_0 = [x - 1 for x in train_IDs_0] 

#     balanced_IDs = list(train_IDs_1)
#     balanced_IDs.extend(train_IDs_0[0:int(factor*len(train_IDs_1))])
#     balanced_IDs.sort()
#     balanced_data = new_matrix[balanced_IDs]
#     balanced_IDs = [x + 1 for x in balanced_IDs]
#     balanced_label = train_noNARows.loc[train_noNARows['challengeID'].isin(balanced_IDs),label].as_matrix()
#     return balanced_data, balanced_label

# #confirm we pulled right amount out
# #print(len(balanced_IDs)) #2.5*len(train_IDs_1)
# #print(balanced_job.shape)

In [46]:
# def logistic_regression(label, factor, regularizer, solver_):
#     balanced_data, balanced_label = balance_data(label,factor)
#     clf = linear_model.LogisticRegressionCV(cv=5, penalty=regularizer, solver=solver_)
#     clf.fit(balanced_data,balanced_label)
#     print("Logistic Regression with l2 regularization on {}".format(label))
#     print("Proportion of 1 (Positive) Labels: {}".format(np.mean(balanced_label)))
#     print("Brier Loss Score: {}".format(metrics.brier_score_loss(train_labels[label], prediction[:,1])))

# #Best label proportion for job prediction is 40% i.e. 1.5*train_IDs_1
# clf = logistic_regression('jobTraining','l2','lbfgs')

In [50]:
#Purpose of Dual and Intercept_Scaling Parameters
#dual is only used for l2 regularization with liblinear solver
#intercept_scaling useful only when liblinear solver is used

#IMPORTANT BY SELECTING class_weight='balanced' I am forcing LogRegCV to correct
#Weights via inverse scaling wrt class frequency - aka I am removing huge 0-label bias in the data
#This means my K-Fold CV Brier Loss will look weaker on my training set, but as long as the 
#leaderboard / held-out test samples are more balanced, I foresee stronger classification 

def feedToBrier(y_true, y_prob):
    return metrics.brier_score_loss(y_true, y_prob[:,1])

scorer_ = metrics.make_scorer(feedToBrier, greater_is_better=False,needs_proba=True)

def logistic_regression(label, regularizer, solver_):
    clf = linear_model.LogisticRegressionCV(Cs=10, fit_intercept=True, cv=5, dual=False, penalty=regularizer, 
                                            scoring=scorer_, solver=solver_, max_iter=100, tol=0.0001, class_weight='balanced', 
                                            n_jobs=1, verbose=0, refit=True, intercept_scaling=1.0, multi_class='ovr', 
                                            random_state=None)
    clf.fit(train_data,train_labels[label])
    return clf

# Purpose of Refit Parameter very Important
# If set to True, the scores are averaged across all folds, and 
# the coefs and the C that corresponds to the best score is taken, 
# and a final refit is done using these parameters. Otherwise the coefs, 
# intercepts and C that correspond to the best scores across folds are averaged.
    
    

In [51]:
def print_logistic(clf, label, regularizer):
    print("Logistic Regression with {} regularization on {} \n".format(regularizer, label))
    print "Regularization Constants"
    print clf.Cs_, "\n"
    print "Average Brier Loss per Regularization Constant Over All Folds"
    scores = []
    for i in range(len(clf.scores_[1.0][0])):
        scores.append(-1.0*max(clf.scores_[1.0][:,i]))
    print scores, "\n"
    print "Smallest Brier Loss Over All Folds and Choices of C"
    print np.min(scores), "\n"
    #print clf_job_l2.scores_
    print "Most Informative Features"
    informative = np.argsort(clf.coef_[0])
    print informative[-10:], "\n"
    print "Best Score Across Every Class"
    print clf.C_, "\n"

In [362]:
#Run Logistic Regression with Built In Cross Validation to Select the Best Regularization Weight / Coefficients with respect to Brier Loss Metric

clf_job_l2 = logistic_regression('jobTraining','l2','lbfgs')

In [372]:
#Cs array = inverse of regularization strength
#Smaller C tended to perform better, aka, stronger regularization has better KFold_CV performance
#This is not surprising
print_logistic(clf_job_l2,'jobTraining','l2')


Logistic Regression with l2 regularization on jobTraining 

Regularization Constants
[1.00000000e-04 7.74263683e-04 5.99484250e-03 4.64158883e-02
 3.59381366e-01 2.78255940e+00 2.15443469e+01 1.66810054e+02
 1.29154967e+03 1.00000000e+04] 

Average Brier Loss per Regularization Constant Over All Folds
[0.2489887958606962, 0.24483242191946783, 0.23574889776924607, 0.23322363013463204, 0.24501988053299772, 0.26864954799413165, 0.29976206244822967, 0.3152329881716158, 0.31545167166225463, 0.3163462949408531] 

Smallest Brier Loss Over All Folds and Choices of C
0.23322363013463204 

Most Informative Features
[849 416 576  81  49 852 343 657  13 844] 

Best Score Across Every Class
[0.00599484] 



In [357]:
#Run Logistic Regression with Built In Cross Validation to Select the Best Regularization Weight / Coefficients with respect to Brier Loss Metric

clf_job_l1 = logistic_regression('jobTraining','l1','saga')

In [373]:
#L1 performance is comparable with L2, it is a stronger suppressor on feature weights
#So it has more consistent performance independent of size of tuning parameter C
#I suspect, because of the stronger suppression, the most important features for L1 hold a lot of weight

print_logistic(clf_job_l1,'jobTraining','l1')

Logistic Regression with l1 regularization on jobTraining 

Regularization Constants
[1.00000000e-04 7.74263683e-04 5.99484250e-03 4.64158883e-02
 3.59381366e-01 2.78255940e+00 2.15443469e+01 1.66810054e+02
 1.29154967e+03 1.00000000e+04] 

Average Brier Loss per Regularization Constant Over All Folds
[0.24334496852377088, 0.2429775791843483, 0.2433313830559145, 0.24216966591364272, 0.23604126884997162, 0.260388988831333, 0.2706073259208015, 0.27911259535760036, 0.2847666200666093, 0.2884602484259553] 

Smallest Brier Loss Over All Folds and Choices of C
0.23604126884997162 

Most Informative Features
[445 176 841 160 343 852  13 814 844 797] 

Best Score Across Every Class
[0.35938137] 



In [365]:
#Run Logistic Regression with Built In Cross Validation to Select the Best Regularization Weight / Coefficients with respect to Brier Loss Metric

clf_evict_l2 = logistic_regression('eviction','l2','lbfgs')

In [374]:
print_logistic(clf_evict_l2,'eviction','l2')

Logistic Regression with l2 regularization on eviction 

Regularization Constants
[1.00000000e-04 7.74263683e-04 5.99484250e-03 4.64158883e-02
 3.59381366e-01 2.78255940e+00 2.15443469e+01 1.66810054e+02
 1.29154967e+03 1.00000000e+04] 

Average Brier Loss per Regularization Constant Over All Folds
[0.2430910979029625, 0.22398284347848366, 0.18025506807649477, 0.13906519730674904, 0.12093011656108776, 0.11140713988924242, 0.10988015855989364, 0.11211314835231886, 0.11439616189325086, 0.11660255341528108] 

Smallest Brier Loss Over All Folds and Choices of C
0.10988015855989364 

Most Informative Features
[722 407 637 764 674 804 805 329 461 476] 

Best Score Across Every Class
[2.7825594] 



In [367]:
#Run Logistic Regression with Built In Cross Validation to Select the Best Regularization Weight / Coefficients with respect to Brier Loss Metric

clf_evict_l1 = logistic_regression('layoff','l1','saga')


In [375]:
print_logistic(clf_evict_l1,'evict','l1')

Logistic Regression with l1 regularization on evict 

Regularization Constants
[1.00000000e-04 7.74263683e-04 5.99484250e-03 4.64158883e-02
 3.59381366e-01 2.78255940e+00 2.15443469e+01 1.66810054e+02
 1.29154967e+03 1.00000000e+04] 

Average Brier Loss per Regularization Constant Over All Folds
[0.23620668647166956, 0.2404228245097979, 0.2448083859462886, 0.24504396435429357, 0.23553579207940767, 0.2440468425568228, 0.2523364804601218, 0.2576090641235622, 0.26101686301991345, 0.26379461580524155] 

Smallest Brier Loss Over All Folds and Choices of C
0.23553579207940767 

Most Informative Features
[569 382 743  13 264  65 845 176 637 686] 

Best Score Across Every Class
[0.35938137] 



In [376]:
#Run Logistic Regression with Built In Cross Validation to Select the Best Regularization Weight / Coefficients with respect to Brier Loss Metric

clf_layoff_l2 = logistic_regression('layoff','l2','lbfgs')


In [377]:
print_logistic(clf_layoff_l2,'layoff','l2')

Logistic Regression with l2 regularization on layoff 

Regularization Constants
[1.00000000e-04 7.74263683e-04 5.99484250e-03 4.64158883e-02
 3.59381366e-01 2.78255940e+00 2.15443469e+01 1.66810054e+02
 1.29154967e+03 1.00000000e+04] 

Average Brier Loss per Regularization Constant Over All Folds
[0.24810942608069655, 0.24423322861418692, 0.2398930384295608, 0.23079195590981844, 0.23187258622314977, 0.2499663359085006, 0.27308770558748763, 0.29037177155851873, 0.285795757015524, 0.2919010226203595] 

Smallest Brier Loss Over All Folds and Choices of C
0.23079195590981844 

Most Informative Features
[359 563 164 277 264 569 591  11  49  13] 

Best Score Across Every Class
[0.00599484] 



In [378]:
#Run Logistic Regression with Built In Cross Validation to Select the Best Regularization Weight / Coefficients with respect to Brier Loss Metric

clf_layoff_l1 = logistic_regression('layoff','l1','saga')


In [379]:
print_logistic(clf_layoff_l1,'layoff','l1')

Logistic Regression with l1 regularization on layoff 

Regularization Constants
[1.00000000e-04 7.74263683e-04 5.99484250e-03 4.64158883e-02
 3.59381366e-01 2.78255940e+00 2.15443469e+01 1.66810054e+02
 1.29154967e+03 1.00000000e+04] 

Average Brier Loss per Regularization Constant Over All Folds
[0.24617614138501256, 0.23902809174893483, 0.24131248843359945, 0.24504392998291738, 0.2354979974570491, 0.24417006931989832, 0.25217996806793974, 0.2577167313870195, 0.2610373220343355, 0.26377841318692874] 

Smallest Brier Loss Over All Folds and Choices of C
0.2354979974570491 

Most Informative Features
[283 284 285 286 287 288 289 290 292 854] 

Best Score Across Every Class
[0.00599484] 



In [None]:
#clf = linear_model.LinearRegression()
# clf = linear_model.ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],cv=5,max_iter=1000)
# clf.fit(cont_data,gpa_label)
# prediction = clf.predict(cont_data)
# print(prediction[0:10])
# print("Mean Squared Error:{}".format(metrics.mean_squared_error(gpa_label, prediction)))
# print("Chosen Alpha:{}".format(cld.alpha_))
# print("Chosen l1 Ratio:{}".format(clf.l1_ratio_))

In [133]:
#NOTE THAT WE SET FIT_INTERCEPT TO TRUE EVEN THOUGH WE ARE USING ONE HOT ENCODING WITH COLINEARITY 
#BECUASE REGULARIZATION DEALS WITH THIS PROLBLEM

#ANOTHER IMPORTANT PARAMETER IS ALPHA - THIS IS THE AMOUNT OF PENALIZATION CHOSEN BY CV
#L!_RATIO IS THE COMPROMIZE BETWEEN L1 and L2 PENALIZATION CHOSEN BY CV
#SELECTION='random' ALLOWS RANDOM COEFF TO BE UPDATED AT EACH STEP, YIELDS SIGNIFICANT CONVERGENCE SPEED UP

scorer_ = metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)

def linear_regression(label, regularizer, selection_):
    clf = linear_model.ElasticNetCV(l1_ratio=regularizer, eps=0.001, n_alphas=100, alphas=None, 
                                    fit_intercept=True, normalize=False, precompute='auto', 
                                    max_iter=1000, tol=0.0001, cv=5, copy_X=True, verbose=0, 
                                    n_jobs=1, positive=False, random_state=None, selection=selection_)
    clf.fit(train_data,train_labels[label])
    return clf

def linear_regression_post(label, regularizer, selection_, alphas_):
    clf = linear_model.ElasticNetCV(l1_ratio=regularizer, eps=0.001, alphas=alphas_, 
                                    fit_intercept=True, normalize=False, precompute='auto', 
                                    max_iter=1000, tol=0.0001, cv=5, copy_X=True, verbose=0, 
                                    n_jobs=1, positive=False, random_state=None, selection=selection_)
    clf.fit(train_data,train_labels[label])
    return clf

In [138]:
def print_linear(clf, label, selection):
    print("Linear Regression with Elastic Regularization and '{}' Selection on {}\n".format(selection, label))
    print "Optimal L1_Ratio Chosen (higher means more L1)"
    print clf.l1_ratio_, "\n"
    print "Optimal Alpha Chosen"
    print clf.alpha_, "\n"
    ##print "Grid of Alphas For Each L1 Ratio"
    ##print clf.alphas_, "\n"
    #print "Smallest MSE With Respect to each L1 Ratio"
    #print [np.min(x) for x in clf.mse_path_], "\n"
    #print "Smallest MSE With Respect to Each Fold"
    #print [np.min(clf.mse_path_[:,:,i]) for i in range(0,5)], "\n"
    print "Smallest MSE Overall"
    print np.min(clf.mse_path_), "\n"
    print "Largest MSE Overall"
    print np.max(clf.mse_path_), "\n"
    #print clf_job_l2.scores_
    print "Most Informative Features"
    informative = np.argsort(clf.coef_)
    print informative[-10:], "\n"

In [142]:
#clf_gpa_linear = linear_regression('gpa',[.1, .5, .7, .9, .95, .99, 1],'random')
clf_gpa_linear = linear_regression_post('gpa',1.0,'random',[0.0077890781657518095])

In [143]:
print(clf_gpa_linear.mse_path_.shape)
print(clf_gpa_linear.coef_.shape)

(5,)
(855,)


In [144]:
print_linear(clf_gpa_linear,'gpa','random')

Linear Regression with Elastic Regularization and 'random' Selection on gpa

Optimal L1_Ratio Chosen (higher means more L1)
1.0 

Optimal Alpha Chosen
0.0077890781657518095 

Smallest MSE Overall
0.26918283592103476 

Largest MSE Overall
0.38423223422546465 

Most Informative Features
[851 430 534  34 154  16 682 657 647 816] 



In [145]:
#clf_grit_linear = linear_regression('grit',[.1, .5, .7, .9, .95, .99, 1],'random')
clf_grit_linear = linear_regression_post('grit',0.1,'random',[0.07281736504717284])

In [146]:
print_linear(clf_grit_linear,'grit','random')

Linear Regression with Elastic Regularization and 'random' Selection on grit

Optimal L1_Ratio Chosen (higher means more L1)
0.1 

Optimal Alpha Chosen
0.07281736504717284 

Smallest MSE Overall
0.21311509504225082 

Largest MSE Overall
0.22897403014620205 

Most Informative Features
[410 233 383  55 230 154 254  49  33 593] 



In [147]:
#clf_hard_linear = linear_regression('materialHardship',[.1, .5, .7, .9, .95, .99, 1],'random')
clf_hard_linear = linear_regression_post('materialHardship',0.1,'random',[0.01951356598623485])

In [148]:
print_linear(clf_hard_linear,'materialHardship','random')

Linear Regression with Elastic Regularization and 'random' Selection on materialHardship

Optimal L1_Ratio Chosen (higher means more L1)
0.1 

Optimal Alpha Chosen
0.01951356598623485 

Smallest MSE Overall
0.019327990102053348 

Largest MSE Overall
0.025832091762808106 

Most Informative Features
[ 81 255  45 416 673 844 384 594  13 406] 



In [None]:
# prediction_csv = pd.read_csv(pred_path, low_memory=False)
# prediction_proba = clf.predict_proba(test_data)
# prediction = clf.predict(test_data)

# #data= pd.DataFrame({'jobTraining': prediction_proba[:,1]})
# prediction_csv.drop(['jobTraining'], axis=1, inplace=True)
# prediction_csv['jobTraining'] = pd.Series(prediction_proba[:,1], index=prediction_csv.index)

# prediction_csv.to_csv("output_prediction.csv", index=False)


In [465]:
job_prediction = clf_job_l2.predict_proba(test_data)
evict_prediction = clf_evict_l2.predict_proba(test_data)
layoff_prediction = clf_layoff_l2.predict_proba(test_data)
gpa_prediction = clf_gpa_linear.predict(test_data)
grit_prediction = clf_grit_linear.predict(test_data)
hard_prediction = clf_hard_linear.predict(test_data)

In [420]:
d = {'challengeID' : [i for i in range(1,4243)], 'gpa' : gpa_prediction, 'grit': grit_prediction, 
     'materialHardship': hard_prediction, 'eviction': evict_prediction[:,1], 'layoff': layoff_prediction[:,1], 
     'jobTraining': job_prediction[:,1]}
output_prediction = pd.DataFrame(data=d)



In [421]:
output_prediction.to_csv("output_prediction.csv", index=False)

In [65]:
#scorer_ = metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)

#scorer_ = metrics.make_scorer(feedToBrier, greater_is_better=False,needs_proba=True)

scorer_ = metrics.make_scorer(metrics.accuracy_score)

def get_SVC(C_,kernel_): #C=1.0, kernel='rbf'
    return svm.SVC(C=C_, kernel=kernel_, degree=3, gamma='auto', coef0=0.0, 
                    shrinking=True, probability=False, tol=0.001, cache_size=500, 
                    class_weight='balanced', verbose=False, max_iter=-1, 
                    decision_function_shape='ovr', random_state=None)

def gridsearchSVC(label):
    params = {"kernel":[ "linear", "poly", "rbf", "sigmoid"], 
           "C":[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    grid = GridSearchCV(get_SVC(1.0,'rbf'), params, cv=5, scoring=scorer_, refit=True, iid=True)
    grid.fit(train_data, train_labels[label])
    return grid
    
def print_Grid_SVC(grid, label):
    print "GridSearchCV with SVC for {}".format(label)
    #print "CV Results" 
    #print grid.cv_results_, "\n"
    print "Best Score"
    print grid.best_score_, "\n"
    print "Best Params"
    print grid.best_params_, "\n"
    print "Best Estimator Index"
    print grid.best_index_, "\n"
    
def print_SVC(clf,label):
    print "SVC for {} binary classification".format(label)
    print "Number of support vectors for zero label: {}".format(clf.n_support_[0])
    print "Number of support vectors for one label: {}".format(clf.n_support_[1])
    print "Coefficients of support vectors in decision function"
    print clf.dual_coef_


In [83]:
#clf_job_svc = gridsearchSVC('jobTraining')    
clf_job_svc = get_SVC(300,"poly") 
clf_job_svc.fit(train_data,train_labels['jobTraining'])
print_SVC(clf_job_svc, 'jobTraining')

SVC for jobTraining binary classification
Number of support vectors for zero label: 834
Number of support vectors for one label: 327
Coefficients of support vectors in decision function
[[-195.81478183  -76.77761956 -195.81478183 ...  335.24187236
   222.50934884  513.03181056]]


In [84]:
prediction = clf_job_svc.predict(train_data)
metrics.accuracy_score(train_labels['jobTraining'], prediction)

0.9583901773533424

In [85]:
#For SVM, class probabilities computed using Platt are not theoretically sound
#We shall extrapolate our own probabilities from the output of decision function (distance to hyperplane)

def get_SVM_Proba(data,clf):
    distances = clf.decision_function(data)
    minimum = np.min(distances)
    maximum = np.max(distances)
    svc_proba = [-1.0*x/minimum if x < 0 else x/maximum for x in distances]
    svc_proba = [0.5 + x/2.0 for x in svc_proba]
    return distances, svc_proba

In [86]:
distances, job_svc_proba = get_SVM_Proba(train_data,clf_job_svc)
print np.max(job_svc_proba)
print np.min(job_svc_proba)

1.0
0.0


In [87]:
print train_labels['jobTraining'][0:6]
print prediction[0:6]
print distances[0:6]
print job_svc_proba[0:6]

[0. 0. 0. 0. 1. 0.]
[1. 0. 0. 0. 1. 0.]
[ 0.38566829 -0.99972379 -0.26075574 -1.00046017  1.0002794  -0.19735857]
[0.5894167170613608, 0.33982926549280323, 0.45822302190286124, 0.3397112862793016, 0.7319135441211728, 0.46838019855187374]


In [88]:
print("Brier Loss Score: {}".format(metrics.brier_score_loss(train_labels['jobTraining'], job_svc_proba)))

Brier Loss Score: 0.117331850705


In [89]:
# The above value of C makes our SVM fit our model extremely closely.  
#I don't think that is good considering we don't consider the distribution of test data or labels
#could be somewhat different.  Also I seem not to be able to control whether L1/L2 loss is used, 
#but these are important in that they determine whether we should scale our C with the size of our data.
#I'm just going to use value of C = 10 and kernel = poly to compute the remaining 

In [90]:
#Let's keep C parameter high (not worried about generalization because I'm pretty sure distribution of)
#held-out binary labels also dominated by 0
clf_evict_svc = get_SVC(300,"poly") 
clf_evict_svc.fit(train_data,train_labels['eviction'])

clf_layoff_svc = get_SVC(300,"poly")
clf_layoff_svc.fit(train_data,train_labels['layoff'])

SVC(C=300, cache_size=500, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [91]:
distances, evict_prediction = get_SVM_Proba(test_data, clf_evict_svc)
distances, layoff_prediction = get_SVM_Proba(test_data, clf_layoff_svc)
distances, job_prediction = get_SVM_Proba(test_data, clf_job_svc)

In [92]:
distances, evict_prob = get_SVM_Proba(train_data, clf_evict_svc)
labels = train_labels['eviction']
prediction = clf_evict_svc.predict(train_data)
print labels[0:6]
print prediction[0:6]
print distances[0:6]
print evict_prob[0:6]

[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[-1.29108671 -1.30278921 -1.0886882  -1.69482886 -1.99947194 -0.3801412 ]
[0.3132862261085039, 0.3115938393184857, 0.3425565993869535, 0.2548980314478593, 0.21084130126676676, 0.44502491802773947]


In [None]:
#Let's do SVR for continuous output labels now

In [96]:
scorer_ = metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)

#scorer_ = metrics.make_scorer(feedToBrier, greater_is_better=False,needs_proba=True)

#scorer_ = metrics.make_scorer(metrics.accuracy_score)

def get_SVR(C_,kernel_): #C=1.0, kernel='rbf'
    return svm.SVR(kernel=kernel_, degree=3, gamma='auto', coef0=0.0, tol=0.001, C=C_, 
                   epsilon=0.1, shrinking=True, cache_size=500, verbose=False, max_iter=-1)

def gridsearchSVR(label):
    params = {"kernel":[ "linear", "poly", "rbf", "sigmoid"], 
           "C":[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    grid = GridSearchCV(get_SVR(1.0,'rbf'), params, cv=5, scoring=scorer_, refit=True, iid=True)
    grid.fit(train_data, train_labels[label])
    return grid
    
def print_Grid_SVR(grid, label):
    print "Support Vector Machine Regression for {}".format(label)
    #print "CV Results" 
    #print grid.cv_results_, "\n"
    print "Best Score"
    print grid.best_score_, "\n"
    print "Best Params"
    print grid.best_params_, "\n"
    print "Best Estimator Index"
    print grid.best_index_, "\n"

def print_SVR(clf,label):
    print "SVR for regression on {}".format(label)
    print "Number of support vectors"
    print clf.n_support_, "\n"
    print "Coefficients of support vectors in decision function"
    print clf.dual_coef_, "\n"

In [106]:
#clf_gpa_svr = gridsearchSVR('gpa') 
clf_gpa_svr = get_SVR(300,'poly') 
clf_gpa_svr.fit(train_data,train_labels['gpa'])
print_SVR(clf_gpa_svr, 'gpa')

SVR for regression on gpa
Number of support vectors
[739489952         1] 

Coefficients of support vectors in decision function
[[ -17.58147095 -163.33821114 -300.         ...  136.85381364
   -62.09595489   96.59925161]] 



In [107]:
#sanity check, we should have very low MSE on "train data"
prediction = clf_gpa_svr.predict(train_data)
metrics.mean_squared_error(train_labels['gpa'], prediction)

0.03370794418779471

In [108]:
#Collect SVR's for the two other continuous label types
clf_grit_svr = get_SVR(300,"poly") 
clf_grit_svr.fit(train_data,train_labels['grit'])

clf_hard_svr = get_SVR(300,"poly")
clf_hard_svr.fit(train_data,train_labels['materialHardship'])

SVR(C=300, cache_size=500, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [109]:
gpa_prediction = clf_gpa_svr.predict(test_data)
grit_prediction = clf_grit_svr.predict(test_data)
hard_prediction = clf_hard_svr.predict(test_data)

In [149]:
#Take the best results for Continous Labels (Linear Regression)
#Combine with best results for Binary Labels (SVM - SVC)
#Write to output_prediction.csv

distances, evict_prediction = get_SVM_Proba(test_data, clf_evict_svc)
distances, layoff_prediction = get_SVM_Proba(test_data, clf_layoff_svc)
distances, job_prediction = get_SVM_Proba(test_data, clf_job_svc)
gpa_prediction = clf_gpa_linear.predict(test_data)
grit_prediction = clf_grit_linear.predict(test_data)
hard_prediction = clf_hard_linear.predict(test_data)

In [150]:
a = pd.DataFrame({'challengeID' : [i for i in range(1,4243)]}) 
b = pd.DataFrame({'gpa' : gpa_prediction}) 
c = pd.DataFrame({'grit': grit_prediction}) 
d = pd.DataFrame({'materialHardship': hard_prediction}) 
e = pd.DataFrame({'eviction': evict_prediction}) 
f = pd.DataFrame({'layoff': layoff_prediction}) 
g = pd.DataFrame({'jobTraining': job_prediction})

a = pd.concat([a, b], axis=1)
a = pd.concat([a, c], axis=1)
a = pd.concat([a, d], axis=1)
a = pd.concat([a, e], axis=1)
a = pd.concat([a, f], axis=1)
a = pd.concat([a, g], axis=1)

a.to_csv("output_prediction.csv", index=False)

In [111]:
#Most Important Features From Regression Techniques

log_l2_job = [849, 416, 576, 81, 49, 852, 343, 657, 13, 844]
log_l1_job = [445, 176, 841, 160, 343, 852, 13, 814, 844, 797] 

log_l2_evict = [722, 407, 637, 764, 674, 804, 805, 329, 461, 476]
log_l1_evict = [569, 382, 743, 13, 264, 65, 845, 176, 637, 686] 

log_l2_layoff = [359, 563, 164, 277, 264, 569, 591, 11, 49, 13]
log_l1_layoff = [283, 284, 285, 286, 287, 288, 289, 290, 292, 854]

lin_l1_gpa = [556, 851, 430, 534, 154, 16, 682, 657, 647, 816]
lin_l2_grit = [410, 233, 383, 55, 230, 154, 254, 49, 33, 593]
lin_l2_hard = [81, 255, 45, 416, 673, 844, 384, 594, 13, 406]


In [114]:
#Convert indexes to actual var name

all_columns = list(uc_columns)

def extract_vars(indexes):
    var_names = []
    for i in indexes:
        if i <= 641:
            var_names.append(uc_columns[indexToVar[i]])
        else:
            var_names.append(other_columns[i-642])
    return var_names
        
log_l2_job_vars = extract_vars(log_l2_job)
log_l1_job_vars = extract_vars(log_l1_job)
log_l2_evict_vars = extract_vars(log_l2_evict)
log_l1_evict_vars = extract_vars(log_l1_evict)
log_l2_layoff_vars = extract_vars(log_l2_layoff)
log_l1_layoff_vars = extract_vars(log_l1_layoff)
lin_l1_gpa_vars = extract_vars(lin_l1_gpa)
lin_l2_grit_vars = extract_vars(lin_l2_grit)
lin_l2_hard_vars = extract_vars(lin_l2_hard)


In [119]:
def sort_by_frequency(vars_list):
    temp_dict = {}
    for var_list in vars_list:
        for var in var_list:
            if var in temp_dict:
                temp_dict[var] = temp_dict[var] + 1
            else:
                temp_dict[var] = 1
                
    temp_list = []

    for var in temp_dict:
        temp_list.append((var,temp_dict[var]))

    temp_list = sorted(temp_list, key=lambda x: x[1])
    return temp_list

log_job_vars = sort_by_frequency([log_l2_job_vars, log_l1_job_vars])
log_evict_vars = sort_by_frequency([log_l2_evict_vars, log_l1_evict_vars])
log_layoff_vars = sort_by_frequency([log_l2_layoff_vars, log_l1_layoff_vars])
lin_gpa_vars = lin_l1_gpa_vars
lin_grit_vars = lin_l2_grit_vars
lin_hard_vars = lin_l2_hard_vars

In [121]:
print "Most Important Features for Job Training"
print log_job_vars, "\n"
print "Most Important Features for Eviction"
print log_evict_vars, "\n"
print "Most Important Features for Layoff"
print log_layoff_vars, "\n"
print "Most Important Features for GPA"
print lin_gpa_vars, "\n"
print "Most Important Features for Grit"
print lin_grit_vars, "\n"
print "Most Important Features for Material Hardship"
print lin_hard_vars

Most Important Features for Job Training
[('ct4kyear', 1), ('cm2hhimp', 1), ('cm1edu', 1), ('cm1ethrace', 1), ('cm5b_ageyrs', 1), ('ch4haz', 1), ('cf2cohm', 1), ('cm5povco', 1), ('cm5marp', 1), ('ch4htwt', 1), ('ch5dsss', 1), ('cf3samp', 1), ('cf4cohp', 2), ('cf1ethrace', 2), ('cm5edu', 2), ('cm5povca', 2)] 

Most Important Features for Eviction
[('ch3waz', 1), ('cm4hhimp', 1), ('cm4fdiff', 1), ('cm3span', 1), ('cm5relf', 1), ('ch3att_v', 1), ('ch4mwtkg', 1), ('ch4selfht', 1), ('cm2amrf', 1), ('cf1ethrace', 1), ('cf4samp', 1), ('cm5bmomstat', 1), ('cm4kids', 1), ('cf5edu', 1), ('cf2mint', 1), ('cf3kids', 1), ('cf3samp', 1), ('ch4mwtlb', 1), ('cf5samp', 2)] 

Most Important Features for Layoff
[('cm3span', 1), ('cm5md_case_con', 1), ('cm1ethrace', 1), ('cm2span', 1), ('cf1gmom', 1), ('ch3ovscale', 1), ('cm5span', 1), ('cm5relf', 1), ('cf1ethrace', 1), ('ch3mompreg', 1), ('cf5povcab', 1), ('cf4md_case_lib', 1), ('ch3mesyr', 1), ('ch3cflag', 2), ('ch3cwtalone', 2), ('ch3mflag', 3)] 

Most

In [126]:
meta_csv.loc[meta_csv.loc[:,'new_name'] == 'cm5md_case_lib','topic1'].iloc[0]

'mental health'

In [130]:
def get_topics(var_list, _tuple):
    temp_dict = {}
    if _tuple:
        for var in var_list:
            temp_dict[meta_csv.loc[meta_csv.loc[:,'new_name'] == var[0],'topic1'].iloc[0]] = True 
    else:
        for var in var_list:
            temp_dict[meta_csv.loc[meta_csv.loc[:,'new_name'] == var,'topic1'].iloc[0]] = True 

    return temp_dict

log_job_topics = get_topics(log_job_vars, True)
log_evict_topics = get_topics(log_evict_vars, True)
log_layoff_topics = get_topics(log_layoff_vars, True)
lin_gpa_topics = get_topics(lin_gpa_vars, False)
lin_grit_topics = get_topics(lin_grit_vars, False)
lin_hard_topics = get_topics(lin_hard_vars, False)

In [131]:
print "Most Important Feature Topics for Job Training"
print log_job_topics.keys(), "\n"
print "Most Important Feature Topics for Eviction"
print log_evict_topics.keys(), "\n"
print "Most Important Feature Topics for Layoff"
print log_layoff_topics.keys(), "\n"
print "Most Important Feature Topics for GPA"
print lin_gpa_topics.keys(), "\n"
print "Most Important Feature Topics for Grit"
print lin_grit_topics.keys(), "\n"
print "Most Important Feature Topics for Material Hardship"
print lin_hard_topics.keys()

Most Important Feature Topics for Job Training
['cognitive skills', 'height and weight', 'educational attainment/achievement', 'parental relationship status', 'age', 'race/ethnicity', 'new partner relationship status', 'household income/poverty', 'paradata'] 

Most Important Feature Topics for Eviction
['height and weight', 'educational attainment/achievement', 'parental relationship status', 'age', 'household composition', 'race/ethnicity', 'behavior', 'household income/poverty', 'paradata'] 

Most Important Feature Topics for Layoff
['mental health', 'height and weight', 'parental relationship status', 'paradata', 'household composition', 'race/ethnicity', 'fertility history', 'household income/poverty'] 

Most Important Feature Topics for GPA
['cognitive skills', 'height and weight', 'educational attainment/achievement', 'paradata', 'mental health', 'household income/poverty'] 

Most Important Feature Topics for Grit
['mental health', 'sex/gender', 'parental relationship status', 'p