# Load & Imputation

In [8]:
"""
Load dataset
"""
import pandas as pd
import numpy as np
path_to_file = "./credit_r/train_na.csv"
data = pd.read_csv(path_to_file)
data = data.drop('Unnamed: 0',axis=1)
data.head()

Unnamed: 0,SeriousDlqin2yrs,age,MonthlyIncome,NumberOfDependents,RevolvingUtilizationOfUnsecuredLines,DebtRatio,NumberOfOpenCreditLinesAndLoans,NumberRealEstateLoansOrLines,NumberOfTime30.59DaysPastDueNotWorse,NumberOfTime60.89DaysPastDueNotWorse,NumberOfTimes90DaysLate
0,1,45.0,9120.0,2.0,0.766127,0.802982,13,6,2.0,0.0,0.0
1,0,40.0,2600.0,1.0,0.957151,0.121876,4,0,0.0,0.0,0.0
2,0,38.0,3042.0,0.0,0.65818,0.085113,2,0,1.0,0.0,1.0
3,0,30.0,3300.0,0.0,0.23381,0.03605,5,0,0.0,0.0,0.0
4,0,49.0,63588.0,0.0,0.907239,0.024926,7,1,1.0,0.0,0.0


In [9]:
#Count of NaN
(data.isnull().sum() / data.shape[0] *100).round(4)

SeriousDlqin2yrs                         0.0000
age                                      0.0007
MonthlyIncome                           19.8207
NumberOfDependents                       2.6160
RevolvingUtilizationOfUnsecuredLines     0.0000
DebtRatio                                0.0000
NumberOfOpenCreditLinesAndLoans          0.0000
NumberRealEstateLoansOrLines             0.0000
NumberOfTime30.59DaysPastDueNotWorse     0.1793
NumberOfTime60.89DaysPastDueNotWorse     0.1793
NumberOfTimes90DaysLate                  0.1793
dtype: float64

In [10]:
#IMPUTATION
#from sklearn.preprocessing import Imputer
data = data.fillna(data.mean())

#Count of NaN
data.isnull().sum()

SeriousDlqin2yrs                        0
age                                     0
MonthlyIncome                           0
NumberOfDependents                      0
RevolvingUtilizationOfUnsecuredLines    0
DebtRatio                               0
NumberOfOpenCreditLinesAndLoans         0
NumberRealEstateLoansOrLines            0
NumberOfTime30.59DaysPastDueNotWorse    0
NumberOfTime60.89DaysPastDueNotWorse    0
NumberOfTimes90DaysLate                 0
dtype: int64

In [11]:
#Missingness in monthly income 
1-data.groupby('SeriousDlqin2yrs')['MonthlyIncome'].count() / data.groupby('SeriousDlqin2yrs')['SeriousDlqin2yrs'].count()

SeriousDlqin2yrs
0    0.0
1    0.0
dtype: float64

# Split Prep

In [12]:
import sklearn.cross_validation as cv
X = data.iloc[:,1:]
y=data.iloc[:,0]
x_train, x_test, y_train, y_test = cv.train_test_split(X, 
                                                       y, 
                                                       train_size=0.7, 
                                                       random_state=0)

In [13]:
import sklearn.metrics as met

def get_error(x_train, y_train, x_test, y_test, model, show = True):
    model.fit(x_train, y_train)
    train_error = 1 - model.score(x_train, y_train)
    test_error = 1 - model.score(x_test, y_test)
    #train_auc = met.roc_auc_score(x_train, y_train)
    #test_auc = met.roc_auc_score(x_test, y_test)
    if show:
        print "The training error is: %.5f" %train_error 
        print "The test     error is: %.5f" %test_error
        #print "The training auc is: %.5f" %train_auc
        #print "The test     auc is: %.5f" %test_auc
    return [train_error, test_error]

# Logistic Regression

In [63]:
#Logistic regression
logit = lm.LogisticRegression(C=1e5)
get_error(x_train, y_train, x_test, y_test, logit)

The training error is: 0.06358
The test     error is: 0.06436


[0.063580952380952405, 0.06435555555555561]

In [57]:
#Logistic regression
logit = lm.LogisticRegressionCV(Cs=np.logspace(-5, 5, 100), cv=3)
get_error(x_train, y_train, x_test, y_test, logit)

The training error is: 0.06368
The test     error is: 0.06464


[0.063676190476190442, 0.064644444444444393]

# Logistic Regression in Spark

In [21]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
import pandas as pd

In [47]:
#Logistic regression in Spark, with CV

#Create Spark DF
input_index = map(str, range(10))
colnames = ['label'] + input_index
df = sqlContext.createDataFrame(data, colnames)
           
#Change label to 'factor'
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(df)
df = si_model.transform(df)

#Prep train and test Spark DF
(train_data, test_data) = df.randomSplit([0.7, 0.3])
assembler = VectorAssembler(inputCols=input_index, outputCol='features')
train_data = assembler.transform(train_data)
test_data = assembler.transform(test_data)

#Cross-Validation
lr = LogisticRegression(labelCol="indexed")
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1, 2]).addGrid(lr.regParam, [0.01, 0.1, 1.0]).build()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
cvModel = cv.fit(train_data)

#AUC
evaluator = BinaryClassificationEvaluator(labelCol="indexed", metricName="areaUnderROC")
print "Train AUC:", evaluator.evaluate(cvModel.transform(train_data))
print "Train AUC:", evaluator.evaluate(cvModel.transform(test_data))

Train AUC: 0.815289888306
Train AUC: 0.814704025907
