In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import seaborn as sns
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


In [3]:
Test = pd.read_csv('Test.csv')# load in data set
TrainingInputs = pd.read_csv('TrainingInputs.csv')
TrainingLabels = pd.read_csv('TrainingLabels.csv')
TrainingCombined = pd.concat([TrainingInputs, TrainingLabels], axis =1)

In [4]:
print(TrainingInputs.describe()) #summary statistics 

              row_id      loan_type  property_type   loan_purpose  \
count  500000.000000  500000.000000  500000.000000  500000.000000   
mean   249999.500000       1.366276       1.047650       2.066810   
std    144337.711634       0.690555       0.231404       0.948371   
min         0.000000       1.000000       1.000000       1.000000   
25%    124999.750000       1.000000       1.000000       1.000000   
50%    249999.500000       1.000000       1.000000       2.000000   
75%    374999.250000       2.000000       1.000000       3.000000   
max    499999.000000       4.000000       3.000000       3.000000   

           occupancy    loan_amount    preapproval         msa_md  \
count  500000.000000  500000.000000  500000.000000  500000.000000   
mean        1.109590     221.753158       2.764722     181.606972   
std         0.326092     590.641648       0.543061     138.464169   
min         1.000000       1.000000       1.000000      -1.000000   
25%         1.000000      93.0000

In [5]:
le = preprocessing.LabelEncoder()
le.fit(Test['co_applicant'])
Test['co_applicant'] = le.transform(Test['co_applicant'])
le.fit(TrainingInputs['co_applicant'])
TrainingInputs['co_applicant'] = le.transform(TrainingInputs['co_applicant'])
le.fit(TrainingCombined['co_applicant'])
TrainingCombined['co_applicant'] = le.transform(TrainingCombined['co_applicant'])

In [6]:
TrainingInputs['msa_md'] = TrainingInputs['msa_md'].replace(-1, np.NaN)
TrainingCombined['msa_md'] = TrainingInputs['msa_md'].replace(-1, np.NaN)
Test['msa_md'] = Test['msa_md'].replace(-1, np.NaN)

In [7]:
TrainingInputs['county_code'] = TrainingInputs['county_code'].replace(-1, np.NaN)
TrainingCombined['county_code'] = TrainingInputs['county_code'].replace(-1, np.NaN)
Test['county_code'] = Test['county_code'].replace(-1, np.NaN)

In [8]:
TrainingInputs['state_code'] = TrainingInputs['state_code'].replace(-1, np.NaN)
TrainingCombined['state_code'] = TrainingInputs['state_code'].replace(-1, np.NaN)
Test['state_code'] = Test['state_code'].replace(-1, np.NaN)

In [9]:
TrainingInputs['applicant_income'] = TrainingInputs['applicant_income'].fillna(TrainingInputs['applicant_income'].median())#fill missing values of app income with median rather than mean

In [10]:
TrainingInputs['log_applicant_income'] = np.log(TrainingInputs['applicant_income']) #log transformation of loan amount and applicant income
TrainingCombined['log_applicant_income'] = np.log(TrainingInputs['applicant_income'])
Test['log_applicant_income'] = np.log(Test['applicant_income'])
TrainingInputs['log_loan_amount'] = np.log(TrainingInputs['loan_amount'])
TrainingCombined['log_loan_amount'] = np.log(TrainingCombined['loan_amount'])
Test['log_loan_amount'] = np.log(Test['loan_amount'])

In [12]:
def plot_box(TrainingCombined, cols, col_x = 'accepted'):
    for col in cols:
        sns.set_style("whitegrid")
        sns.boxplot(col_x, col, data=TrainingCombined)
        plt.xlabel(col_x) # Set text for the x axis
        plt.ylabel(col)# Set text for y axis
        plt.show()

num_cols = ['loan_amount', 'applicant_income','log_loan_amount', 'log_applicant_income',
            'population', 'minority_population_pct', 'ffiecmedian_family_income', 'tract_to_msa_md_income_pct', 'number_of_owner-occupied_units', 'number_of_1_to_4_family_units']
plot_box(TrainingCombined, num_cols)

TypeError: cannot label index with a null key

In [20]:
state_rates = TrainingCombined[['state_code', 'accepted']].groupby('state_code').mean()
state_rates.columns = ['state_acceptance_rate']

state_rates_dictionary = state_rates.to_dict()
print(state_rates_dictionary)

{'state_acceptance_rate': {0.0: 0.48776290630975144, 1.0: 0.6022052586938084, 2.0: 0.4567042972715248, 3.0: 0.49708104395604397, 4.0: 0.5753616425571628, 5.0: 0.4370895718413449, 6.0: 0.48971895019632156, 7.0: 0.6230314960629921, 8.0: 0.4734322642412638, 9.0: 0.48201043669321614, 10.0: 0.49891939994914825, 11.0: 0.5212765957446809, 12.0: 0.5845471817606079, 13.0: 0.524022346368715, 14.0: 0.5157615633899637, 15.0: 0.5746219592373438, 16.0: 0.48081571933304407, 17.0: 0.5761802575107297, 18.0: 0.4507945358238082, 19.0: 0.5658362989323843, 20.0: 0.5359650787625735, 21.0: 0.5679403120054262, 22.0: 0.48792629892423894, 23.0: 0.5877513711151737, 24.0: 0.49662487945998074, 25.0: 0.4912040778268167, 26.0: 0.5333649160801299, 27.0: 0.5511022044088176, 28.0: 0.5600298284862043, 29.0: 0.5782258064516129, 30.0: 0.6046511627906976, 31.0: 0.47700442695523854, 32.0: 0.5316898990592598, 33.0: 0.5430793157076206, 34.0: 0.49696151249155973, 35.0: 0.5976665412118931, 36.0: 0.4865125240847784, 37.0: 0.5432

In [21]:
TrainingCombined['state_acceptance_rate'] = TrainingCombined['state_code'].map(state_rates_dictionary['state_acceptance_rate'])
TrainingCombined['state_acceptance_rate'] = TrainingCombined['state_acceptance_rate'].fillna(TrainingCombined['state_acceptance_rate'].mean())

TrainingInputs['state_acceptance_rate'] = TrainingInputs['state_code'].map(state_rates_dictionary['state_acceptance_rate'])
TrainingInputs['state_acceptance_rate'] = TrainingInputs['state_acceptance_rate'].fillna(TrainingInputs['state_acceptance_rate'].mean())

Test['state_acceptance_rate'] = Test['state_code'].map(state_rates_dictionary['state_acceptance_rate'])
Test['state_acceptance_rate'] = Test['state_acceptance_rate'].fillna(Test['state_acceptance_rate'].mean())
#logs_df['EmployeeName'] = logs_df.EmployeeID.map(employees)

In [22]:
TrainingInputs.to_csv('StatesInputs.csv')
Test.to_csv('StatesTest.csv')

In [23]:
Lender_rates = TrainingCombined[['lender', 'accepted']].groupby('lender').mean()
lender_rates_dictionary = Lender_rates.to_dict()
print(lender_rates_dictionary)

{'accepted': {0: 1.0, 1: 1.0, 2: 1.0, 4: 0.5, 5: 0.3333333333333333, 6: 0.3333333333333333, 7: 1.0, 8: 1.0, 9: 0.47058823529411764, 10: 0.25, 11: 0.23529411764705882, 12: 0.8, 13: 0.8, 14: 1.0, 15: 1.0, 17: 0.0, 18: 0.5, 19: 0.8636363636363636, 20: 0.16666666666666666, 21: 0.6, 23: 1.0, 24: 0.3333333333333333, 25: 0.0, 26: 1.0, 29: 0.5, 30: 0.8214285714285714, 31: 0.4605263157894737, 32: 1.0, 33: 0.06806282722513089, 34: 0.4, 35: 0.625, 36: 0.2857142857142857, 37: 0.4423076923076923, 38: 0.5, 39: 0.7222222222222222, 40: 0.3611111111111111, 41: 0.8, 42: 0.8888888888888888, 45: 1.0, 46: 0.3870967741935484, 47: 0.3333333333333333, 48: 0.5789473684210527, 49: 0.06666666666666667, 50: 0.6666666666666666, 51: 0.7, 52: 0.0, 53: 1.0, 54: 0.5833333333333334, 55: 1.0, 56: 0.6111111111111112, 57: 0.8829787234042553, 58: 1.0, 59: 0.5, 60: 0.07142857142857142, 61: 0.6666666666666666, 62: 1.0, 63: 0.8416666666666667, 64: 0.5, 65: 0.8205128205128205, 66: 0.5, 67: 0.6666666666666666, 68: 1.0, 69: 0.0,

In [24]:
TrainingInputs['lender_rate'] = TrainingInputs['lender'].map(lender_rates_dictionary['accepted'])
Test['lender_rate'] = Test['lender'].map(lender_rates_dictionary['accepted'])
Test['lender_rate'] = Test['lender_rate'].fillna(Test['lender_rate'].mean())


In [25]:
LoanPurpose_rates = TrainingCombined[['loan_purpose', 'accepted']].groupby('loan_purpose').mean()
loanpurpose_rates_dictionary = LoanPurpose_rates.to_dict()
print(loanpurpose_rates_dictionary)

{'accepted': {1: 0.594418560198791, 2: 0.32917923645063973, 3: 0.45288251535006385}}


In [26]:
TrainingInputs['purpose_acceptance_rate'] = TrainingInputs['loan_purpose'].map(loanpurpose_rates_dictionary['accepted'])
Test['purpose_acceptance_rate'] = Test['loan_purpose'].map(loanpurpose_rates_dictionary['accepted'])
print(Test[['loan_purpose', 'purpose_acceptance_rate']].head())

   loan_purpose  purpose_acceptance_rate
0             3                 0.452883
1             1                 0.594419
2             1                 0.594419
3             1                 0.594419
4             1                 0.594419


In [27]:
LoanType_rates = TrainingCombined[['loan_type', 'accepted']].groupby('loan_type').mean()
loantype_rates_dictionary = LoanType_rates.to_dict()
print(loantype_rates_dictionary)

{'accepted': {1: 0.4959482454082362, 2: 0.5051316268348902, 3: 0.5190405101085499, 4: 0.561063417335918}}


In [28]:
TrainingInputs['loan_type_acceptance_rate'] = TrainingInputs['loan_type'].map(loantype_rates_dictionary['accepted'])
Test['loan_type_acceptance_rate'] = Test['loan_type'].map(loantype_rates_dictionary['accepted'])
print(Test[['loan_type', 'loan_type_acceptance_rate']].head())

   loan_type  loan_type_acceptance_rate
0          2                   0.505132
1          1                   0.495948
2          1                   0.495948
3          2                   0.505132
4          2                   0.505132


In [29]:
PropertyType_rates = TrainingCombined[['property_type', 'accepted']].groupby('property_type').mean()
propertytype_rates_dictionary = PropertyType_rates.to_dict()
print(propertytype_rates_dictionary)

{'accepted': {1: 0.5107660329933901, 2: 0.2258750823159921, 3: 0.6846229187071499}}


In [30]:
TrainingInputs['property_type_acceptance_rate'] = TrainingInputs['property_type'].map(propertytype_rates_dictionary['accepted'])
Test['property_type_acceptance_rate'] = Test['property_type'].map(propertytype_rates_dictionary['accepted'])
print(Test[['property_type', 'property_type_acceptance_rate']].head())

   property_type  property_type_acceptance_rate
0              1                       0.510766
1              1                       0.510766
2              1                       0.510766
3              1                       0.510766
4              1                       0.510766


In [31]:
Gender_rates = TrainingCombined[['applicant_sex', 'accepted']].groupby('applicant_sex').mean()
gender_rates_dictionary = Gender_rates.to_dict()
print(gender_rates_dictionary)

{'accepted': {1: 0.520822910267696, 2: 0.46645342814748453, 3: 0.40605448475156414, 4: 0.7959471365638766}}


In [32]:
TrainingInputs['gender_acceptance_rate'] = TrainingInputs['applicant_sex'].map(gender_rates_dictionary['accepted'])
Test['gender_acceptance_rate'] = Test['applicant_sex'].map(gender_rates_dictionary['accepted'])
print(Test[['applicant_sex', 'gender_acceptance_rate']].head())

   applicant_sex  gender_acceptance_rate
0              1                0.520823
1              1                0.520823
2              2                0.466453
3              2                0.466453
4              3                0.406054


In [33]:
TrainingInputs.to_csv('InputRates.csv')
Test.to_csv('TestRates.csv')

In [34]:
Features= TrainingInputs[['log_applicant_income', 'loan_amount', 'state_acceptance_rate', 'lender_rate', 'property_type_acceptance_rate', 'loan_type_acceptance_rate', 'purpose_acceptance_rate']]

In [35]:
FeaturesLabel = pd.concat([Features, TrainingLabels], axis =1)
FeaturesLabel.to_csv('Final Features Correlation.csv')

In [36]:
TrainingFeaturesArray = np.array(Features)
TrainingLabelsArray = np.array(TrainingCombined[['accepted']])
TestFeatures = np.array(Test[['log_applicant_income', 'loan_amount', 'state_acceptance_rate', 'lender_rate', 'property_type_acceptance_rate', 'loan_type_acceptance_rate', 'purpose_acceptance_rate']])

In [37]:
X_train, X_test, y_train,  y_test = train_test_split(TrainingFeaturesArray, TrainingLabelsArray, test_size = .3, random_state =42)
print(X_train.shape)
print(y_train.shape)

(350000, 7)
(350000, 1)


In [38]:
clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(accuracy_score(pred, y_test))

  y = column_or_1d(y, warn=True)


0.6980733333333333


In [39]:
from sklearn.model_selection import KFold


def cv_score(clf, x, y, score_func=accuracy_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold).split(x): 
        clf.fit(x[train], y[train]) 
        result += score_func(clf.predict(x[test]), y[test]) 
    return result / nfold

In [41]:
param_grid = [.01,.1,1,10,100]
scores = []

for i in param_grid:
    clf = linear_model.LogisticRegression(C=i)
    score = cv_score(clf, X_train, y_train, score_func=(accuracy_score))
    scores.append(score)
print ("Different scores using different C values: {}".format(scores))
print ("Maximum score: {}".format(max(scores)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Different scores using different C values: [0.6962314285714286, 0.6978485714285714, 0.6980428571428571, 0.6983571428571429, 0.6983257142857143]
Maximum score: 0.6983571428571429


In [42]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_predicted = rfc.predict(X_test)

  


In [43]:
print(accuracy_score(y_predicted, y_test))

0.6814466666666666
