In [14]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

data = pd.read_csv('cs-training.csv').drop('Unnamed: 0', axis=1)

# Remove rows win NaN values
data = data.dropna()

data.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [5]:
type(data)

pandas.core.frame.DataFrame

## Initial exploration

### Data dictionary

* **SeriousDlqin2yrs**: Person experienced 90 days past due delinquency or worse

* **RevolvingUtilizationOfUnsecuredLines**: Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits

* **age**: Age of borrower in years

* **NumberOfTime30-59DaysPastDueNotWorse**: Number of times borrower has been 30-59 days past due but no worse in the last 2 years.

* **DebtRatio**: Monthly debt payments, alimony,living costs divided by monthy gross income

* **MonthlyIncome**: Monthly income

* **NumberOfOpenCreditLinesAndLoans**: Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards)

* **NumberOfTimes90DaysLate**: Number of times borrower has been 90 days or more past due.

* **NumberRealEstateLoansOrLines**: Number of mortgage and real estate loans including home equity lines of credit

* **NumberOfTime60-89DaysPastDueNotWorse**: Number of times borrower has been 60-89 days past due but no worse in the last 2 years.

* **NumberOfDependents**: Number of dependents in family excluding themselves (spouse, children etc.)

In [6]:
# Calculate number of samples
n_samples = data.shape[0]

# Calculate number of features
n_features = data.shape[1] - 1

# Calculate positive cases
n_positive = data[data.SeriousDlqin2yrs == 1].shape[0]

# TODO: Calculate failing students
n_negative = data[data.SeriousDlqin2yrs == 0].shape[0]

# TODO: Calculate positive rate
positive_rate = 100 * (n_positive / float(n_samples))

# Print the results
print "Total number of samples: {}".format(n_samples)
print "Number of features: {}".format(n_features)
print "Number of positives: {}".format(n_positive)
print "Number of negatives: {}".format(n_negative)
print "Positive rate: {:.2f}%".format(positive_rate)

Total number of samples: 120269
Number of features: 10
Number of positives: 8357
Number of negatives: 111912
Positive rate: 6.95%


In [7]:
y_all = data['SeriousDlqin2yrs']
X_all = data.drop('SeriousDlqin2yrs', axis=1)

print
print type(X_all)
print
print type(y_all)

# # Extract feature columns
# feature_cols = list(data.columns[1:])
# 
# # Extract target column 'SeriousDlqin2yrs'
# target_col = data.columns[0]
# 
# # Show the list of columns
# print "Feature columns:\n{}".format(feature_cols)
# print "\nTarget column: {}".format(target_col)
# 
# # Separate the data into feature data and target data (X_all and y_all, respectively)
# X_all = data[feature_cols]
# y_all = data[target_col]
# 
# print
# print type(X_all)
# print
# print type(y_all)


<class 'pandas.core.frame.DataFrame'>

<class 'pandas.core.series.Series'>


In [8]:
from sklearn.cross_validation import train_test_split

# TODO: Shuffle and split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.40, random_state=9)

# Success
print "Training and testing split was successful."

Training and testing split was successful.


In [9]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# train set
clf = clf.fit(X_train, y_train)
print clf.score(X_train, y_train)

# test set
print clf.score(X_test, y_test)

0.990410332451
0.930177932984


In [42]:
real_data = pd.read_csv('cs-test.csv').drop('Unnamed: 0', axis=1)

real_X_all = real_data.drop('SeriousDlqin2yrs', axis=1)

print "Data loaded!"

Data loaded!


In [43]:
real_X_all

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
1,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0
2,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0
3,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0
4,1.000000,27,0,0.019917,3865.0,4,0,0,0,1.0
5,0.509791,63,0,0.342429,4140.0,4,0,0,0,1.0
6,0.587778,50,0,1048.000000,0.0,5,0,0,0,3.0
7,0.046149,79,1,0.369170,3301.0,8,0,1,0,1.0
8,0.013527,68,0,2024.000000,,4,0,1,0,0.0
9,1.000000,23,98,0.000000,0.0,0,98,0,98,0.0


In [44]:
pred = clf.predict(real_X_all)

print "Prediction done!"

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [40]:
df = pd.DataFrame(pred, columns=['Probability'])
df['Id'] = df.index + 1

df.head()

df.to_csv(
    'submission_random_forest.csv',
    index=False,
    columns=['Id', 'Probability']
)

print "Submission file saved!"

Submission file saved!


In [21]:
pred = pd.DataFrame()
# pred['Id'] = real_X_all['Id']
pred = clf.predict(real_X_all)

submission = pd.DataFrame({
    "Id": real_data.index,
    "Probability": pred
})

submission.head()

# pred.to_csv(
#     'submission_random_forest.csv',
#     index=True, index_label='Id'
# )

# pred['Survived'] = clf.predict(X_test)
# pred.to_csv('submission_random_forest.csv', index = False)
# 
# print 'Got 70.33% accuracy using Random Forest Classifier'

ValueError: arrays must all be same length