In [50]:
import pandas as pd
# Load the labelled training table and separate the target column from the inputs.
train_dataset = pd.read_csv("features/v1/learn.csv")
train_labels = train_dataset["loan_status"]
train_features = train_dataset.drop(columns="loan_status")

In [51]:
from imblearn.combine import SMOTETomek
# Resample the labelled data with SMOTETomek; the fixed seed keeps the
# resampled set identical between runs.
smt = SMOTETomek(sampling_strategy="auto", random_state=19)
resampled = smt.fit_resample(train_features, train_labels)
features_res, labels_res = resampled

In [52]:
from  sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_res, labels_res, random_state=19)

In [53]:
from sklearn.ensemble import RandomForestClassifier
# Sweep the number of trees and record each fitted forest together with its
# score on the held-out split.
tree_counts = [3, 5, 15, 25, 30, 50, 150, 250, 300, 500]
classifiers = []
for estimator in tree_counts:
    # fit() returns the estimator, so `rf` stays bound to the last fitted forest.
    rf = RandomForestClassifier(n_estimators=estimator, random_state=19).fit(x_train, y_train)
    classifiers.append(rf)
estimators = list(tree_counts)
scores = [forest.score(x_test, y_test) for forest in classifiers]
technique = ["random forest"] * len(classifiers)
rf_data = pd.DataFrame(
    {
        "technique": technique,
        "classifier": classifiers,
        "score": scores,
        "estimator": estimators,
    }
)
rf_data
    

Unnamed: 0,technique,classifier,score,estimator
0,random forest,"(DecisionTreeClassifier(max_features='auto', r...",0.833333,3
1,random forest,"(DecisionTreeClassifier(max_features='auto', r...",0.869565,5
2,random forest,"(DecisionTreeClassifier(max_features='auto', r...",0.898551,15
3,random forest,"(DecisionTreeClassifier(max_features='auto', r...",0.905797,25
4,random forest,"(DecisionTreeClassifier(max_features='auto', r...",0.92029,30
5,random forest,"(DecisionTreeClassifier(max_features='auto', r...",0.927536,50
6,random forest,"(DecisionTreeClassifier(max_features='auto', r...",0.913043,150
7,random forest,"(DecisionTreeClassifier(max_features='auto', r...",0.905797,250
8,random forest,"(DecisionTreeClassifier(max_features='auto', r...",0.905797,300
9,random forest,"(DecisionTreeClassifier(max_features='auto', r...",0.913043,500


In [54]:
from sklearn.neighbors import KNeighborsClassifier
# Sweep k for the nearest-neighbours classifier and tabulate the score of
# every fitted model on the held-out split.
neighbors = list(range(1, 10))
classifiers = []
for neighbor in neighbors:
    # fit() returns the estimator, so `kn` stays bound to the last fitted model.
    kn = KNeighborsClassifier(n_neighbors=neighbor).fit(x_train, y_train)
    classifiers.append(kn)
scores = [model.score(x_test, y_test) for model in classifiers]
technique = ["nearest neighbors"] * len(classifiers)
kn_data = pd.DataFrame(
    {
        "technique": technique,
        "classifier": classifiers,
        "neighbors": neighbors,
        "scores": scores,
    }
)
kn_data

Unnamed: 0,technique,classifier,neighbors,scores
0,nearest neighbors,KNeighborsClassifier(n_neighbors=1),1,0.818841
1,nearest neighbors,KNeighborsClassifier(n_neighbors=2),2,0.746377
2,nearest neighbors,KNeighborsClassifier(n_neighbors=3),3,0.768116
3,nearest neighbors,KNeighborsClassifier(n_neighbors=4),4,0.717391
4,nearest neighbors,KNeighborsClassifier(),5,0.724638
5,nearest neighbors,KNeighborsClassifier(n_neighbors=6),6,0.652174
6,nearest neighbors,KNeighborsClassifier(n_neighbors=7),7,0.695652
7,nearest neighbors,KNeighborsClassifier(n_neighbors=8),8,0.681159
8,nearest neighbors,KNeighborsClassifier(n_neighbors=9),9,0.695652


In [56]:
from sklearn.neural_network import MLPClassifier

# Grid over network depth (1-5 hidden layers), width (4-19 units per layer),
# activation function and learning-rate schedule. Every combination is
# fitted once and scored on the held-out split.
# NOTE(review): the recorded scores are identical across learning_rate values
# for a given shape/activation, and no solver is set here. scikit-learn
# documents learning_rate as only used with solver='sgd' — confirm whether
# this dimension of the sweep is intended.
grid = [
    ((layer_size,) * n_layers, activation, learn_rate)
    for n_layers in range(1, 6)
    for layer_size in range(4, 20)
    for activation in ["logistic", "tanh", "relu"]
    for learn_rate in ["constant", "invscaling", "adaptive"]
]
shapes = [combo[0] for combo in grid]
activations = [combo[1] for combo in grid]
learn_rates = [combo[2] for combo in grid]

classifiers = []
for shape, activation, learn_rate in grid:
    nn = MLPClassifier(
        hidden_layer_sizes=shape,
        activation=activation,
        learning_rate=learn_rate,
        random_state=19,
    )
    nn.fit(x_train, y_train)
    classifiers.append(nn)

scores = [model.score(x_test, y_test) for model in classifiers]
technique = ["Multi Layer Perceptron"] * len(classifiers)

nn_data = pd.DataFrame(
    {
        "technique": technique,
        "classifier": classifiers,
        "scores": scores,
        "hidden_layer_sizes": shapes,
        "activation": activations,
        "learning_rate": learn_rates,
    }
)
nn_data



Unnamed: 0,technique,classifier,scores,hidden_layer_sizes,activation,learning_rate
0,Multi Layer Perceptron,"MLPClassifier(activation='logistic', hidden_la...",0.442029,"(4,)",logistic,constant
1,Multi Layer Perceptron,"MLPClassifier(activation='logistic', hidden_la...",0.442029,"(4,)",logistic,invscaling
2,Multi Layer Perceptron,"MLPClassifier(activation='logistic', hidden_la...",0.442029,"(4,)",logistic,adaptive
3,Multi Layer Perceptron,"MLPClassifier(activation='tanh', hidden_layer_...",0.434783,"(4,)",tanh,constant
4,Multi Layer Perceptron,"MLPClassifier(activation='tanh', hidden_layer_...",0.434783,"(4,)",tanh,invscaling
...,...,...,...,...,...,...
715,Multi Layer Perceptron,"MLPClassifier(activation='tanh', hidden_layer_...",0.572464,"(19, 19, 19, 19, 19)",tanh,invscaling
716,Multi Layer Perceptron,"MLPClassifier(activation='tanh', hidden_layer_...",0.572464,"(19, 19, 19, 19, 19)",tanh,adaptive
717,Multi Layer Perceptron,"MLPClassifier(hidden_layer_sizes=(19, 19, 19, ...",0.557971,"(19, 19, 19, 19, 19)",relu,constant
718,Multi Layer Perceptron,"MLPClassifier(hidden_layer_sizes=(19, 19, 19, ...",0.557971,"(19, 19, 19, 19, 19)",relu,invscaling


In [59]:
nn_data.sort_values(by="scores", inplace=False, ascending=False)

Unnamed: 0,technique,classifier,scores,hidden_layer_sizes,activation,learning_rate
382,Multi Layer Perceptron,"MLPClassifier(activation='tanh', hidden_layer_...",0.659420,"(14, 14, 14)",tanh,invscaling
381,Multi Layer Perceptron,"MLPClassifier(activation='tanh', hidden_layer_...",0.659420,"(14, 14, 14)",tanh,constant
30,Multi Layer Perceptron,"MLPClassifier(activation='tanh', hidden_layer_...",0.659420,"(7,)",tanh,constant
32,Multi Layer Perceptron,"MLPClassifier(activation='tanh', hidden_layer_...",0.659420,"(7,)",tanh,adaptive
31,Multi Layer Perceptron,"MLPClassifier(activation='tanh', hidden_layer_...",0.659420,"(7,)",tanh,invscaling
...,...,...,...,...,...,...
223,Multi Layer Perceptron,"MLPClassifier(hidden_layer_sizes=(12, 12), lea...",0.311594,"(12, 12)",relu,invscaling
224,Multi Layer Perceptron,"MLPClassifier(hidden_layer_sizes=(12, 12), lea...",0.311594,"(12, 12)",relu,adaptive
187,Multi Layer Perceptron,"MLPClassifier(hidden_layer_sizes=(8, 8), learn...",0.289855,"(8, 8)",relu,invscaling
188,Multi Layer Perceptron,"MLPClassifier(hidden_layer_sizes=(8, 8), learn...",0.289855,"(8, 8)",relu,adaptive


In [5]:
# Load the rows to predict; keep the loan ids for the output file and drop
# both the id and the label column from the model inputs.
target_dataset = pd.read_csv("features/v1/test.csv")
target_ids = target_dataset["loan_id"]
target_features = target_dataset.drop(columns=["loan_status", "loan_id"])

In [6]:
# Pick the forest explicitly instead of relying on `rf`, which is only
# whatever model the n_estimators sweep bound on its final iteration.
# The forest with the highest recorded score in rf_data is used
# (idxmax returns the first row on ties).
best_rf = rf_data.at[rf_data["score"].idxmax(), "classifier"]
probabilities = best_rf.predict_proba(target_features)
# Column 0 of predict_proba corresponds to best_rf.classes_[0].
# NOTE(review): the variable name says "positive" — confirm that classes_[0]
# is the class the output file is expected to contain.
positive_probabilities = list(probabilities[:, 0])

In [7]:
import csv
# Write one (Id, Predicted) row per target loan.
# newline="" is required by the csv module: without it the writer's own
# "\r\n" line endings get translated again on platforms that use "\r\n",
# which produces a blank line after every row.
with open("predictions/v1/random_forest_resampled.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["Id", "Predicted"])
    writer.writerows(zip(target_ids, positive_probabilities))
