In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.preprocessing import StandardScaler

import pandas as pd
import prepare_data
from matplotlib import pyplot as plt
import numpy as np
import submit_csv

In [None]:
# Build the training frame from the raw CSV with the project's prepare_data helper.
# NOTE(review): later cells index the result with a column list and call
# `.to_numpy()`, so it is presumably a pandas DataFrame — confirm in prepare_data.
train_df = prepare_data.titanic_data_shaping('./Data/train.csv')

In [None]:
# Columns fed to every model, in the order they appear in the feature matrix.
features = [
    "Sex",
    "Pclass",
    "SibSp",
    "Parch",
    "Feature_Age",
    "Feature_Title",
    "Feature_Ticket_Number",
    "Fare",
]

In [None]:
# Shared ROC figure: evaluate_model() reads `ax_roc_curve` as a global and adds
# one curve per model to it, so all classifiers end up on the same axes.
fig1, ax_roc_curve = plt.subplots()

def train_model(current_model,df,features,label):
    """Standardise the selected columns of ``df`` and fit ``current_model`` on them.

    Parameters
    ----------
    current_model : estimator exposing ``fit(X, y)``
        Fitted in place and returned.
    df : DataFrame-like
        Source of both the feature columns and the label column.
    features : list of str
        Column names used as model inputs.
    label : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(current_model, X_train, y_train)`` where ``X_train`` is the
        standardised feature matrix and ``y_train`` the flattened label vector.

    Notes
    -----
    The ``StandardScaler`` is created and discarded here, so callers that need
    to transform new data must re-fit a scaler on the same training columns.
    """
    X_train = df[features].to_numpy()
    X_train = StandardScaler().fit_transform(X_train)
    # Read the labels from the frame that was passed in, not from a notebook
    # global, so features and labels always come from the same rows.
    y_train = df[[label]].to_numpy().ravel()
    current_model.fit(X_train,y_train)

    return current_model, X_train, y_train

def evaluate_model(current_model,X_train,y_train):
    """Cross-validate a binary classifier and plot its precision/recall and ROC curves.

    Out-of-fold scores are obtained with 5-fold ``cross_val_predict``. A new
    figure is created for precision and recall versus threshold; the ROC curve
    is added to the notebook-level ``ax_roc_curve`` axes so several models can
    be compared on one plot. The ROC AUC is printed.

    Parameters
    ----------
    current_model : estimator
        Must expose ``decision_function`` or ``predict_proba``.
    X_train : array-like
        Feature matrix passed to ``cross_val_predict``.
    y_train : array-like
        Binary target vector.

    Returns
    -------
    tuple
        ``(ax_precision_vs_recall, ax_roc_curve)``.

    Raises
    ------
    TypeError
        If the estimator offers neither ``decision_function`` nor
        ``predict_proba``.
    """
    model_name = type(current_model).__name__

    # Choose the scoring method from what the estimator actually offers, so any
    # classifier works (SGDClassifier -> decision_function,
    # RandomForestClassifier -> predict_proba, as before).
    if hasattr(current_model, 'decision_function'):
        cv_method = 'decision_function'
    elif hasattr(current_model, 'predict_proba'):
        cv_method = 'predict_proba'
    else:
        raise TypeError(
            '{} exposes neither decision_function nor predict_proba'.format(model_name)
        )

    y_train_scores = cross_val_predict(current_model,X_train,y_train,cv=5,method=cv_method)

    if cv_method == 'predict_proba':
        # predict_proba returns one column per class; keep the positive class.
        y_train_scores = y_train_scores[:,1]

    precisions, recalls, thresholds = precision_recall_curve(y_train,y_train_scores)

    # The figure handle is not needed; `_` avoids shadowing the notebook's fig1.
    _, ax_precision_vs_recall = plt.subplots()
    # precision_recall_curve returns one more precision/recall value than
    # thresholds, hence the [:-1].
    ax_precision_vs_recall.plot(thresholds,precisions[:-1],'-',label='Precision')
    ax_precision_vs_recall.plot(thresholds,recalls[:-1],'--',label='Recall')
    ax_precision_vs_recall.set_xlabel('Threshold')
    ax_precision_vs_recall.legend()
    ax_precision_vs_recall.grid()
    ax_precision_vs_recall.set_title(model_name)

    fpr, tpr, _ = roc_curve(y_train,y_train_scores)
    ax_roc_curve.plot(fpr,tpr,label=model_name)
    ax_roc_curve.plot([0,1],[0,1],'k--')  # chance-level diagonal
    ax_roc_curve.set_xlabel('False positive rate')
    ax_roc_curve.set_ylabel('True positive rate')
    ax_roc_curve.legend()

    print('{} : roc_auc_score = {}'.format(model_name,roc_auc_score(y_train,y_train_scores)))

    return ax_precision_vs_recall, ax_roc_curve

# y_train_scores = cross_val_predict(randomforest_model,X_train,y_train,cv=3,method=)
# fpr_forest , tpr_forest, thresholds_forest = roc_curve(y_train,y_train_scores[:,1])

# Fit both candidate classifiers on the same standardised training matrix.
sgd_model, X_train, y_train = train_model(
    current_model=SGDClassifier(random_state=42),
    df=train_df,
    features=features,
    label='Survived',
)
rdnforest_model, X_train, y_train = train_model(
    current_model=RandomForestClassifier(random_state=42, max_features=4),
    df=train_df,
    features=features,
    label='Survived',
)

# Cross-validated diagnostics; both ROC curves land on the shared axes.
evaluate_model(current_model=sgd_model, X_train=X_train, y_train=y_train)
evaluate_model(current_model=rdnforest_model, X_train=X_train, y_train=y_train)



In [None]:
# Hyper-parameter candidates for the random forest.
param_grid = [
    {'n_estimators':[75, 100, 125], 'max_features':[3, 4, 5]},
]

# Seed the forest like the other models in this notebook; without a seed the
# selected best_params_ can change from one run to the next.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=4,
    scoring='roc_auc',
    return_train_score=True,
)

grid_search.fit(X_train,y_train)

grid_search.best_params_

In [None]:
test_df = prepare_data.titanic_data_shaping('./Data/test.csv')
X_topredict = test_df[features].to_numpy()

# Standardise the test features with statistics learned from the training
# columns (the same data train_model scales), not from the test set itself,
# so both sets are on the scale the models were fitted on.
X_topredict_scaled = StandardScaler().fit(train_df[features].to_numpy()).transform(X_topredict)
qzdjbkl = X_topredict_scaled  # original variable name, kept in case other cells still use it

In [None]:
# Submit with the forest fitted above, which is bound to `rdnforest_model`.
# NOTE(review): that model was fitted on standardised features — confirm that
# submit_csv.from_classifier applies the same scaling to test_df[features].
submit_csv.from_classifier(test_df,features,rdnforest_model)

In [None]:
# Compare the class balance of the reference submission with our predictions.
example_csv = pd.read_csv('./Data/gender_submission.csv')
prediction_csv = pd.read_csv('./deeplearning_submission.csv')

(
    example_csv['Survived'].value_counts(),
    prediction_csv['Survived'].value_counts(),
)