In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import graphviz
from graphviz import Graph

import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

In [None]:
df = acquire.get_titanic_data()
df.head()

In [None]:
df = df.set_index("passenger_id")
df = df.drop(columns=["class", "embarked"])

In [None]:
df.isna().sum()

In [None]:
df = df.drop(columns=["deck"])

In [None]:
df.embark_town = df.embark_town.fillna(value=df.embark_town.mode())

In [None]:
no_age_info = df[df.age.isna()]
no_age_info.alone.value_counts()

In [None]:
df.fare.hist(), no_age_info.fare.hist();

In [None]:
for column in df.drop(columns=["age", "fare"]).columns:
    print(column)
    print("Population:")
    print(df[column].value_counts(normalize=True))
    print("No age")
    print(no_age_info[column].value_counts(normalize=True))
    print()
    print()

In [None]:
df.age = df.age.fillna(value=df.age.median())

In [None]:
# Time to encode the encodeable!
dummy_df = pd.get_dummies(df[['sex','embark_town']], dummy_na=False, drop_first=[True, True])

# Drop the original columns we encoded
df = df.drop(columns=["sex", "embark_town"])

# Stitch the df and the dummy_df together again
df = pd.concat([df, dummy_df], axis=1)
df.head()

In [None]:
train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.survived)
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)

In [None]:
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [None]:
# The mode is a great baseline
baseline = y_train.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = y_train == 0

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [None]:
# Make the model
tree1 = DecisionTreeClassifier(max_depth=1, random_state=123)

# Fit the model (on train and only train)
tree1 = tree1.fit(X_train, y_train)

# Use the model
# We'll evaluate the model's performance on train, first
y_predictions = tree1.predict(X_train)

# Produce the classification report on the actual y values and this model's predicted y values
report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of 1 depth")
pd.DataFrame(report)

In [None]:
for i in range(2, 11):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions = tree.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

In [None]:
metrics = []

for i in range(2, 25):
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    tree = tree.fit(X_train, y_train)

    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_of_sample_accuracy = tree.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

# Decision Tree Exercises

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import graphviz
from graphviz import Graph

import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

In [None]:
df = acquire.get_titanic_data()
df.head()

In [None]:
df = df.set_index("passenger_id")
df = df.drop(columns=["class", "embarked"])

In [None]:
df.isna().sum()

In [None]:
df = df.drop(columns=["deck"])

In [None]:
df.embark_town = df.embark_town.fillna(value=df.embark_town.mode())

In [None]:
no_age_info = df[df.age.isna()]
no_age_info.alone.value_counts()

In [None]:
for column in df.drop(columns=["age", "fare"]).columns:
    print(column)
    print("Population:")
    print(df[column].value_counts(normalize=True))
    print("No age")
    print(no_age_info[column].value_counts(normalize=True))
    print('----------------------------------')

In [None]:
df.age = df.age.fillna(value=df.age.median())

In [None]:
dummy_df = pd.get_dummies(df[['sex','embark_town']], dummy_na=False, drop_first=[True, True])

df = df.drop(columns=["sex", "embark_town"])

df = pd.concat([df, dummy_df], axis=1)
df.head()

In [None]:
train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.survived)
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [None]:
baseline = y_train.mode()

matches_baseline_prediction = y_train == 0

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [None]:
forest1 = RandomForestClassifier(max_depth=1, random_state=123)

tree1 = forest1.fit(X_train, y_train)


y_predictions = forest1.predict(X_train)

report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of 1 depth")
pd.DataFrame(report)

In [None]:
for i in range(2, 11):
    forest = RandomForestClassifier(max_depth=i, random_state=123)

    forest = forest.fit(X_train, y_train)

    y_predictions = forest.predict(X_train)

    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print('-------------------------------------------------------------------------------')

In [None]:
metrics = []

for i in range(2, 25):
    forest = RandomForestClassifier(max_depth=i, random_state=123)

    forest = forest.fit(X_train, y_train)

    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)

In [None]:
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
threshold = 0.10

models = []
metrics = []

for i in range(2, 25):
    forest = RandomForestClassifier(max_depth=i, min_samples_leaf=1, random_state=123)

    forest = forest.fit(X_train, y_train)

    in_sample_accuracy = forest.score(X_train, y_train)   
    out_of_sample_accuracy = forest.score(X_validate, y_validate)

    difference = in_sample_accuracy - out_of_sample_accuracy
    
    if difference > threshold:
        break
    
    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy,
        "difference": difference
    }
    
    metrics.append(output)
    
    models.append(forest)
    
df = pd.DataFrame(metrics)
df

In [None]:
metrics = []
max_depth = 20

for i in range(2, max_depth):
    depth = max_depth - i
    n_samples = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n_samples, random_state=123)

    forest = forest.fit(X_train, y_train)

    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)

    output = {
        "min_samples_per_leaf": n_samples,
        "max_depth": depth,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
metrics = []
max_depth = 20

for i in range(2, max_depth):
    # Make the model
    depth = i
    n_samples = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n_samples, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)

    output = {
        "min_samples_per_leaf": n_samples,
        "max_depth": depth,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
metrics = []
max_depth = 50

for i in range(2, max_depth):
    depth = 10
    n_samples = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n_samples, random_state=123)

    forest = forest.fit(X_train, y_train)

    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)

    output = {
        "min_samples_per_leaf": n_samples,
        "max_depth": depth,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

# KNN Exercises

#### 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

#### 4. Run through steps 2-4 setting k to 10

#### 5. Run through steps 2-4 setting k to 20

#### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

#### 7. Which model performs best on our out-of-sample data from validate?

## 1.

In [29]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score

import prepare
import acquire

# read Titanic data from sql
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [30]:
# split data 
train = df
validate = df
test = df

In [37]:
X_train = train.drop(columns=['survived', 'sex', 'embark_town'])
y_train = train.survived

X_validate = validate.drop(columns=['survived', 'sex', 'embark_town'])
y_validate = validate.survived

X_test = test.drop(columns=['survived', 'sex', 'embark_town'])
y_test = test.survived

In [38]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [39]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

## 2.

In [40]:
#compute accuracy score
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.74


In [42]:
y_pred = knn.predict(X_train)
print(confusion_matrix(y_train, y_pred))

[[483  66]
 [163 179]]


In [43]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.88      0.81       549
           1       0.73      0.52      0.61       342

    accuracy                           0.74       891
   macro avg       0.74      0.70      0.71       891
weighted avg       0.74      0.74      0.73       891



## 3.

In [44]:
#create a function to calculate the metrics
def get_metrics_knn(knn, X, y):
    y_pred = knn.predict(X)
    accuracy = knn.score(X, y)
    conf = confusion_matrix(y, y_pred)
    class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    prfs = pd.DataFrame(precision_recall_fscore_support(y, y_pred), index=['precision', 'recall', 'f1-score', 'support'])
    
    print(f'''
    The accuracy for our model is: {accuracy:.2%}
    The True Positive Rate is: {tpr:.2%}
    The False Positive Rate is: {fpr:.2%}
    The True Negative Rate is: {tnr:.2%}
    The False Negative Rate is: {fnr:.2%}

    
    
    ''')
    return prfs, class_report

In [45]:
a, b = get_metrics_knn(knn, X_train, y_train)
display(a, b)


    The accuracy for our model is: 74.30%
    The True Positive Rate is: 52.34%
    The False Positive Rate is: 12.02%
    The True Negative Rate is: 87.98%
    The False Negative Rate is: 47.66%

    
    
    


Unnamed: 0,0,1
precision,0.747678,0.730612
recall,0.879781,0.523392
f1-score,0.808368,0.609881
support,549.0,342.0


Unnamed: 0,precision,recall,f1-score,support
0,0.747678,0.879781,0.808368,549.0
1,0.730612,0.523392,0.609881,342.0
accuracy,0.742985,0.742985,0.742985,0.742985
macro avg,0.739145,0.701587,0.709124,891.0
weighted avg,0.741128,0.742985,0.732181,891.0


## 4.

In [46]:
# Create KNN object
# weights = ['uniform', 'density']
knn10 = KNeighborsClassifier(n_neighbors=10, weights='uniform')

In [47]:
# fit the model
knn10.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [48]:
c, d = get_metrics_knn(knn10, X_train, y_train)
display(c, d)


    The accuracy for our model is: 70.03%
    The True Positive Rate is: 35.67%
    The False Positive Rate is: 8.56%
    The True Negative Rate is: 91.44%
    The False Negative Rate is: 64.33%

    
    
    


Unnamed: 0,0,1
precision,0.695291,0.721893
recall,0.91439,0.356725
f1-score,0.789929,0.477495
support,549.0,342.0


Unnamed: 0,precision,recall,f1-score,support
0,0.695291,0.91439,0.789929,549.0
1,0.721893,0.356725,0.477495,342.0
accuracy,0.700337,0.700337,0.700337,0.700337
macro avg,0.708592,0.635557,0.633712,891.0
weighted avg,0.705502,0.700337,0.670005,891.0


## 5.

In [49]:
# Create KNN object
# weights = ['uniform', 'density']
knn20 = KNeighborsClassifier(n_neighbors=20, weights='uniform')

In [50]:
# fit the model
knn20.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [51]:
e, f = get_metrics_knn(knn20, X_train, y_train)
display(e, f)


    The accuracy for our model is: 68.57%
    The True Positive Rate is: 29.82%
    The False Positive Rate is: 7.29%
    The True Negative Rate is: 92.71%
    The False Negative Rate is: 70.18%

    
    
    


Unnamed: 0,0,1
precision,0.679573,0.71831
recall,0.92714,0.298246
f1-score,0.784284,0.421488
support,549.0,342.0


Unnamed: 0,precision,recall,f1-score,support
0,0.679573,0.92714,0.784284,549.0
1,0.71831,0.298246,0.421488,342.0
accuracy,0.685746,0.685746,0.685746,0.685746
macro avg,0.698941,0.612693,0.602886,891.0
weighted avg,0.694442,0.685746,0.645029,891.0


## 6.

In [52]:
a, b = get_metrics_knn(knn10, X_train, y_train)
display(a, b)


    The accuracy for our model is: 70.03%
    The True Positive Rate is: 35.67%
    The False Positive Rate is: 8.56%
    The True Negative Rate is: 91.44%
    The False Negative Rate is: 64.33%

    
    
    


Unnamed: 0,0,1
precision,0.695291,0.721893
recall,0.91439,0.356725
f1-score,0.789929,0.477495
support,549.0,342.0


Unnamed: 0,precision,recall,f1-score,support
0,0.695291,0.91439,0.789929,549.0
1,0.721893,0.356725,0.477495,342.0
accuracy,0.700337,0.700337,0.700337,0.700337
macro avg,0.708592,0.635557,0.633712,891.0
weighted avg,0.705502,0.700337,0.670005,891.0


In [53]:
c, d = get_metrics_knn(knn10, X_train, y_train)
display(c, d)


    The accuracy for our model is: 70.03%
    The True Positive Rate is: 35.67%
    The False Positive Rate is: 8.56%
    The True Negative Rate is: 91.44%
    The False Negative Rate is: 64.33%

    
    
    


Unnamed: 0,0,1
precision,0.695291,0.721893
recall,0.91439,0.356725
f1-score,0.789929,0.477495
support,549.0,342.0


Unnamed: 0,precision,recall,f1-score,support
0,0.695291,0.91439,0.789929,549.0
1,0.721893,0.356725,0.477495,342.0
accuracy,0.700337,0.700337,0.700337,0.700337
macro avg,0.708592,0.635557,0.633712,891.0
weighted avg,0.705502,0.700337,0.670005,891.0


In [54]:
e, f = get_metrics_knn(knn20, X_train, y_train)
display(e, f)


    The accuracy for our model is: 68.57%
    The True Positive Rate is: 29.82%
    The False Positive Rate is: 7.29%
    The True Negative Rate is: 92.71%
    The False Negative Rate is: 70.18%

    
    
    


Unnamed: 0,0,1
precision,0.679573,0.71831
recall,0.92714,0.298246
f1-score,0.784284,0.421488
support,549.0,342.0


Unnamed: 0,precision,recall,f1-score,support
0,0.679573,0.92714,0.784284,549.0
1,0.71831,0.298246,0.421488,342.0
accuracy,0.685746,0.685746,0.685746,0.685746
macro avg,0.698941,0.612693,0.602886,891.0
weighted avg,0.694442,0.685746,0.645029,891.0


In [None]:
# The first model seems to be working the best with the in-sample data
# because 5 NN is closer than 10 or 20.

## 7.

In [55]:
a, b = get_metrics_knn(knn, X_validate, y_validate)
display(a, b)


    The accuracy for our model is: 74.30%
    The True Positive Rate is: 52.34%
    The False Positive Rate is: 12.02%
    The True Negative Rate is: 87.98%
    The False Negative Rate is: 47.66%

    
    
    


Unnamed: 0,0,1
precision,0.747678,0.730612
recall,0.879781,0.523392
f1-score,0.808368,0.609881
support,549.0,342.0


Unnamed: 0,precision,recall,f1-score,support
0,0.747678,0.879781,0.808368,549.0
1,0.730612,0.523392,0.609881,342.0
accuracy,0.742985,0.742985,0.742985,0.742985
macro avg,0.739145,0.701587,0.709124,891.0
weighted avg,0.741128,0.742985,0.732181,891.0


In [56]:
c, d = get_metrics_knn(knn10, X_validate, y_validate)
display(c, d)


    The accuracy for our model is: 70.03%
    The True Positive Rate is: 35.67%
    The False Positive Rate is: 8.56%
    The True Negative Rate is: 91.44%
    The False Negative Rate is: 64.33%

    
    
    


Unnamed: 0,0,1
precision,0.695291,0.721893
recall,0.91439,0.356725
f1-score,0.789929,0.477495
support,549.0,342.0


Unnamed: 0,precision,recall,f1-score,support
0,0.695291,0.91439,0.789929,549.0
1,0.721893,0.356725,0.477495,342.0
accuracy,0.700337,0.700337,0.700337,0.700337
macro avg,0.708592,0.635557,0.633712,891.0
weighted avg,0.705502,0.700337,0.670005,891.0


In [57]:
e, f = get_metrics_knn(knn20, X_validate, y_validate)
display(e, f)


    The accuracy for our model is: 68.57%
    The True Positive Rate is: 29.82%
    The False Positive Rate is: 7.29%
    The True Negative Rate is: 92.71%
    The False Negative Rate is: 70.18%

    
    
    


Unnamed: 0,0,1
precision,0.679573,0.71831
recall,0.92714,0.298246
f1-score,0.784284,0.421488
support,549.0,342.0


Unnamed: 0,precision,recall,f1-score,support
0,0.679573,0.92714,0.784284,549.0
1,0.71831,0.298246,0.421488,342.0
accuracy,0.685746,0.685746,0.685746,0.685746
macro avg,0.698941,0.612693,0.602886,891.0
weighted avg,0.694442,0.685746,0.645029,891.0


#### Overall the 5NN model is better except for TNR(20NN), FNR (20NN) & precision(20NN)

# Logistic Regression Exercises

In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

import prepare
import acquire

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [15]:
train = df
test = df
validate = df

In [16]:
# setting up baseline
train.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [17]:
baseline_accuracy = (train.survived == 0).mean()
baseline_accuracy

0.6161616161616161

In [18]:
# create X & y version of train, 
# where y is a series with just the target variable and 
# X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [19]:
logit = LogisticRegression(C=1, random_state=3210)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   passenger_id             891 non-null    int64  
 1   survived                 891 non-null    int64  
 2   pclass                   891 non-null    int64  
 3   sex                      891 non-null    object 
 4   sibsp                    891 non-null    int64  
 5   parch                    891 non-null    int64  
 6   fare                     891 non-null    float64
 7   embark_town              891 non-null    object 
 8   alone                    891 non-null    int64  
 9   sex_male                 891 non-null    uint8  
 10  embark_town_Queenstown   891 non-null    uint8  
 11  embark_town_Southampton  891 non-null    uint8  
dtypes: float64(1), int64(6), object(2), uint8(3)
memory usage: 72.2+ KB


In [21]:
# fit the model on train data
# using: age, fare and pclass
logit.fit(X_train[['fare', 'pclass']], y_train)

LogisticRegression(C=1, random_state=3210)

In [22]:
# now use the model to make predictions
y_pred = logit.predict(X_train[['fare', 'pclass']])

In [23]:
y_pred

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,

In [24]:
# View raw probabilities (output from the model)

y_pred_proba = logit.predict_proba(X_train[['fare', 'pclass']])
y_pred_proba = pd.DataFrame(y_pred_proba, columns = ['not-survived', 'survived'])
y_pred_proba.head()

Unnamed: 0,not-survived,survived
0,0.752362,0.247638
1,0.354238,0.645762
2,0.751656,0.248344
3,0.377903,0.622097
4,0.751525,0.248475


In [25]:
#create a function to calculate the metrics
def get_metrics_logit(logit, X, y):
    y_pred = logit.predict(X)
    accuracy = logit.score(X, y)
    conf = confusion_matrix(y, y_pred)
    class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True))
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    prfs = pd.DataFrame(precision_recall_fscore_support(y, y_pred), index=['precision', 'recall', 'f1-score', 'support'])
    print(f'''
    The accuracy for our model is: {accuracy:.2%}
    The True Positive Rate is: {tpr:.2%}
    The False Positive Rate is: {fpr:.2%}
    The True Negative Rate is: {tnr:.2%}
    The False Negative Rate is: {fnr:.2%}
    ''')

In [27]:
get_metrics_logit(logit, X_train[['fare', 'pclass']], y_train)


    The accuracy for our model is: 67.90%
    The True Positive Rate is: 39.77%
    The False Positive Rate is: 14.57%
    The True Negative Rate is: 85.43%
    The False Negative Rate is: 60.23%
    


## 2.

In [28]:
# create the object
# Define the logistic regression model
logit1 = LogisticRegression(random_state=3210)

In [39]:
df

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.2500,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.9250,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1000,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.0500,Southampton,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,0,0,13.0000,Southampton,1,1,0,1
887,887,1,1,female,0,0,30.0000,Southampton,1,0,0,1
888,888,0,3,female,1,2,23.4500,Southampton,0,0,0,1
889,889,1,1,male,0,0,30.0000,Cherbourg,1,1,0,0


In [None]:
dropcols = ['sex', 'embark_town']
df.drop(columns= dropcols, inplace=True)

In [43]:
# fit the model on train data
# using: age, fare and pclass
logit1.fit(X_train[['fare', 'pclass', 'sex_male']], y_train)

LogisticRegression(random_state=3210)

In [46]:
y_pred = logit1.predict(X_train[['fare', 'pclass', 'sex_male']])

In [47]:
y_pred

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,

In [49]:
#raw probabilities (output from the model)
y_pred_proba = logit1.predict_proba(X_train[['fare', 'pclass', 'sex_male']])
y_pred_proba = pd.DataFrame(y_pred_proba, columns = ['not-survived', 'survived'])
y_pred_proba.head()

Unnamed: 0,not-survived,survived
0,0.897798,0.102202
1,0.095468,0.904532
2,0.409013,0.590987
3,0.098074,0.901926
4,0.897677,0.102323


In [50]:
get_metrics_logit(logit1, X_train[['fare', 'pclass', 'sex_male']], y_train)


    The accuracy for our model is: 78.68%
    The True Positive Rate is: 68.71%
    The False Positive Rate is: 15.12%
    The True Negative Rate is: 84.88%
    The False Negative Rate is: 31.29%
    


In [55]:
# create the object
# Define the logistic regression model
logit2 = LogisticRegression(random_state=3210)

In [57]:
# create X & y version of train, 
# where y is a series with just the target variable and 
# X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [58]:
# fit the model on train data
# using: age, fare and pclass
logit2.fit(X_train, y_train)

LogisticRegression(random_state=3210)

In [59]:
y_pred = logit2.predict(X_train)

In [60]:
y_pred

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,

In [61]:
y_pred_proba = logit2.predict_proba(X_train)
y_pred_proba = pd.DataFrame(y_pred_proba, columns = ['not-survived', 'survived'])
y_pred_proba.head()

Unnamed: 0,not-survived,survived
0,0.890109,0.109891
1,0.142208,0.857792
2,0.39431,0.60569
3,0.116521,0.883479
4,0.894187,0.105813


In [62]:
get_metrics_logit(logit2, X_train, y_train)


    The accuracy for our model is: 79.24%
    The True Positive Rate is: 66.96%
    The False Positive Rate is: 13.11%
    The True Negative Rate is: 86.89%
    The False Negative Rate is: 33.04%
    


## 4.

In [64]:
# logit metrics on validate
features = ['fare', 'pclass']

y_pred = logit.predict(X_validate[features])

get_metrics_logit(logit, X_validate[features], y_validate)


    The accuracy for our model is: 67.90%
    The True Positive Rate is: 39.77%
    The False Positive Rate is: 14.57%
    The True Negative Rate is: 85.43%
    The False Negative Rate is: 60.23%
    


In [66]:
# logit1 metrics on validate
features = ['fare', 'pclass', 'sex_male']

y_pred = logit1.predict(X_validate[features])

get_metrics_logit(logit1, X_validate[features], y_validate)


    The accuracy for our model is: 78.68%
    The True Positive Rate is: 68.71%
    The False Positive Rate is: 15.12%
    The True Negative Rate is: 84.88%
    The False Negative Rate is: 31.29%
    


In [67]:
# logit2 metrics on validate
y_pred = logit2.predict(X_validate)

get_metrics_logit(logit2, X_validate, y_validate)


    The accuracy for our model is: 79.24%
    The True Positive Rate is: 66.96%
    The False Positive Rate is: 13.11%
    The True Negative Rate is: 86.89%
    The False Negative Rate is: 33.04%
    


In [68]:
y_pred_validate = logit.predict(X_validate[['fare', 'pclass']])
y_pred_validate1 = logit1.predict(X_validate[['fare', 'pclass', 'sex_male']])
y_pred_validate2 = logit2.predict(X_validate)

## 5.

In [70]:
logit_5 = LogisticRegression(C=1, class_weight='balanced', random_state=123, intercept_scaling=1, solver='lbfgs')

In [72]:
logit_5.fit(X_train, y_train)

LogisticRegression(C=1, class_weight='balanced', random_state=123)

In [73]:
y_pred = logit_5.predict(X_test)
y_pred_proba = logit_5.predict_proba(X_test)

print("Model 5: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit_5.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 5: solver = lbfgs, c = 1
Accuracy: 0.78
[[459  90]
 [102 240]]
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       549
           1       0.73      0.70      0.71       342

    accuracy                           0.78       891
   macro avg       0.77      0.77      0.77       891
weighted avg       0.78      0.78      0.78       891

