In [1]:
import math
from scipy import stats
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
import statistics
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import export_graphviz
from sklearn.impute import SimpleImputer


from env import user, host, password, get_db_url
import acquire as aq
import prepare as prep

import warnings
warnings.filterwarnings("ignore")

# Decision Tree Exercises

## Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

**1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.**

In [2]:
#acquire data

df = aq.get_titanic_data()
df.head()

Using cached csv...


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
# Prep/clean data
df = prep.prep_titanic(df)
df.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,0,1
3,1,1,1,0,53.1,0,0,0,1
4,0,3,0,0,8.05,1,1,0,1


In [4]:
#Split the data
train, validate, test = prep.train_validate_test_split(df, target= 'survived')
print(train.shape)
print(validate.shape)
print(test.shape)

(498, 9)
(214, 9)
(179, 9)


In [5]:
train.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,0,1,0,0,40.125,1,1,0,0
165,1,3,0,2,20.525,0,1,0,1
50,0,3,4,1,39.6875,0,1,0,1
259,1,2,0,1,26.0,0,0,0,1
306,1,1,0,0,110.8833,1,0,0,0


In [6]:
# Create the baseline model

# possibility for adding baseline as column == train['baseline_assumption_death'] = 0


baseline = (train.survived == 0).mean()
print(f'Our baseline accuracy is {baseline}. We can compare our model accuracy to this figure.')

Our baseline accuracy is 0.6164658634538153. We can compare our model accuracy to this figure.


**2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)**

In [7]:
# Creating x & y version of train, validate, test
X_train = train.drop(columns='survived')
y_train = train.survived

X_validate = validate.drop(columns='survived')
y_validate = validate.survived

X_test = test.drop(columns='survived')
y_test = test.survived

In [8]:
#Sanity check -- no data leakage or contamination 
X_train.head()

Unnamed: 0,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,1,0,0,40.125,1,1,0,0
165,3,0,2,20.525,0,1,0,1
50,3,4,1,39.6875,0,1,0,1
259,2,0,1,26.0,0,0,0,1
306,1,0,0,110.8833,1,0,0,0


In [9]:
#Create and fit our model

clf1 = DecisionTreeClassifier()

clf1 = clf1.fit(X_train, y_train)

In [10]:
# Add model prediction to our dataframe
train['prediction'] = clf1.predict(X_train)
train.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,prediction
583,0,1,0,0,40.125,1,1,0,0,0
165,1,3,0,2,20.525,0,1,0,1,1
50,0,3,4,1,39.6875,0,1,0,1,0
259,1,2,0,1,26.0,0,0,0,1,1
306,1,1,0,0,110.8833,1,0,0,0,1


**3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.**

**4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.**

In [11]:
X = X_train
y = y_train

accuracy = clf1.score(X, y)
y_pred = clf1.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
conf


The accuracy for our model is 0.9458
The True Positive Rate is 0.869, The False Positive Rate is 0.00651,
The True Negative Rate is 0.993, and the False Negative Rate is 0.131



array([[305,   2],
       [ 25, 166]])

In [12]:
# Classification report
class_report

Unnamed: 0,precision,recall,f1-score,support
0,0.924242,0.993485,0.957614,307.0
1,0.988095,0.86911,0.924791,191.0
accuracy,0.945783,0.945783,0.945783,0.945783
macro avg,0.956169,0.931298,0.941202,498.0
weighted avg,0.948732,0.945783,0.945025,498.0


In [13]:
#Turn our confusion matrix into a dataframe
conf_df = pd.DataFrame(conf, columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])
conf_df

Unnamed: 0,predict_death,predict_survive
actual_death,305,2
actual_survive,25,166


**5. Run through steps 2-4 using a different max_depth value.**

In [14]:
#Create and fit our model

clf2 = DecisionTreeClassifier(max_depth=3, random_state=123)

clf2 = clf2.fit(X_train, y_train)

In [15]:
# Creating predictions and evaluating 

X = X_train
y = y_train

accuracy = clf2.score(X, y)
y_pred = clf2.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
conf


The accuracy for our model is 0.8233
The True Positive Rate is 0.702, The False Positive Rate is 0.101,
The True Negative Rate is 0.899, and the False Negative Rate is 0.298



array([[276,  31],
       [ 57, 134]])

In [16]:
# Classification report
class_report

Unnamed: 0,precision,recall,f1-score,support
0,0.828829,0.899023,0.8625,307.0
1,0.812121,0.701571,0.752809,191.0
accuracy,0.823293,0.823293,0.823293,0.823293
macro avg,0.820475,0.800297,0.807654,498.0
weighted avg,0.822421,0.823293,0.82043,498.0


**6. Which model performs better on your in-sample data?**

In [17]:
model1_accuracy = clf1.score(X, y)
model2_accuracy = clf2.score(X, y)
print(model1_accuracy)
print(model2_accuracy)
print('My model 1 performed better on in-sample data.')

0.9457831325301205
0.8232931726907631
My model 1 performed better on in-sample data.


**7. Which model performs best on your out-of-sample data, the validate set?**

In [18]:
y_val_pred_1 = clf1.predict(X_validate)
y_val_pred_2 = clf2.predict(X_validate)

In [19]:
model1_validate_accuracy = clf1.score(X_validate, y_validate)
model2_validate_accuracy = clf2.score(X_validate, y_validate)

In [20]:
print(model1_validate_accuracy)
print(model2_validate_accuracy)
print('My model 2 performed better on the validate data set.')

0.7523364485981309
0.7850467289719626
My model 2 performed better on the validate data set.


# Random Forest Exercises

**1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.**

In [21]:
# Create and fit Random Forest Model

clf = RandomForestClassifier(min_samples_leaf=1, max_depth=10, random_state=123)
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

**2. Evaluate your results using the model score, confusion matrix, and classification report.**

**3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.**

In [22]:
# Creating predictions and evaluating
accuracy = clf.score(X, y)
y_pred = clf.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
conf


The accuracy for our model is 0.9438
The True Positive Rate is 0.885, The False Positive Rate is 0.0195,
The True Negative Rate is 0.98, and the False Negative Rate is 0.115



array([[301,   6],
       [ 22, 169]])

In [23]:
# Classification report
class_report

Unnamed: 0,precision,recall,f1-score,support
0,0.931889,0.980456,0.955556,307.0
1,0.965714,0.884817,0.923497,191.0
accuracy,0.943775,0.943775,0.943775,0.943775
macro avg,0.948801,0.932636,0.939526,498.0
weighted avg,0.944862,0.943775,0.94326,498.0


**4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.**

In [24]:
# Create and fit Random Forest Model

clf = RandomForestClassifier(min_samples_leaf=3, max_depth=5, random_state=123)
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, min_samples_leaf=3, random_state=123)

In [25]:
# Creating predictions and evalutating

accuracy = clf.score(X, y)
y_pred = clf.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
conf


The accuracy for our model is 0.8373
The True Positive Rate is 0.665, The False Positive Rate is 0.0554,
The True Negative Rate is 0.945, and the False Negative Rate is 0.335



array([[290,  17],
       [ 64, 127]])

In [26]:
# Classification report
class_report

Unnamed: 0,precision,recall,f1-score,support
0,0.819209,0.944625,0.877458,307.0
1,0.881944,0.664921,0.758209,191.0
accuracy,0.837349,0.837349,0.837349,0.837349
macro avg,0.850577,0.804773,0.817834,498.0
weighted avg,0.84327,0.837349,0.831722,498.0


**5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?**

The first model had lower mimimum sample leafs and higher max depth allowing the model treest to be very extensive and to fit very well to this specific data. Adjusting for a higher minimum sample leafs and lower max_depth makes led to worse in-sample accuracy because every tree in the random forest is smaller while and the higher min_samples_leaf restricts every sample. However, this approach avoids overfiting and the second model should perform better on the validate set and other sets "in the wild". 


# KNN Exercises

**1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)**

In [27]:
# Create and fit K-Nearest Neighbor model 

knn = KNeighborsClassifier()
knn = knn.fit(X_train, y_train)

**2. Evaluate your results using the model score, confusion matrix, and classification report.**

**3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.**

In [28]:
# Creating predictions and evaluating 
accuracy = knn.score(X, y)
y_pred = knn.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
conf


The accuracy for our model is 0.8072
The True Positive Rate is 0.717, The False Positive Rate is 0.137,
The True Negative Rate is 0.863, and the False Negative Rate is 0.283



array([[265,  42],
       [ 54, 137]])

In [29]:
# Classification report
class_report

Unnamed: 0,precision,recall,f1-score,support
0,0.830721,0.863192,0.846645,307.0
1,0.765363,0.717277,0.740541,191.0
accuracy,0.807229,0.807229,0.807229,0.807229
macro avg,0.798042,0.790235,0.793593,498.0
weighted avg,0.805654,0.807229,0.805951,498.0


**4. Run through steps 2-4 setting k to 10**

In [30]:
# Create and fit K-Nearest Neighbor model 

knn10 = KNeighborsClassifier(n_neighbors = 10)
knn10 = knn10.fit(X_train, y_train)

In [31]:
# Creating predictions and evaluating 
accuracy = knn10.score(X, y)
y_pred = knn10.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
conf


The accuracy for our model is 0.7831
The True Positive Rate is 0.644, The False Positive Rate is 0.13,
The True Negative Rate is 0.87, and the False Negative Rate is 0.356



array([[267,  40],
       [ 68, 123]])

In [32]:
# Classification report
class_report

Unnamed: 0,precision,recall,f1-score,support
0,0.797015,0.869707,0.831776,307.0
1,0.754601,0.643979,0.694915,191.0
accuracy,0.783133,0.783133,0.783133,0.783133
macro avg,0.775808,0.756843,0.763345,498.0
weighted avg,0.780748,0.783133,0.779285,498.0


**5. Run through setps 2-4 setting k to 20**

In [33]:
# Create and fit K-Nearest Neighbor model 

knn20 = KNeighborsClassifier(n_neighbors = 20)
knn20 = knn20.fit(X_train, y_train)

In [34]:
# Creating predictions and evaluating 

accuracy = knn20.score(X, y)
y_pred = knn20.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
conf


The accuracy for our model is 0.7369
The True Positive Rate is 0.545, The False Positive Rate is 0.143,
The True Negative Rate is 0.857, and the False Negative Rate is 0.455



array([[263,  44],
       [ 87, 104]])

In [35]:
# Classification report
class_report

Unnamed: 0,precision,recall,f1-score,support
0,0.751429,0.856678,0.800609,307.0
1,0.702703,0.544503,0.613569,191.0
accuracy,0.736948,0.736948,0.736948,0.736948
macro avg,0.727066,0.70059,0.707089,498.0
weighted avg,0.732741,0.736948,0.728873,498.0


**6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?**

The first model seems to perform best on every metric for our in-sample data. I think this is because having K set to a lower number reduces dimensionality creating a more effective model for this data set. 

**7. Which model performs best on our out-of-sample data from validate?**

In [36]:
# Comparing model accuracy for in-same and out-of-sample data

print(f'''The k5 in-sample score is {knn.score(X_train, y_train):.3}''')
print(f'''The k5 out-of-sample score is {knn.score(X_validate, y_validate):.3}''')

print(f'''The k10 in-sample score is {knn10.score(X_train, y_train):.3}''')
print(f'''The k10 out-of-sample score is {knn10.score(X_validate, y_validate):.3}''')

print(f'''The k20 in-sample score is {knn20.score(X_train, y_train):.3}''')
print(f'The k20 out-of-sample score is {knn20.score(X_validate, y_validate):.3}''')

The k5 in-sample score is 0.807
The k5 out-of-sample score is 0.743
The k10 in-sample score is 0.783
The k10 out-of-sample score is 0.715
The k20 in-sample score is 0.737
The k20 out-of-sample score is 0.673


*Our first model using the default k value of 5 performed best on both in-sample and out-of-sample data.*

## Logistic Regression Exercises

**1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?**

In [37]:
## Create our dateframe from scratch
# acquire data using my predefined function
# Prepping data with these specifications

df = aq.get_titanic_data()

def prep_titanic(df):
    df.drop_duplicates(inplace= True)
    #Drop unnecessary columns
    columns_to_drop = ['deck', 'embarked', 'class', 'passenger_id']
    df = df.drop(columns = columns_to_drop)
    #Fill nulls for age and embark_town
    age_mean = df.age.mean()
    df['age'] = df.age.fillna(age_mean)
    df['embark_town'] = df.embark_town.fillna('Southampton')
    #Create dummies to encode sex and embark town
    dummy_df = pd.get_dummies(df[['sex', 'embark_town']], 
                         dummy_na = False, 
                         drop_first = [True, True])
    df = pd.concat([df, dummy_df], axis = 1)
    return df.drop(columns= ['sex', 'embark_town'])
    

df = prep_titanic(df)
df.head()

Using cached csv...


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1


In [38]:
#sanity check
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 891 non-null    int64  
 1   pclass                   891 non-null    int64  
 2   age                      891 non-null    float64
 3   sibsp                    891 non-null    int64  
 4   parch                    891 non-null    int64  
 5   fare                     891 non-null    float64
 6   alone                    891 non-null    int64  
 7   sex_male                 891 non-null    uint8  
 8   embark_town_Queenstown   891 non-null    uint8  
 9   embark_town_Southampton  891 non-null    uint8  
dtypes: float64(2), int64(5), uint8(3)
memory usage: 58.3 KB


In [39]:
#Split the data
train, validate, test = prep.train_validate_test_split(df, target= 'survived')
print(train.shape)
print(validate.shape)
print(test.shape)

(498, 10)
(214, 10)
(179, 10)


In [40]:
# Creating x & y version of train, validate, test
X_train = train.drop(columns='survived')
y_train = train.survived

X_validate = validate.drop(columns='survived')
y_validate = validate.survived

X_test = test.drop(columns='survived')
y_test = test.survived

In [41]:
#sanity check
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,1,36.0,0,0,40.125,1,1,0,0
165,3,9.0,0,2,20.525,0,1,0,1
50,3,7.0,4,1,39.6875,0,1,0,1
259,2,50.0,0,1,26.0,0,0,0,1
306,1,29.699118,0,0,110.8833,1,0,0,0


In [42]:
# Create our logistic regression model using specified features

features = ['age', 'pclass', 'fare']
logit1 = LogisticRegression(random_state=123)
logit1 = logit1.fit(X_train[features], y_train)

In [43]:
# Creating predictions and evaluating 

X = X_train[features]
y = y_train

accuracy = logit1.score(X, y)
y_pred = logit1.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.7028
The True Positive Rate is 0.44, The False Positive Rate is 0.134,
The True Negative Rate is 0.866, and the False Negative Rate is 0.56



Unnamed: 0,precision,recall,f1-score,support
0,0.713137,0.86645,0.782353,307.0
1,0.672,0.439791,0.531646,191.0
accuracy,0.702811,0.702811,0.702811,0.702811
macro avg,0.692568,0.65312,0.656999,498.0
weighted avg,0.697359,0.702811,0.686198,498.0


In [44]:
#Calculate baseline
baseline = (train.survived == 0).mean()
print(f'Our baseline accuracy is {baseline}. We can compare our model accuracy to this figure.')

Our baseline accuracy is 0.6164658634538153. We can compare our model accuracy to this figure.


*On in sample data, this model does perform better than the baseline.*

**2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.**

In [45]:
# Create our logistic regression model

features2= ['age', 'pclass', 'fare', 'sex_male']
logit2 = LogisticRegression(random_state=123)
logit2 = logit2.fit(X_train[features2], y_train)

In [46]:
# Creating predictions and evaluating 

X = X_train[features2]
y = y_train

accuracy = logit2.score(X, y)
y_pred = logit2.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.8133
The True Positive Rate is 0.728, The False Positive Rate is 0.134,
The True Negative Rate is 0.866, and the False Negative Rate is 0.272



Unnamed: 0,precision,recall,f1-score,support
0,0.836478,0.86645,0.8512,307.0
1,0.772222,0.727749,0.749326,191.0
accuracy,0.813253,0.813253,0.813253,0.813253
macro avg,0.80435,0.797099,0.800263,498.0
weighted avg,0.811834,0.813253,0.812128,498.0


*This model, which includes sex, performs significantly better than the first.*

**3. Try out other combinations of features and models.**

*Our 3rd model will include features: pclass, age, fare, sex, and embark_town.*

In [47]:
# Our third logistic regression model

features3 = ['pclass', 'age', 'fare', 'sex_male', 'embark_town_Queenstown', 'embark_town_Southampton']
logit3 = LogisticRegression(random_state=123)
logit3 = logit3.fit(X_train[features3], y_train)

In [48]:
# Creating predictions and evaluating 

X = X_train[features3]

accuracy = logit3.score(X, y)
y_pred = logit3.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.8032
The True Positive Rate is 0.728, The False Positive Rate is 0.15,
The True Negative Rate is 0.85, and the False Negative Rate is 0.272



Unnamed: 0,precision,recall,f1-score,support
0,0.833866,0.850163,0.841935,307.0
1,0.751351,0.727749,0.739362,191.0
accuracy,0.803213,0.803213,0.803213,0.803213
macro avg,0.792609,0.788956,0.790649,498.0
weighted avg,0.802219,0.803213,0.802595,498.0


*Our 4th model will include features: age, alone and sex. I will change model paramaters class_weight to 'balanced'.*

In [49]:
# Our fourth logistic regression model

features4 = ['age', 'alone', 'sex_male' ]
logit4 = LogisticRegression(random_state=123, class_weight='balanced')
logit4 = logit4.fit(X_train[features4], y_train)

In [50]:
# Creating predictions and evaluating 

X = X_train[features4]

accuracy = logit4.score(X, y)
y_pred = logit4.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.7992
The True Positive Rate is 0.696, The False Positive Rate is 0.137,
The True Negative Rate is 0.863, and the False Negative Rate is 0.304



Unnamed: 0,precision,recall,f1-score,support
0,0.820433,0.863192,0.84127,307.0
1,0.76,0.696335,0.726776,191.0
accuracy,0.799197,0.799197,0.799197,0.799197
macro avg,0.790217,0.779764,0.784023,498.0
weighted avg,0.797255,0.799197,0.797358,498.0


*Our 5th model will include features: sex, fare, pclass and I will adjust model parameters C to .2, and intercept_scaling to .5.*

In [51]:
# Our fifth logistic regression model

features5 = ['sex_male', 'fare', 'pclass' ]
logit5 = LogisticRegression(random_state=123, C=0.2, intercept_scaling=.5)
logit5 = logit5.fit(X_train[features5], y_train)

In [52]:
# Creating predictions and evaluating 

X = X_train[features5]

accuracy = logit5.score(X, y)
y_pred = logit5.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.7992
The True Positive Rate is 0.696, The False Positive Rate is 0.137,
The True Negative Rate is 0.863, and the False Negative Rate is 0.304



Unnamed: 0,precision,recall,f1-score,support
0,0.820433,0.863192,0.84127,307.0
1,0.76,0.696335,0.726776,191.0
accuracy,0.799197,0.799197,0.799197,0.799197
macro avg,0.790217,0.779764,0.784023,498.0
weighted avg,0.797255,0.799197,0.797358,498.0


**4. Use you best 3 models to predict and evaluate on your validate sample.**

*Model Scores*
- Model 1: .7028
- Model 2: .8133
- Model 3: .8032
- Model 4: .7992
- Model 5: .7992

*Our top 3 are Models 2, 3, 4(tied with 5).*

In [53]:
# Fit Model 2 on Validate

logit2 = LogisticRegression(random_state=123)
logit2 = logit2.fit(X_validate[features2], y_validate)

In [54]:
# Evaluating Model 2

X = X_validate[features2]
y = y_validate

accuracy = logit2.score(X, y)
y_pred = logit2.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.785
The True Positive Rate is 0.634, The False Positive Rate is 0.121,
The True Negative Rate is 0.879, and the False Negative Rate is 0.366



Unnamed: 0,precision,recall,f1-score,support
0,0.794521,0.878788,0.834532,132.0
1,0.764706,0.634146,0.693333,82.0
accuracy,0.785047,0.785047,0.785047,0.785047
macro avg,0.779613,0.756467,0.763933,214.0
weighted avg,0.783096,0.785047,0.780428,214.0


In [55]:
# Fit Model 3 on Validate
logit3 = LogisticRegression(random_state=123)
logit3 = logit3.fit(X_validate[features3], y_validate)

In [56]:
# Creating predictions and evaluating 

X = X_validate[features3]

accuracy = logit3.score(X, y)
y_pred = logit3.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.785
The True Positive Rate is 0.646, The False Positive Rate is 0.129,
The True Negative Rate is 0.871, and the False Negative Rate is 0.354



Unnamed: 0,precision,recall,f1-score,support
0,0.798611,0.871212,0.833333,132.0
1,0.757143,0.646341,0.697368,82.0
accuracy,0.785047,0.785047,0.785047,0.785047
macro avg,0.777877,0.758777,0.765351,214.0
weighted avg,0.782721,0.785047,0.781235,214.0


In [57]:
# Fit Model 4 on Validate

logit4 = LogisticRegression(random_state=123, class_weight='balanced')
logit4 = logit4.fit(X_validate[features4], y_validate)

In [58]:
X = X_validate[features4]

accuracy = logit4.score(X, y)
y_pred = logit4.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.7617
The True Positive Rate is 0.659, The False Positive Rate is 0.174,
The True Negative Rate is 0.826, and the False Negative Rate is 0.341



Unnamed: 0,precision,recall,f1-score,support
0,0.79562,0.825758,0.810409,132.0
1,0.701299,0.658537,0.679245,82.0
accuracy,0.761682,0.761682,0.761682,0.761682
macro avg,0.74846,0.742147,0.744827,214.0
weighted avg,0.759478,0.761682,0.76015,214.0


**5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?**

*Models 2 and 3 are virtually tied but two has slightly better accuracy and precision so I will choose Model 2 as my best.*

In [59]:
# Fit Model 2 on Test

logit2 = LogisticRegression(random_state=123)
logit2 = logit2.fit(X_test[features2], y_test)

In [60]:
# Evaluating Model 2 on Test

X = X_test[features2]
y = y_test

accuracy = logit2.score(X, y)
y_pred = logit2.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.7933
The True Positive Rate is 0.638, The False Positive Rate is 0.109,
The True Negative Rate is 0.891, and the False Negative Rate is 0.362



Unnamed: 0,precision,recall,f1-score,support
0,0.796748,0.890909,0.841202,110.0
1,0.785714,0.637681,0.704,69.0
accuracy,0.793296,0.793296,0.793296,0.793296
macro avg,0.791231,0.764295,0.772601,179.0
weighted avg,0.792495,0.793296,0.788314,179.0


*Model 2 perfomed better on test than on validate on all metrics. It has less overall accuracy and precision than on train but slightly better recall.*