In [33]:
import get_db
import explore
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Acquire data

In [3]:
#acquire data from the get_db import
df = get_db.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


# Prepare

#### Check for nulls, duplicates, fill in the blanks, encode strings for decision tree, split data.

In [4]:
df.isna().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
deck            688
embark_town       2
alone             0
dtype: int64

In [5]:
#drop columns
df = df.drop(columns=["deck", "class", "embarked"])

In [6]:
#set index
df = df.set_index("passenger_id")

In [7]:
#fill in the 2 null embark_town values with the mode of listed towns 
df.embark_town = df.embark_town.fillna(value=df.embark_town.mode())

In [8]:
#fill in the age nulls with the average age
df.age = df.age.fillna(value=df.age.median())

# Clean data

In [9]:
#get_dummies creates a seperate df of booleans for the identified columns below. Cleaning for the decission tree.
dummy_df = pd.get_dummies(df[['sex','embark_town']], dummy_na=False, drop_first=[True, True])

In [10]:
#now drop the above two columns...
df = df.drop(columns=["sex", "embark_town"])
#...and concatanate the dummies df with the prep's df.
df = pd.concat([df, dummy_df], axis=1)
df.head()

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1


# Explore - Train, Validate and Test

In [11]:
#split data
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [12]:
#split into train, validate, test
train, validate, test = train_validate_test_split(df, target='survived')
train.head(2)

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
583,0,1,36.0,0,0,40.125,1,1,0,0
165,1,3,9.0,0,2,20.525,0,1,0,1


In [13]:
print(train.shape, validate.shape, test.shape)

(498, 10) (214, 10) (179, 10)


In [14]:
# create X & y version of train, where y is a series with just the target variable and X are all the features.
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

# Evaluate

In [15]:
# The mode is a great baseline
baseline = y_train.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = y_train == 0

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

Baseline accuracy: 0.62


# Model (decision Tree)

### Create the model

In [16]:
#create the model

#for classification you can change the algorithm to gini or entropy (information gain).  
#Default is gini.

model = DecisionTreeClassifier(max_depth=1, random_state=123)

### Fit the model

In [17]:
#Fit the model

model = model.fit(X_train, y_train)

### Use the model

In [18]:
#Use the model

# We'll evaluate the model's performance on train, first

y_predictions = model.predict(X_train)

In [19]:
# Produce the classification report on the actual y values and this model's predicted y values
report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of 1 depth")
pd.DataFrame(report)

Tree of 1 depth


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.820433,0.76,0.799197,0.790217,0.797255
recall,0.863192,0.696335,0.799197,0.779764,0.799197
f1-score,0.84127,0.726776,0.799197,0.784023,0.797358
support,307.0,191.0,0.799197,498.0,498.0


In [20]:
import graphviz
from graphviz import Graph

dot_data = export_graphviz(model, feature_names= X_train.columns, class_names=model.classes_, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

TypeError: can only concatenate str (not "numpy.int64") to str

In [21]:
#Create depth loop 
for i in range(2, 11):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions = tree.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

Tree with max depth of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217      0.797255
recall       0.863192    0.696335  0.799197    0.779764      0.799197
f1-score     0.841270    0.726776  0.799197    0.784023      0.797358
support    307.000000  191.000000  0.799197  498.000000    498.000000

Tree with max depth of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.829341    0.817073  0.825301    0.823207      0.824636
recall       0.902280    0.701571  0.825301    0.801925      0.825301
f1-score     0.864275    0.754930  0.825301    0.809602      0.822337
support    307.000000  191.000000  0.825301  498.000000    498.000000

Tree with max depth of 4
                    0           1  accuracy   macro avg  weighted avg
precision    0.831858    0.842767  0.835341    0.837313      0.836042
recall       0.918567    0.701571  0.835341    0.810069      0.835341
f1-score     

# Random Forrest

#### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

#### Create the model

In [22]:
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', min_samples_leaf=1, n_estimators=100,
                            max_depth=10, 
                            random_state=123)

#### Fit the model

In [23]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

#### Use the model

In [24]:
#evaluate the weight
print(rf.feature_importances_)

[0.0904826  0.21647804 0.05389004 0.02936327 0.23995522 0.0178975
 0.31316541 0.01459049 0.02417742]


In [25]:
#estimate
y_pred = rf.predict(X_train)

In [26]:
#probability
y_pred_proba = rf.predict_proba(X_train)

In [27]:
#accuracy
print('Accuracy of random forest classifier on training set: {:.2f}'.format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [28]:
#confusion matrix
print(confusion_matrix(y_train, y_pred))

[[307   0]
 [ 15 176]]


In [29]:
#Create a classificaiton report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       307
           1       1.00      0.92      0.96       191

    accuracy                           0.97       498
   macro avg       0.98      0.96      0.97       498
weighted avg       0.97      0.97      0.97       498



#### Evaluate on Out-of-Sample data

In [30]:
print('Accuracy of random forest classifier on test set: {:.2f}'.format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.81


In [31]:
#Create depth loop 
for i in range (11, 2):
    # Make the model
    tree = RandomForestClassifier(max_depth=i, min_samples_leaf=2)

    # Fit the model (on train and only train)
    rf.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_pred = rf.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    print(classification_report(y_train, y_pred))

# KNN Exercise

In [34]:
#Create the model
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [35]:
#fit the model
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [36]:
#predict/use the model
y_pred = knn.predict(X_train)

In [37]:
#probability
y_pred_proba = knn.predict_proba(X_train)

In [38]:
#accuracy
print('Accuracy of KNN classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.79


In [39]:
#confusion matrix
print(confusion_matrix(y_train, y_pred))

[[256  51]
 [ 52 139]]


In [40]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       307
           1       0.73      0.73      0.73       191

    accuracy                           0.79       498
   macro avg       0.78      0.78      0.78       498
weighted avg       0.79      0.79      0.79       498



In [41]:
#validate
print('Accuracy of KNN classifier on test set: {:.2f}'.format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on test set: 0.71


### 4. Run through steps 2-4 setting k to 10

In [43]:

knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

print('Accuracy of KNN classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on training set: 0.75
[[276  31]
 [ 95  96]]
              precision    recall  f1-score   support

           0       0.74      0.90      0.81       307
           1       0.76      0.50      0.60       191

    accuracy                           0.75       498
   macro avg       0.75      0.70      0.71       498
weighted avg       0.75      0.75      0.73       498

Accuracy of KNN classifier on test set: 0.71


In [44]:
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

print('Accuracy of KNN classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on training set: 0.72
[[269  38]
 [103  88]]
              precision    recall  f1-score   support

           0       0.72      0.88      0.79       307
           1       0.70      0.46      0.56       191

    accuracy                           0.72       498
   macro avg       0.71      0.67      0.67       498
weighted avg       0.71      0.72      0.70       498

Accuracy of KNN classifier on test set: 0.72
