In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from pydataset import data

import acquire
import prepare

In [2]:
df = acquire.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
df = prepare.prep_titanic_data(df)
df.head()

KeyError: "['internet_service_type_id', 'payment_type_id', 'contract_type_id', 'customer_id'] not found in axis"

In [None]:
df.shape

In [None]:
df = df.drop_duplicates()
df.shape

In [None]:
df.info()

In [4]:
df.age = df.age.fillna(df.age.mean())

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df = df.drop(columns=['pclass', 'embarked', 'embarked_encode', 'passenger_id'])

In [5]:
dummy_df = pd.get_dummies(df[['sex','embark_town', 'class']], dummy_na=False, drop_first=[True, True])

In [6]:
df = pd.concat([df, dummy_df], axis=1)
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,1,0,1,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,0,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,0,0,1,0,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,0,0,1,0,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,1,0,1,0,1


In [7]:
train, validate, test = prepare.split(df, stratify_by='survived')

In [8]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third
583,583,0,1,male,36.0,0,0,40.125,C,First,A,Cherbourg,1,1,0,0,0,0
165,165,1,3,male,9.0,0,2,20.525,S,Third,,Southampton,0,1,0,1,0,1
50,50,0,3,male,7.0,4,1,39.6875,S,Third,,Southampton,0,1,0,1,0,1
259,259,1,2,female,50.0,0,1,26.0,S,Second,,Southampton,0,0,0,1,1,0
306,306,1,1,female,29.699118,0,0,110.8833,C,First,,Cherbourg,1,0,0,0,0,0


In [9]:
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [None]:
train.head()

In [None]:

y_train[0:10]

In [10]:
baseline = y_train.mode()
matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

Baseline accuracy: 0.62


In [11]:
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

train -> (498, 18)
validate -> (214, 18)
test -> (179, 18)


In [None]:
tree1 = DecisionTreeClassifier(max_depth=3, random_state=123)

tree1 = tree1.fit(X_train, y_train)

y_predictions = tree1.predict(X_train)


In [None]:
plt.figure(figsize=(12, 7))
plot_tree(tree1, feature_names=X_train.columns, class_names=['0','1'])
plt.show()


In [None]:
print('Accuracy on training set: {:.2f}'
      .format(tree1.score(X_train, y_train)))

In [None]:
print(classification_report(y_train, y_predictions))

In [None]:
metrics = []
for i in range(1, 25):
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    tree = tree.fit(X_train, y_train)

    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_of_sample_accuracy = tree.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
### Telco

In [None]:
df = acquire.get_telco_data()
df.head()

In [None]:
df.info()

In [None]:
df = df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'])


In [None]:
df.head()

In [None]:

df = pd.get_dummies(df, [['gender','tenure', 'monthly_charges']], dummy_na=False, drop_first=True)
df.head()


In [None]:
train, validate, test = prepare.split(df, stratify_by='churn')

In [None]:
X_train = train.drop(columns=["churn"])
y_train = train.churn

X_validate = validate.drop(columns=["churn"])
y_validate = validate.churn

X_test = test.drop(columns=["churn"])
y_test = test.churn

In [None]:
X_train.head()

In [None]:
y_train[:5]

In [None]:
y_train.value_counts()

In [None]:
baseline = y_train.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [12]:
## titanic random forest

y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [13]:
baseline = y_train.mode()

matches_baseline_prediction = y_train == 0

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline prediction: {baseline[0]}")
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

Baseline prediction: 0
Baseline accuracy: 0.62


In [14]:
forest1 = RandomForestClassifier(max_depth=3, random_state=123)

forest1.fit(X_train, y_train)

y_predictions = forest1.predict(X_train)

report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of depth 3")
pd.DataFrame(report)

ValueError: could not convert string to float: 'male'

In [16]:
pd.DataFrame(confusion_matrix(y_pred, y_train))

NameError: name 'y_pred' is not defined

In [17]:
TN, FP, FN, TP = confusion_matrix(y_train,y_predictions).ravel()
ALL = TP + TN + FP + FN

TP, TN, FP, FN

NameError: name 'y_predictions' is not defined

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

In [None]:
# The mode is a great baseline
baseline = y_train.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [None]:
train.head()