In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from pydataset import data

import acquire
import prepare

In [None]:
df = acquire.get_titanic_data()
df.head()

In [None]:
df = prepare.prep_titanic_data(df)
df.head()

In [None]:
df.shape

In [None]:
df = df.drop_duplicates()
df.shape

In [None]:
df.info()

In [None]:
df.age = df.age.fillna(df.age.mean())

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df = df.drop(columns=['pclass', 'embarked', 'embarked_encode', 'passenger_id'])

In [None]:
df = pd.get_dummies(df, ['sex', 'class', 'embark_town'], drop_first=True)
df.head()

In [None]:
train, validate, test = prepare.split(df, stratify_by='survived')

In [None]:
train.head()

In [None]:
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [None]:
train.head()

In [None]:

y_train[0:10]

In [None]:
baseline = y_train.mode()
matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [None]:
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

In [None]:
tree1 = DecisionTreeClassifier(max_depth=3, random_state=123)

tree1 = tree1.fit(X_train, y_train)

y_predictions = tree1.predict(X_train)


In [None]:
plt.figure(figsize=(12, 7))
plot_tree(tree1, feature_names=X_train.columns, class_names=['0','1'])
plt.show()


In [None]:
print('Accuracy on training set: {:.2f}'
      .format(tree1.score(X_train, y_train)))

In [None]:
print(classification_report(y_train, y_predictions))

In [None]:
metrics = []
for i in range(1, 25):
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    tree = tree.fit(X_train, y_train)

    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_of_sample_accuracy = tree.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
### Telco

In [2]:
df = acquire.get_telco_data()
df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [None]:
df.info()

In [3]:
df = df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'])


In [None]:
df.head()

In [5]:

df = pd.get_dummies(df, [['gender','tenure', 'monthly_charges']], dummy_na=False, drop_first=True)
df.head()


ValueError: Length of 'prefix' (1) did not match the length of the columns being encoded (17).

In [None]:
train, validate, test = prepare.split(df, stratify_by='churn')

In [None]:
X_train = train.drop(columns=["churn"])
y_train = train.churn

X_validate = validate.drop(columns=["churn"])
y_validate = validate.churn

X_test = test.drop(columns=["churn"])
y_test = test.churn

In [None]:
X_train.head()

In [None]:
y_train[:5]

In [None]:
y_train.value_counts()

In [None]:
baseline = y_train.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [None]:
## titanic random forest

rf = RandomForestClassifier(max_depth=10, 
                            random_state=123)
rf

In [None]:
rf.fit(X_train, y_train)

In [None]:
print(rf.feature_importances_)

In [None]:
y_pred = rf.predict(X_train)
y_pred

In [None]:
y_pred_proba = rf.predict_proba(X_train)
y_pred_proba

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
# The mode is a great baseline
baseline = y_train.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [None]:
train.head()

In [None]:
metrics = []

for i in range(1, 25):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_of_sample_accuracy = tree.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
y_pred = rf.predict(X_train)
y_pred

In [None]:
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
TN, FP, FN, TP