In [5]:
import pandas as pd
import numpy as np
import plotly_express as px
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
# Load the user-behavior dataset. `is_ultra` is the column the models predict;
# every other column is used as an input feature.
df = pd.read_csv('data/TripleTen user behavior.csv')
target = df['is_ultra']
features = df.drop(columns=['is_ultra'])

# 60/20/20 split: hold out 40% of the rows first, then cut that holdout in half.
(features_train, features_temp,
 target_train, target_temp) = train_test_split(
    features, target, test_size=0.4, random_state=54321
)
# train_test_split returns its two parts in order, so the first half of the
# holdout is kept as the test set and the second half as the validation set.
(features_test, features_valid,
 target_test, target_valid) = train_test_split(
    features_temp, target_temp, test_size=0.5, random_state=54321
)

#### I will now train three classification models to see which one performs best

In [7]:
# Decision tree: sweep max_depth over 1..5 and keep the depth whose fitted tree
# reaches the highest accuracy on the validation set. The comparison is strict,
# so when two depths tie the shallower tree is kept.
bestModel, bestScore, bestDepth = None, 0, 0
for depth in range(1, 6):
    tree = DecisionTreeClassifier(criterion='gini', max_depth=depth, random_state=54321)
    tree.fit(features_train, target_train)
    accuracy = accuracy_score(target_valid, tree.predict(features_valid))
    if accuracy > bestScore:
        bestModel, bestScore, bestDepth = tree, accuracy, depth

print(f"""Accuracy Score of the Best Model: {bestScore}\nValue of Depth Parameter: {bestDepth}""")

Accuracy Score of the Best Model: 0.8164852255054432
Value of Depth Parameter: 3


In [8]:
# Random forest: try forests of 1 to 15 trees and keep the size that scores the
# highest accuracy on the validation set. The comparison is strict, so a tie
# keeps the smaller forest.
bestModel, bestScore, bestEst = None, 0, 0
for est in range(1, 16):
    forest = RandomForestClassifier(n_estimators=est, random_state=54321)
    forest.fit(features_train, target_train)
    score = accuracy_score(target_valid, forest.predict(features_valid))
    if score > bestScore:
        bestModel, bestScore, bestEst = forest, score, est
print(f"Highest Accuracy for Random Forest Model: {bestScore}\nBest Number of Estimators for Random Forest Model: {bestEst}")

Highest Accuracy for Random Forest Model: 0.8149300155520995
Best Number of Estimators for Random Forest Model: 15


In [12]:
# Logistic regression model.
# No hyperparameter sweep is run here, so the model is fitted once on the
# training split and scored directly on the validation split. This cell
# deliberately does not touch bestModel / bestScore, so the winner recorded by
# the preceding search cell stays available instead of being reset to None / 0.
model = LogisticRegression(random_state=54321, solver='liblinear')
model.fit(features_train, target_train)

# Predictions come from the validation features, so they are compared against
# the validation labels only.
prediction = model.predict(features_valid)
score_valid = accuracy_score(target_valid, prediction)
print(f"Accuracy of Logistic Regression Model on Validation Set: {score_valid}")

Accuracy of Logistic Regression Model on Validation Set: 0.7402799377916018
