In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Load the raw table from the CSV file located next to the notebook.
csv_path = "mushrooms.csv"
df = pd.read_csv(csv_path)

In [5]:
# Convert every column to pandas' categorical dtype before encoding.
df = df.astype(dtype="category")

In [6]:
# Replace every column with integer codes.
# A separate LabelEncoder is fitted for each column and stored in `encoders`,
# so the codes of any column can later be mapped back to their original labels
# via encoders[column].inverse_transform(...). Re-fitting one shared encoder
# would keep only the mapping of the last column processed.
encoders = {}
for column in df.columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le

In [7]:
# Columns excluded from the working frame. The "class" column is deliberately
# kept here; it is separated out as the target in the following cell.
unused_columns = [
    "veil-type",
    "bruises",
    "gill-spacing",
    "gill-size",
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "veil-color",
]
X = df.drop(columns=unused_columns)

In [8]:
# Separate the target column from the feature matrix.
target_column = "class"
Y = X.loc[:, target_column]
X = X.drop(columns=[target_column])


In [9]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=42,
)


In [17]:
# Baseline model: a regression tree with default (unconstrained) settings.
# NOTE(review): the target is the label-encoded "class" column, so a regressor
# is being fit to what looks like a categorical label -- confirm that a
# classifier was not intended.
drt = DecisionTreeRegressor(random_state=42)
drt.fit(X=X_train, y=y_train)

In [19]:
# Report the baseline tree's score on the training split, then on the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(drt.score(features, target))

0.9969679698454832
0.9885123748808681


In [23]:
# Hyper-parameter grid explored by the grid search:
#   min_impurity_decrease - five thresholds from 0.0001 to 0.0005
#   max_depth             - every depth from 5 up to 49
#   min_samples_split     - 2, 12, 22, ..., 92
params = dict(
    min_impurity_decrease=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
    max_depth=range(5, 50),
    min_samples_split=range(2, 100, 10),
)

In [12]:
# Exhaustive 5-fold cross-validated search over every combination in `params`,
# ranking candidates by R2.
# n_jobs=-1 spreads the independent candidate fits over all available CPU
# cores; the tree seed is fixed, so the selected parameters are the same as
# with a single worker -- only the wall-clock time changes.
gs = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    params,
    n_jobs=-1,
    cv=5,
    scoring='r2',
)
gs.fit(X_train, y_train)

In [24]:
# Show the best mean cross-validated R2 and the parameter combination that achieved it.
best_cv_r2 = gs.best_score_
best_cv_params = gs.best_params_
print("Best R2 Score from GridSearchCV: ", best_cv_r2)
print("Best Parameters: ", best_cv_params)

Best R2 Score from GridSearchCV:  0.9772508960286842
Best Parameters:  {'max_depth': 13, 'min_impurity_decrease': 0.0001, 'min_samples_split': 2}


In [25]:
# Score the refitted best estimator on the training split, then on the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(gs.score(features, target))

0.9847212193694225
0.9720927330269121


In [13]:
# Build a standalone tree from the winning grid-search settings and train it
# on the full training split.
best_params = gs.best_params_
tuned_settings = {
    name: best_params[name]
    for name in ('min_impurity_decrease', 'max_depth', 'min_samples_split')
}
DTR = DecisionTreeRegressor(random_state=42, **tuned_settings)
DTR.fit(X_train, y_train)

In [14]:
# Score the tuned tree on both splits (train first, then test).
train_score, test_score = (
    DTR.score(X_train, y_train),
    DTR.score(X_test, y_test),
)

In [15]:
# Print the tuned tree's scores, one labelled line per split.
labelled_scores = (
    ("Train R2 Score (Optimized): ", train_score),
    ("Test R2 Score (Optimized): ", test_score),
)
for label, value in labelled_scores:
    print(label, value)

Train R2 Score (Optimized):  0.9847212193694225
Test R2 Score (Optimized):  0.9720927330269121


In [16]:
# Predict on the held-out split and measure the mean squared error of those predictions.
y_pred = DTR.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error (Test, Optimized): ", mse)


Mean Squared Error (Test, Optimized):  0.006969395061574696


In [17]:
# R2 of the tuned tree's held-out predictions, computed directly from y_pred.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R2 Score (Test, Optimized): ", r2)

R2 Score (Test, Optimized):  0.9720927330269121
