In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Location of the survey extract. The default is the original author's local
# path; set the YOUTH_DATA_CSV environment variable to point at the file on
# another machine instead of editing the notebook.
import os

DATA_PATH = os.environ.get(
    'YOUTH_DATA_CSV',
    '/Users/lakshitgupta/Library/CloudStorage/OneDrive-SeattleUniversity/Quater3/Machine Learning-2/Written Homeworks/youth_data.csv',
)

# Raw frame; every later cell derives its columns from `dfo`.
dfo = pd.read_csv(DATA_PATH)
print(dfo.head())

In [None]:
# --- Column groups taken from the raw frame ----------------------------------
substance_cols = (
    ['iralcfy', 'irmjfy', 'ircigfm', 'IRSMKLSS30N', 'iralcfm', 'irmjfm']
    + ['ircigage', 'irsmklsstry', 'iralcage', 'irmjage']
    + ['mrjflag', 'alcflag', 'tobflag']
    + ['alcydays', 'mrjydays', 'alcmdays', 'mrjmdays', 'cigmdays', 'smklsmdays']
)

demographic_cols = (
    ['irsex', 'NEWRACE2', 'HEALTH2', 'eduschlgo', 'EDUSCHGRD2']
    + ['eduskpcom', 'imother', 'ifather', 'income', 'govtprog']
    + ['POVERTY3', 'PDEN10', 'COUTYP4']
)

# Three views of the raw data: the explicit substance and demographic lists,
# plus the label-based slice of every column from 'schfelt' through 'rlgfrnd'.
df_substance = dfo[substance_cols]
df_youth = dfo.loc[:, 'schfelt':'rlgfrnd']
df_demog = dfo[demographic_cols]

# Single working frame, columns ordered substance -> youth -> demographic.
df = pd.concat([df_substance, df_youth, df_demog], axis=1)

# --- Factor metadata ----------------------------------------------------------
# Unordered factors: every youth-experience column, the three *flag columns,
# and the nominal demographics.
unordered_factor_cols = [
    *df_youth.columns,
    'mrjflag', 'alcflag', 'tobflag',
    'irsex', 'NEWRACE2', 'eduschlgo', 'imother', 'ifather', 'govtprog', 'PDEN10', 'COUTYP4',
]
# Ordered factors keep the ordering of their codes.
ordered_factor_cols = ['EDUSCHGRD2', 'HEALTH2', 'POVERTY3', 'income']

df[unordered_factor_cols] = df[unordered_factor_cols].astype('category')
for ordered_col in ordered_factor_cols:
    df[ordered_col] = pd.Categorical(df[ordered_col], ordered=True)

# --- Human-readable labels ----------------------------------------------------
# Columns listed here are RENAMED to their label; all other columns keep
# their original name.
variable_labels = {
    'iralcfy': 'Alcohol frequency past year',
    'irmjfy': 'Marijuana frequency past year',
    # and so on...
}
df.columns = pd.Index([variable_labels.get(name, name) for name in df.columns])

# Kept for later cells that select the youth-experience predictors from `df`.
youth_experience_cols = df_youth.columns
df.info()

In [None]:
# Count NaN cells per column, then add the per-column counts together.
per_column_missing = df.isna().sum()
missing_values = per_column_missing.sum()
print(f"Number of missing values: {missing_values}")

In [None]:
# Display the youth-experience column index (the columns of the df_youth slice).
youth_experience_cols

In [None]:
# Display the substance-use column names selected from the raw frame.
df_substance.columns

In [None]:
# Display the demographic column names selected from the raw frame.
df_demog.columns

In [None]:
# Keep only complete rows, then re-count NaNs to confirm none remain.
df = df.dropna()
remaining_by_column = df.isna().sum()
missing_values = remaining_by_column.sum()
print(f"Number of missing values after dropping: {missing_values}")

In [None]:
# Codes to exclude, per column. Rows carrying any of these values are removed,
# one column at a time, in the order listed.
excluded_codes = {
    'EDUSCHGRD2': [98, 99],
    'eduskpcom': [94, 97, 98, 99],
    'imother': [3, 4],
    'ifather': [3, 4],
    'PDEN10': [3],
}

for column_name, bad_values in excluded_codes.items():
    keep_row = ~df[column_name].isin(bad_values)
    df = df[keep_row]

print(df.shape)

## Binary Classification
## Alcohol - 'alcflag' 
## Alcohol ever used (0=never, 1=ever)

In [None]:
# Target column for the binary task, kept as a one-column frame.
ALCFLAG = df[['alcflag']]

# Predictor groups.
df_youthExp = df[youth_experience_cols]
df_demographic = df[demographic_cols]

# Modelling frame: youth-experience, then demographics, then the target.
df_New = pd.concat([df_youthExp, df_demographic, ALCFLAG], axis='columns')

print(df_New)

In [None]:
# Separate the target from the predictors.
y = df_New['alcflag']
X = df_New.drop(columns='alcflag')

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

for split_label, split_X in (("Training", X_train), ("Test", X_test)):
    print(f"{split_label} set dimensions:", split_X.shape)

## Decision Tree

In [None]:
# Classification tree with default growth settings, fitted on the training
# split; random_state is fixed so the fit is repeatable.
tree_model = DecisionTreeClassifier(random_state=1)
tree_model.fit(X_train, y_train)

In [None]:
# Very large canvas so the full (unpruned) tree stays legible.
plt.figure(figsize=(70, 100))

# Draw the fitted tree with class labels 'No'/'Yes'.
plot_tree(
    tree_model,
    feature_names=list(X_train.columns),
    class_names=['No', 'Yes'],
    filled=True,
    label='all',
    fontsize=24,
)

plt.show()

In [None]:
# Importance of each predictor in the fitted (unpruned) tree.
feature_importance = tree_model.feature_importances_

# Pair each importance with its column name.
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

# Sort by importance, largest first.
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Top 10 features
top_10_features = feature_importance_df.head(10)

# Display the top 10 features (the heading previously said 15 while only
# 10 rows were shown).
print("Top 10 Feature Importance:")
print(top_10_features)


In [None]:
# Score the unpruned tree on the held-out test split.
tree_pred = tree_model.predict(X_test)

Decaccuracy = accuracy_score(y_test, tree_pred)
cm = confusion_matrix(y_test, tree_pred)

print("Confusion Matrix:", cm, sep="\n")
print("Accuracy:", Decaccuracy)

## Pruning

In [None]:
# Candidate tree sizes: 2 through 19 leaves.
param_grid = {'max_leaf_nodes': range(2,20)}

# 5-fold cross-validated search over tree size, scored by accuracy.
grid_search = GridSearchCV(tree_model, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Winning tree from the search.
best_tree_model = grid_search.best_estimator_

# Evaluate the pruned tree on the test split.
pruned_tree_pred = best_tree_model.predict(X_test)
pruned_accuracy = accuracy_score(y_test, pruned_tree_pred)
pruned_cm = confusion_matrix(y_test, pruned_tree_pred)

print("max_leaf_nodes:", best_tree_model.max_leaf_nodes)
print("Confusion Matrix after Pruning:", pruned_cm, sep="\n")
print("Accuracy after Pruning:", pruned_accuracy)

In [None]:
feature_names_list = list(X_train.columns)

plt.figure(figsize=(20, 10))

# Draw the pruned tree selected by the grid search.
plot_tree(
    best_tree_model,
    feature_names=feature_names_list,
    class_names=["No", "Yes"],
    filled=True,
    fontsize=10,
)

plt.show()

In [None]:
# Cross-validated accuracy for each candidate tree size (2..19 leaves).
max_leaf_nodes_range = range(2, 20)
accuracies = []
num_leaf_nodes = []

for leaf_nodes in max_leaf_nodes_range:
    # Use a dedicated name for the candidate: rebinding `tree_model` here would
    # replace the fitted unpruned tree with an unfitted one, breaking any
    # re-run of the cells that plot or score `tree_model`.
    candidate_tree = DecisionTreeClassifier(max_leaf_nodes=leaf_nodes, random_state=1)

    # Mean 5-fold accuracy on the training split.
    accuracy = np.mean(cross_val_score(candidate_tree, X_train, y_train, cv=5, scoring='accuracy'))

    accuracies.append(accuracy)
    num_leaf_nodes.append(leaf_nodes)

# Best CV accuracy and the (first) tree size that achieves it.
Prunmax_accuracy = max(accuracies)
corresponding_num_leaf_nodes = num_leaf_nodes[accuracies.index(Prunmax_accuracy)]

print('Maximum Accuracy:', Prunmax_accuracy)
print('Corresponding Number of Leaf Nodes:', corresponding_num_leaf_nodes)

plt.figure(figsize=(10, 6))
plt.plot(num_leaf_nodes, accuracies, marker='o', linestyle='-')
plt.xlabel('Number of Leaf Nodes')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. Number of Leaf Nodes')
plt.grid(True)
plt.show()

In [None]:
# Predictor importances of the pruned tree, largest first.
feature_importances = best_tree_model.feature_importances_

feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Ten most important predictors.
top_features = feature_importance_df.head(10)
print("Top Features:")
print(top_features)

## Bagging

In [None]:
# 100-tree forest with max_features=60.
# NOTE(review): 60 is presumably the full predictor count (which is what makes
# this "bagging") - confirm against X_train.shape[1].
bag_model = RandomForestClassifier(n_estimators=100, max_features=60, random_state=1)
bag_model.fit(X_train, y_train)

# Test-split accuracy.
yhat_bag = bag_model.predict(X_test)
Bagaccuracy = accuracy_score(y_test, yhat_bag)
print('\nAccuracy:', Bagaccuracy)

# Rank predictors by importance and keep the ten largest.
feature_importances_bag = bag_model.feature_importances_
feature_importance_bag_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances_bag})
    .sort_values(by='Importance', ascending=False)
)
top_features_bag = feature_importance_bag_df.head(10)

# Horizontal bar chart, most important predictor on top.
plt.figure(figsize=(10, 6))
plt.barh(top_features_bag['Feature'], top_features_bag['Importance'], color='blue')
plt.gca().invert_yaxis()
plt.title('Variable Importance for Consumption of Alcohol')
plt.xlabel('Feature Importance')
plt.ylabel('Variables')
plt.show()

## Random Forest

In [None]:
np.random.seed(123)

# 500-tree forest considering 30 candidate predictors per split.
random_model = RandomForestClassifier(random_state=1, n_estimators=500, max_features=30)
random_model.fit(X_train, y_train)

# Predict the test split and report accuracy.
yhat_rf = random_model.predict(X_test)
Ranaccuracy = accuracy_score(y_test, yhat_rf)
print('\nAccuracy:', Ranaccuracy)

In [None]:
# Predictor importances of the random forest, largest first.
feature_importance = random_model.feature_importances_

importance_table = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

top_10_features = feature_importance_df.head(10)

print("Top 10 Feature Importance:")
print(top_10_features)

In [None]:
# Candidate values for max_features ("mtry").
mtry_values = [1, 25, 45, 55, 60]
accuracy_values = []

for mtry in mtry_values:
    # One 500-tree forest per candidate, scored on the test split.
    rf_model = RandomForestClassifier(n_estimators=500, max_features=mtry, random_state=1)
    rf_model.fit(X_train, y_train)
    accuracy_values.append(accuracy_score(y_test, rf_model.predict(X_test)))

print('Accuracy Values:', accuracy_values)

# First candidate reaching the highest accuracy.
# NOTE(review): the winner is chosen by test-split accuracy, so the reported
# figure is an optimistic estimate.
max_accuracy_index = int(np.argmax(accuracy_values))
corresponding_mtry = mtry_values[max_accuracy_index]
Ranhighest_accuracy = accuracy_values[max_accuracy_index]

print('Highest Accuracy:', Ranhighest_accuracy)
print('Corresponding mtry:', corresponding_mtry)

plt.figure(figsize=(10, 6))
plt.plot(mtry_values, accuracy_values, marker='o', linestyle='-')
plt.title('Accuracy vs. mtry')
plt.xlabel('mtry')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()

## Boosting

In [None]:
# Learning rates to try: start 0.1, step 0.02, stop below 0.21.
vals = np.arange(0.1, 0.21, 0.02)

# One 1000-stage boosted model per learning rate; every fitted model is kept.
gbm_models = []
accuracy_values = []

for lam1 in vals:
    boosting_model = GradientBoostingClassifier(
        n_estimators=1000, learning_rate=lam1, random_state=1
    )
    boosting_model.fit(X_train, y_train)
    gbm_models.append(boosting_model)
    accuracy_values.append(accuracy_score(y_test, boosting_model.predict(X_test)))

# Test-split accuracy as a function of the learning rate.
plt.plot(vals, accuracy_values, marker='o', linestyle='-')
plt.title('Accuracy vs. learning_rate')
plt.xlabel('learning_rate')
plt.ylabel('Accuracy')
plt.show()

# Best accuracy and the learning rate that produced it.
max_accuracy_index = np.argmax(accuracy_values)
corresponding_learning_rate = vals[max_accuracy_index]
Boosthighest_accuracy = accuracy_values[max_accuracy_index]

print("Highest Accuracy:", Boosthighest_accuracy)
print("Corresponding Learning Rate:", corresponding_learning_rate)

In [None]:
# One importance vector per fitted boosting model.
feature_importances = [model.feature_importances_ for model in gbm_models]

# Element-wise mean across the models.
average_importances = np.mean(feature_importances, axis=0)

importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': average_importances})
    .sort_values(by='Importance', ascending=False)
)

top_10_features = importance_df.head(10)

print(top_10_features)

In [None]:
# Summary of test-split accuracy per model for the alcohol task.
# The pruned-tree row uses `pruned_accuracy` (test accuracy of the tree chosen
# by the grid search) so that every row is measured on the test split;
# `Prunmax_accuracy` is a cross-validated accuracy on the training split.
accuracy_data = {
    "Model": ["Decision Tree", "Pruned Decision Tree", "Bagging", "Random Forest", "Boosting"],
    "Accuracy": [Decaccuracy, pruned_accuracy, Bagaccuracy, Ranhighest_accuracy, Boosthighest_accuracy]
}

# Create a DataFrame from the accuracy data
accuracy_df = pd.DataFrame(accuracy_data)

# Hide the row index. `Styler.hide_index()` was removed in pandas 2.0 in
# favour of `Styler.hide(axis="index")`; fall back to the old method only on
# pandas versions that predate `hide`.
accuracy_styler = accuracy_df.style
if hasattr(accuracy_styler, "hide"):
    accuracy_styler = accuracy_styler.hide(axis="index")
else:
    accuracy_styler = accuracy_styler.hide_index()
styled_accuracy_df = accuracy_styler.set_caption("Model Accuracy")

# Show accuracies as percentages with two decimals.
styled_accuracy_df = styled_accuracy_df.format({"Accuracy": "{:.2%}"})

# Display the styled accuracy DataFrame
styled_accuracy_df

## Multi Class Classification
## Marijuana - 'mrjmdays'
## Number of days of marijuana use in past month (1-4 categories, 5=none)

In [None]:
# Target column for the multi-class task, kept as a one-column frame.
MRJMDAYS = df[['mrjmdays']]

# Same predictor groups as the binary task.
df_youthExp1 = df[youth_experience_cols]
df_demographic1 = df[demographic_cols]

df_Multi = pd.concat([df_youthExp1, df_demographic1, MRJMDAYS], axis='columns')

In [None]:
# Separate the target from the predictors.
y = df_Multi['mrjmdays']
X = df_Multi.drop(columns='mrjmdays')

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

## Decision Tree

In [None]:
# Classification tree with default growth settings for the 'mrjmdays' target,
# fitted on the training split; random_state is fixed so the fit is repeatable.
tree_modelNew = DecisionTreeClassifier(random_state=1)
tree_modelNew.fit(X_train, y_train)

In [None]:
# Very large canvas so the full (unpruned) tree stays legible.
plt.figure(figsize=(70, 100))

plot_tree(
    tree_modelNew,
    feature_names=list(X_train.columns),
    filled=True,
    label='all',
    fontsize=24,
)

plt.show()

In [None]:
# Predictor importances of the unpruned multi-class tree, largest first.
feature_importance = tree_modelNew.feature_importances_

feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

top_10_features = feature_importance_df.head(10)

print("Top 10 Feature Importance:")
print(top_10_features)


In [None]:
# Score the unpruned multi-class tree on the held-out test split.
tree_predNew = tree_modelNew.predict(X_test)

Decaccuracy1 = accuracy_score(y_test, tree_predNew)
cm = confusion_matrix(y_test, tree_predNew)

print("Confusion Matrix:", cm, sep="\n")
print("Accuracy:", Decaccuracy1)

## Pruning

In [None]:
# Candidate tree sizes: 2 through 19 leaves.
param_grid = {'max_leaf_nodes': range(2,20)}

# 5-fold cross-validated search over tree size. The estimator is a classifier
# and the result is reported as accuracy, so the search is scored by accuracy
# (it was previously scored by negative MSE, a regression metric that treats
# the class codes as numeric distances). This also matches the binary-task
# search and the cross_val_score sweep in this section.
grid_search = GridSearchCV(estimator=tree_modelNew, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

# Winning tree from the search.
best_tree_modelNew = grid_search.best_estimator_

# Evaluate the pruned tree on the test split.
pruned_tree_predNew = best_tree_modelNew.predict(X_test)
pruned_cmNew = confusion_matrix(y_test, pruned_tree_predNew)
pruned_accuracy_new = accuracy_score(y_test, pruned_tree_predNew)

print("max_leaf_nodes:", best_tree_modelNew.max_leaf_nodes)
print("Confusion Matrix after Pruning:")
print(pruned_cmNew)
print("Accuracy after Pruning:", pruned_accuracy_new)


In [None]:
feature_names_listNew = list(X_train.columns)

plt.figure(figsize=(20, 10))

# Draw the pruned multi-class tree selected by the grid search.
plot_tree(
    best_tree_modelNew,
    feature_names=feature_names_listNew,
    filled=True,
    fontsize=10,
)

plt.show()

In [None]:
# Cross-validated accuracy for each candidate tree size (2..19 leaves).
max_leaf_nodes_range = range(2, 20)

accuracies = []
num_leaf_nodes = []

for leaf_nodes in max_leaf_nodes_range:
    # Use a dedicated name for the candidate: rebinding `tree_modelNew` here
    # would replace the fitted unpruned tree with an unfitted one, breaking any
    # re-run of the cells that plot or score `tree_modelNew`.
    candidate_tree = DecisionTreeClassifier(max_leaf_nodes=leaf_nodes, random_state=1)

    # Mean 5-fold accuracy on the training split.
    accuracy = np.mean(cross_val_score(candidate_tree, X_train, y_train, cv=5, scoring='accuracy'))

    accuracies.append(accuracy)
    num_leaf_nodes.append(leaf_nodes)

# Best CV accuracy and the (first) tree size that achieves it.
Prunmax_accuracy1 = max(accuracies)
corresponding_num_leaf_nodes = num_leaf_nodes[accuracies.index(Prunmax_accuracy1)]

print('Maximum Accuracy:', Prunmax_accuracy1)
print('Corresponding Number of Leaf Nodes:', corresponding_num_leaf_nodes)

plt.figure(figsize=(10, 6))
plt.plot(num_leaf_nodes, accuracies, marker='o', linestyle='-')
plt.xlabel('Number of Leaf Nodes')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. Number of Leaf Nodes')
plt.grid(True)
plt.show()


In [None]:
# Predictor importances of the pruned multi-class tree, largest first.
feature_importances = best_tree_modelNew.feature_importances_

importance_table = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

# Five most important predictors.
top_features = feature_importance_df.head(5)

print("Top Features:")
print(top_features)

## Bagging

In [None]:
# 100-tree forest with max_features=60.
# NOTE(review): 60 is presumably the full predictor count (which is what makes
# this "bagging") - confirm against X_train.shape[1].
bag_model = RandomForestClassifier(random_state=1, n_estimators=100, max_features=60)
bag_model.fit(X_train, y_train)

# Test-split accuracy.
yhat_bag = bag_model.predict(X_test)
Bagaccuracy1 = accuracy_score(y_test, yhat_bag)
print('\nAccuracy:', Bagaccuracy1)

# Rank predictors by importance and keep the ten largest.
feature_importances_bag = bag_model.feature_importances_
feature_importance_bag_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances_bag})
    .sort_values(by='Importance', ascending=False)
)
top_features_bag = feature_importance_bag_df.head(10)

# Horizontal bar chart, most important predictor on top.
plt.figure(figsize=(10, 6))
plt.barh(top_features_bag['Feature'], top_features_bag['Importance'], color='blue')
plt.gca().invert_yaxis()
plt.title('Variable Importance for Consumption of marijuana in past month')
plt.xlabel('Feature Importance')
plt.ylabel('Variables')
plt.show()

## Random Forest

In [None]:
np.random.seed(123)

# 500-tree forest considering 30 candidate predictors per split.
random_modelNew = RandomForestClassifier(random_state=1, n_estimators=500, max_features=30)
random_modelNew.fit(X_train, y_train)

# Predict the test split and report accuracy.
yhat_rf = random_modelNew.predict(X_test)
Ranaccuracy1 = accuracy_score(y_test, yhat_rf)
print('\nAccuracy:', Ranaccuracy1)

In [None]:
# Predictor importances of the multi-class random forest, largest first.
feature_importance = random_modelNew.feature_importances_

feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

# Ten most important predictors.
top_10_features = feature_importance_df.head(10)

print("Top 10 Feature Importance:")
print(top_10_features)

In [None]:
# Candidate values for max_features ("mtry").
mtry_values = [1, 25, 45, 55, 60]
accuracy_values = []

for mtry in mtry_values:
    # One 500-tree forest per candidate, scored on the test split.
    rf_model = RandomForestClassifier(n_estimators=500, max_features=mtry, random_state=1)
    rf_model.fit(X_train, y_train)
    accuracy_values.append(accuracy_score(y_test, rf_model.predict(X_test)))

# First candidate reaching the highest accuracy.
# NOTE(review): the winner is chosen by test-split accuracy, so the reported
# figure is an optimistic estimate.
best_position = int(np.argmax(accuracy_values))
Ranmax_accuracy1 = accuracy_values[best_position]
corresponding_mtry = mtry_values[best_position]

print('Highest Accuracy:', Ranmax_accuracy1)
print('Corresponding mtry:', corresponding_mtry)

# Accuracy as a function of mtry.
plt.figure(figsize=(10, 6))
plt.plot(mtry_values, accuracy_values, marker='o', linestyle='-')
plt.title('Accuracy vs. mtry')
plt.xlabel('mtry')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()

## Boosting

In [None]:
# Learning rates to try: start 0.1, step 0.01, stop below 0.21.
vals = np.arange(0.1, 0.21, 0.01)

# One 1000-stage boosted model per learning rate; every fitted model is kept.
gbm_models = []
accuracy_values = []

for lam1 in vals:
    boosting_model = GradientBoostingClassifier(
        n_estimators=1000, learning_rate=lam1, random_state=1
    )
    boosting_model.fit(X_train, y_train)
    gbm_models.append(boosting_model)
    accuracy_values.append(accuracy_score(y_test, boosting_model.predict(X_test)))

# Test-split accuracy as a function of the learning rate.
plt.plot(vals, accuracy_values, marker='o', linestyle='-')
plt.title('Accuracy vs. learning_rate')
plt.xlabel('learning_rate')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()

# Best accuracy and the (first) learning rate that produced it.
best_position = int(np.argmax(accuracy_values))
Boostmax_accuracy1 = accuracy_values[best_position]
corresponding_learning_rate = vals[best_position]

print('Highest Accuracy:', Boostmax_accuracy1)
print('Corresponding Learning Rate:', corresponding_learning_rate)

In [None]:
# The model for the winning learning rate was already fitted during the sweep
# (same n_estimators, same training data, same random_state) and is stored in
# `gbm_models`, so reuse it instead of training another 1000-stage model.
best_boosting_model = gbm_models[accuracy_values.index(Boostmax_accuracy1)]

# Access feature importances
feature_importance = best_boosting_model.feature_importances_

# Create a DataFrame to store feature importances
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Select the top 10 features
top_10_features = feature_importance_df.head(10)

# Display the top 10 features
print("Top 10 Feature Importance:")
print(top_10_features)

In [None]:
# Summary of test-split accuracy per model for the marijuana task.
# - Bagging uses `Bagaccuracy1`; `Bagaccuracy` belongs to the alcohol task.
# - The pruned-tree row uses `pruned_accuracy_new` (test accuracy of the tree
#   chosen by the grid search) so that every row is measured on the test
#   split; `Prunmax_accuracy1` is a cross-validated accuracy on the training
#   split.
accuracy_data = {
    "Model": ["Decision Tree", "Pruned Decision Tree", "Bagging", "Random Forest", "Boosting"],
    "Accuracy": [Decaccuracy1, pruned_accuracy_new, Bagaccuracy1, Ranmax_accuracy1, Boostmax_accuracy1]
}

# Create a DataFrame from the accuracy data
accuracy_df = pd.DataFrame(accuracy_data)

# Hide the row index. `Styler.hide_index()` was removed in pandas 2.0 in
# favour of `Styler.hide(axis="index")`; fall back to the old method only on
# pandas versions that predate `hide`.
accuracy_styler = accuracy_df.style
if hasattr(accuracy_styler, "hide"):
    accuracy_styler = accuracy_styler.hide(axis="index")
else:
    accuracy_styler = accuracy_styler.hide_index()
styled_accuracy_df = accuracy_styler.set_caption("Model Accuracy")

# Show accuracies as percentages with two decimals.
styled_accuracy_df = styled_accuracy_df.format({"Accuracy": "{:.2%}"})

# Display the styled accuracy DataFrame
styled_accuracy_df

## Regression
## Tobacco - 'irsmklsstry'
## Smokeless tobacco age of first use (1-70), 991=never used

In [None]:
# Target column for the regression task, kept as a one-column frame.
IRSMKLSSTRY = df[['irsmklsstry']]

# Same predictor groups as the classification tasks.
df_youthExp = df[youth_experience_cols]
df_demographic = df[demographic_cols]

combined = pd.concat([df_youthExp, df_demographic, IRSMKLSSTRY], axis='columns')

# Drop rows whose target is the 991 code.
df_Reg = combined[combined['irsmklsstry'] != 991]

print(df_Reg)

In [None]:
# Separate the target from the predictors.
y = df_Reg['irsmklsstry']
X = df_Reg.drop(columns='irsmklsstry')

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

for split_label, split_X in (("Training", X_train), ("Test", X_test)):
    print(f"{split_label} set dimensions:", split_X.shape)

## Decision Tree

In [None]:
# Regression tree with default growth settings for the 'irsmklsstry' target;
# random_state is fixed so the fit is repeatable.
Reg_model = DecisionTreeRegressor(random_state=1)

# Fit the model to the training data
Reg_model.fit(X_train, y_train)

In [None]:
# Very large canvas so the full (unpruned) regression tree stays legible.
plt.figure(figsize=(70, 100))

plot_tree(
    Reg_model,
    feature_names=list(X_train.columns),
    filled=True,
    fontsize=24,
)

plt.show()

In [None]:
# Predictor importances of the unpruned regression tree, largest first.
feature_importance = Reg_model.feature_importances_

importance_table = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

# Ten most important predictors.
top_10_features = feature_importance_df.head(10)

print("Top 10 Feature Importance:")
print(top_10_features)

In [None]:
# Predict the test split with the unpruned regression tree.
Reg_pred_new = Reg_model.predict(X_test)

# Mean of the squared residuals on the test split.
squared_residuals = (y_test - Reg_pred_new) ** 2
MSE = squared_residuals.mean()
print("Test Error Rate:", MSE)




## Pruning

In [None]:
# Candidate tree sizes: 2 through 19 leaves.
param_grid = {'max_leaf_nodes': range(2, 20)}

# 5-fold cross-validated search over tree size, scored by negative MSE.
grid_search = GridSearchCV(Reg_model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Winning tree from the search.
best_tree_model = grid_search.best_estimator_

print("Best max_leaf_nodes value:", best_tree_model.max_leaf_nodes)

In [None]:
# Cross-validated MSE for each candidate tree size (2..19 leaves).
max_leaf_nodes_range = range(2, 20)

# NOTE: despite its name, `num_trees` holds max_leaf_nodes values (the size of
# a single tree), not a number of trees. The variable names are kept as they
# were; only the printed text and plot labels are corrected.
num_trees = []
mse_values = []

for max_leaf_nodes in max_leaf_nodes_range:

    regressor = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    # The scorer returns negative MSE; flip the sign to get MSE per fold.
    mse_scores = -cross_val_score(regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

    avg_mse = mse_scores.mean()

    num_trees.append(max_leaf_nodes)
    mse_values.append(avg_mse)

# Lowest CV MSE and the (first) tree size that achieves it.
prunmin_mse = min(mse_values)
corresponding_num_trees = num_trees[mse_values.index(prunmin_mse)]

print('Minimum MSE:', prunmin_mse)
print('Corresponding Number of Leaf Nodes (max_leaf_nodes):', corresponding_num_trees)

# MSE vs number of leaf nodes
plt.figure(figsize=(10, 6))
plt.plot(num_trees, mse_values, marker='o')
plt.title('MSE vs Number of Leaf Nodes')
plt.xlabel('Number of Leaf Nodes (max_leaf_nodes)')
plt.ylabel('Mean Squared Error (MSE)')
plt.grid(True)
plt.show()

In [None]:
feature_names_listNew = list(X_train.columns)

plt.figure(figsize=(20, 10))

# Draw the pruned regression tree selected by the grid search.
plot_tree(
    best_tree_model,
    feature_names=feature_names_listNew,
    filled=True,
    fontsize=10,
)

plt.show()

## Bagging

In [None]:
# 100-tree regression forest with max_features=60.
# NOTE(review): 60 is presumably the full predictor count (which is what makes
# this "bagging") - confirm against X_train.shape[1].
bag_modelReg = RandomForestRegressor(random_state=1, n_estimators=100, max_features=60)
bag_modelReg.fit(X_train, y_train)

# Test-split MSE.
yhat_bagReg = bag_modelReg.predict(X_test)
Bagtest_mse = mean_squared_error(y_test, yhat_bagReg)
print('\nTest MSE:', Bagtest_mse)

# Rank predictors by importance and keep the ten largest.
feature_importances_bagReg = bag_modelReg.feature_importances_
feature_importance_bagReg_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances_bagReg})
    .sort_values(by='Importance', ascending=False)
)
top_features_bagReg = feature_importance_bagReg_df.head(10)

# Horizontal bar chart, most important predictor on top.
plt.figure(figsize=(10, 6))
plt.barh(top_features_bagReg['Feature'], top_features_bagReg['Importance'], color='blue')
plt.gca().invert_yaxis()
plt.title('Variable Importance for Regression')
plt.xlabel('Feature Importance')
plt.ylabel('Variables')
plt.show()


## Random Forest

In [None]:
# 500-tree regression forest considering 30 candidate predictors per split.
random_model = RandomForestRegressor(random_state=1, n_estimators=500, max_features=30)
random_model.fit(X_train, y_train)

# Predict the test split and report its MSE.
yhat_rf = random_model.predict(X_test)
test_mse = mean_squared_error(y_test, yhat_rf)
print('\nTest MSE:', test_mse)

In [None]:
# Feature importances from the Random Forest model
feature_importance = random_model.feature_importances_

feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Top 10 features
top_10_features = feature_importance_df.head(10)

# Hand-written display names for some columns.
feature_name_mapping = {
    'EDUSCHGRD2': 'Grade',
    'talkprob': 'Talks_about_problems',
    'eduskpcom': 'Skipped_school_days',
    'DRPRVME3': 'Seen_prevention_message',
    'FRDADLY2': 'Friends_view_on_drinking',
    'NEWRACE2': 'Race/Hispanicity',
    'YOATTAK2': 'Attacked_with_intent',
    'rlgattd': 'Religious_service_attendance',
    'HEALTH2': 'Health_status',
    'COUTYP4': 'County_type'
}

# Convert the original feature names to their descriptive equivalents.
# The top-10 set depends on the fitted model, so a feature may be missing from
# the mapping above; fall back to the raw column name instead of raising
# KeyError.
readable_features = [feature_name_mapping.get(f, f) for f in top_10_features['Feature']]

plt.figure(figsize=(10, 6))
plt.barh(readable_features, top_10_features['Importance'], color='blue')
plt.xlabel('Feature Importance')
plt.ylabel('Variables')
plt.title('Variable Importance for Regression')
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Candidate values for max_features ("mtry").
mtry_values = [1, 25, 45, 55, 60]
error_rates = []

for mtry in mtry_values:
    # One 500-tree regression forest per candidate, scored on the test split.
    rf_model = RandomForestRegressor(n_estimators=500, max_features=mtry, random_state=1)
    rf_model.fit(X_train, y_train)

    yhat_test = rf_model.predict(X_test)
    test_mse = mean_squared_error(y_test, yhat_test)
    error_rates.append(test_mse)

# Test MSE for every candidate, in mtry_values order.
print('Error Rates:', error_rates)

# NOTE(review): this deliberately reports the SECOND smallest test MSE rather
# than the minimum; the reason is not visible in the notebook - confirm that
# this is intended.
sorted_error_rates = sorted(error_rates)
second_min_mse = sorted_error_rates[1]

# mtry value belonging to that second-smallest MSE.
index_second_min_mse = error_rates.index(second_min_mse)
corresponding_mtry_second_min_mse = mtry_values[index_second_min_mse]

print('Second Smallest MSE:', second_min_mse)
print('Corresponding mtry value for the Second Smallest MSE:', corresponding_mtry_second_min_mse)

# Test MSE as a function of mtry.
plt.figure(figsize=(10, 6))
plt.plot(mtry_values, error_rates, marker='o', linestyle='-')
plt.title('Test MSE vs. mtry')
plt.xlabel('mtry')
plt.ylabel('Test MSE')
plt.show()

## Boosting

In [None]:
# Learning rates to try: start 0.1, step 0.01, stop below 0.21.
vals = np.arange(0.1, 0.21, 0.01)

# One 1000-stage boosted regressor per learning rate; every fitted model is kept.
boosting_models = []
test_error_rates = []

for lam1 in vals:
    boosting_model = GradientBoostingRegressor(
        n_estimators=1000, learning_rate=lam1, random_state=1
    )
    boosting_model.fit(X_train, y_train)
    boosting_models.append(boosting_model)

    yhat_test = boosting_model.predict(X_test)
    test_mse = mean_squared_error(y_test, yhat_test)
    test_error_rates.append(test_mse)

# Lowest test MSE and the (first) learning rate that produced it.
best_position = int(np.argmin(test_error_rates))
Boostmin_test_mse = test_error_rates[best_position]
corresponding_learning_rate = vals[best_position]

print('Minimum Test MSE:', Boostmin_test_mse)
print('Corresponding Learning Rate:', corresponding_learning_rate)

# Test MSE as a function of the learning rate.
plt.plot(vals, test_error_rates, marker='o', linestyle='-')
plt.title('Test MSE vs. learning_rate')
plt.xlabel('learning_rate')
plt.ylabel('Test MSE')
plt.show()

In [None]:
# Position of the lowest test MSE in the learning-rate sweep.
best_index = int(np.argmin(test_error_rates))
best_learning_rate = vals[best_index]

# The model for this learning rate was already fitted during the sweep (same
# n_estimators, same training data, same random_state) and is stored in
# `boosting_models`, so reuse it instead of training another 1000-stage model.
best_boosting_model = boosting_models[best_index]

feature_importance = best_boosting_model.feature_importances_

feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Top 10 features
top_10_features = feature_importance_df.head(10)

print("Top 10 Feature Importance:")
print(top_10_features)

In [None]:
# Test-split MSE of the pruned tree chosen by the grid search. The column is
# labelled "Test MSE", whereas `prunmin_mse` is a cross-validated MSE computed
# on the training split, so it is not comparable with the other rows.
pruned_test_mse = mean_squared_error(y_test, best_tree_model.predict(X_test))

# NOTE(review): the Random Forest row reports `second_min_mse`, the second
# smallest MSE of the mtry sweep - confirm that this is intended.
mse_data = {
    "Model": ["Decision Tree", "Pruned Decision Tree", "Bagging", "Random Forest", "Boosting"],
    "Test MSE": [MSE, pruned_test_mse, Bagtest_mse, second_min_mse, Boostmin_test_mse]
}

mse_df = pd.DataFrame(mse_data)

# Hide the row index. `Styler.hide_index()` was removed in pandas 2.0 in
# favour of `Styler.hide(axis="index")`; fall back to the old method only on
# pandas versions that predate `hide`.
mse_styler = mse_df.style
if hasattr(mse_styler, "hide"):
    mse_styler = mse_styler.hide(axis="index")
else:
    mse_styler = mse_styler.hide_index()
styled_mse_df = mse_styler.set_caption("Model MSE")
styled_mse_df
