In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Baseline: predict the (label-encoded) vehicle 'Make' from engine displacement,
# model year and one-hot encoded categorical attributes using an unconstrained
# decision tree, then report (train accuracy, test accuracy).

# --- Load -------------------------------------------------------------------
file_path = 'Vehicle MPG - 1984 to 2023.csv'
data = pd.read_csv(file_path)

# --- Clean and encode -------------------------------------------------------
# Rows missing any of these columns are discarded before encoding.
# NOTE(review): 'Make' is not part of this list, so rows with a missing make
# are not dropped here -- confirm the column has no missing values.
required_columns = ['Engine Displacement', 'Drive', 'Transmission', 'Vehicle Class', 
                    'Fuel Type 1', 'Model Year']
categorical_features = ['Fuel Type 1', 'Drive', 'Transmission', 'Vehicle Class']

# One-hot encode the categorical columns; drop_first=True omits the first level
# of each so every category is represented by k-1 indicator columns.
data_clean = pd.get_dummies(
    data.dropna(subset=required_columns),
    columns=categorical_features,
    drop_first=True,
)

# Replace the 'Make' strings with integer class labels (the prediction target).
le = LabelEncoder()
data_clean['Make'] = le.fit_transform(data_clean['Make'])

# --- Assemble features and target -------------------------------------------
# Two numeric columns followed by the indicator columns of each categorical
# feature, in the order the features are listed above. filter(like=...) keeps
# every column whose name contains '<feature>_'.
features = pd.concat(
    [data_clean[['Engine Displacement', 'Model Year']]]
    + [data_clean.filter(like=f'{name}_') for name in categorical_features],
    axis=1,
)
target = data_clean['Make']

# --- Split: 90% train / 10% test, fixed seed for repeatability --------------
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=42,
)

# --- Fit the unconstrained tree and measure accuracy on both splits ---------
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

(train_score, test_score)


(0.7427336900545131, 0.5504068716094033)

In [8]:
# Constrained ("pruned") decision tree: cap the depth of the tree and require a
# minimum number of samples in every leaf, to reduce overfitting relative to an
# unconstrained tree.
# Relies on X_train, X_test, y_train and y_test produced by the train/test
# split cell of this notebook.

pruning_params = {'max_depth': 10, 'min_samples_leaf': 50}
pruned_model = DecisionTreeClassifier(random_state=42, **pruning_params).fit(X_train, y_train)

# Accuracy on the training split first, then on the held-out test split.
pruned_train_score, pruned_test_score = (
    pruned_model.score(X_part, y_part)
    for X_part, y_part in ((X_train, y_train), (X_test, y_test))
)

(pruned_train_score, pruned_test_score)

(0.3745680049094022, 0.35644310474755087)