# Normal Random Forest

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = 'Vehicle MPG - 1984 to 2023.csv'
data = pd.read_csv(file_path)

# Selecting specific columns for features and target
features = ['Engine Displacement', 'Drive', 'Transmission', 'Vehicle Class', 'Fuel Type 1', 'Model Year']
target = 'Make'

# Filter the dataset to include only the selected features and the target
data = data[features + [target]]

# Handling categorical variables with Label Encoding
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column].astype(str))
    label_encoders[column] = le

# Separating features and target
X = data.drop(target, axis=1)
y = data[target]

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Training the model
clf.fit(X_train, y_train)

# Predicting the Test set results
y_pred = clf.predict(X_test)

# Evaluating the results
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# If you want to inverse transform the predicted labels back to original labels
# y_pred_labels = label_encoders[target].inverse_transform(y_pred)


Accuracy: 0.5613289760348584
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           2       0.67      0.71      0.69        68
           3       0.73      0.89      0.80        18
           4       0.12      0.20      0.15         5
           5       0.78      0.93      0.85        30
           6       0.79      0.76      0.78       242
           8       0.00      0.00      0.00         2
          10       1.00      1.00      1.00         1
          11       0.84      0.87      0.85       473
          12       1.00      1.00      1.00         1
          14       0.91      0.86      0.89        36
          15       0.50      1.00      0.67         1
          16       0.00      0.00      0.00         2
          17       0.00      0.00      0.00         1
          18       1.00      1.00      1.00         4
          19       0.29      0.26      0.27       144
          20       0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# OverSampling Random Forest

In [13]:
!{sys.executable} -m pip install imbalanced-learn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler

# Load the dataset
file_path = 'Vehicle MPG - 1984 to 2023.csv'
data = pd.read_csv(file_path)

# Selecting specific columns for features and target
features = ['Engine Displacement', 'Drive', 'Transmission', 'Vehicle Class', 'Fuel Type 1', 'Model Year']
target = 'Make'
data = data[features + [target]]

# Handling categorical variables with Label Encoding
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column].astype(str))
    label_encoders[column] = le

# Imputing missing values
imputer = SimpleImputer(strategy='mean')
data[features] = imputer.fit_transform(data[features])

# Separating features and target
X = data.drop(target, axis=1)
y = data[target]

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Applying oversampling
oversampler = RandomOverSampler(random_state=42)
X_train_os, y_train_os = oversampler.fit_resample(X_train, y_train)

# Creating and training the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_os, y_train_os)

# Predicting the Test set results
y_pred = clf.predict(X_test)

# Evaluating the results
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


zsh:1: parse error near `-m'
Accuracy: 0.55
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           2       0.52      0.69      0.59        68
           3       0.67      0.89      0.76        18
           4       0.33      1.00      0.50         5
           5       0.75      0.90      0.82        30
           6       0.78      0.73      0.76       242
           8       0.00      0.00      0.00         2
          10       1.00      1.00      1.00         1
          11       0.88      0.78      0.83       473
          12       1.00      1.00      1.00         1
          14       0.69      0.86      0.77        36
          15       0.50      1.00      0.67         1
          16       0.00      0.00      0.00         2
          17       0.00      0.00      0.00         1
          18       1.00      1.00      1.00         4
          19       0.22      0.22      0.22       144
          20 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# UnderSampling Random Forest

In [14]:
!pip3 install imbalanced-learn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import RandomUnderSampler

# Load the dataset
file_path = 'Vehicle MPG - 1984 to 2023.csv'
data = pd.read_csv(file_path)

# Selecting specific columns for features and target
features = ['Engine Displacement', 'Drive', 'Transmission', 'Vehicle Class', 'Fuel Type 1', 'Model Year']
target = 'Make'
data = data[features + [target]]

# Handling categorical variables with Label Encoding
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column].astype(str))
    label_encoders[column] = le

# Imputing missing values
imputer = SimpleImputer(strategy='mean')
data[features] = imputer.fit_transform(data[features])

# Separating features and target
X = data.drop(target, axis=1)
y = data[target]

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Applying undersampling
undersampler = RandomUnderSampler(random_state=42)
X_train_us, y_train_us = undersampler.fit_resample(X_train, y_train)

# Creating and training the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_us, y_train_us)

# Predicting the Test set results
y_pred = clf.predict(X_test)

# Evaluating the results
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.08660130718954248
Classification Report:
               precision    recall  f1-score   support

           0       0.01      0.25      0.02         4
           1       0.00      0.00      0.00         0
           2       0.04      0.12      0.06        68
           3       0.09      0.44      0.14        18
           4       0.01      0.60      0.02         5
           5       0.32      0.40      0.36        30
           6       0.14      0.06      0.09       242
           7       0.00      0.00      0.00         0
           8       0.10      0.50      0.17         2
           9       0.00      0.00      0.00         0
          10       0.09      1.00      0.17         1
          11       0.40      0.05      0.09       473
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         0
          14       0.10      0.28      0.15        36
          15       0.03      1.00      0.06         1
          16       0.04    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
