In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Short snake_case names for the CSV headers.
# Keys are the headers as they look once spaces have been swapped for underscores.
renamed_columns = {
    'Make': 'make',
    'Model': 'model',
    'Vehicle_Class': 'vehicle_class',
    'Engine_Size(L)': 'engine_size',
    'Cylinders': 'cylinders',
    'Transmission': 'transmission',
    'Fuel_Type': 'fuel_type',
    'Fuel_Consumption_City_(L/100_km)': 'fuel_cons_city',
    'Fuel_Consumption_Hwy_(L/100_km)': 'fuel_cons_hwy',
    'Fuel_Consumption_Comb_(L/100_km)': 'fuel_cons_comb',
    'Fuel_Consumption_Comb_(mpg)': 'mpg_fuel_cons_comb',
    'CO2_Emissions(g/km)': 'co2'
}

# Read the CSV, then normalise the headers in two passes:
# 1) replace every space with an underscore, 2) apply the mapping above.
data = pd.read_csv('CO2 Emissions_Canada.csv')
data = data.rename(columns=lambda header: header.replace(' ', '_'))
data = data.rename(columns=renamed_columns)

# Data Exploration
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare; wrapping it in print() appends a stray "None" line to the output.
data.info()
# A bare expression is only echoed when it is the last line of a cell, so the
# transposed summary statistics are printed explicitly to make them show up.
print(data.describe().T)

# Checking Missing Values: count the nulls in every column
missing_values = data.isnull().sum()
print("Missing Values:")
print(missing_values)

# Check for duplicated rows.
# The count is computed once and reused in the message; a bare
# `data.duplicated().sum()` in the middle of a cell is evaluated but never shown.
# (1103 duplicated rows were reported when this notebook was written.)
duplicate_count = data.duplicated().sum()
print(f"Dataset length is {len(data)} rows with {duplicate_count} duplicated rows")
# Every row is already a complete record on its own, so exact repeats add no
# information and are dropped.
# Rebinding (instead of inplace=True) keeps the step explicit; the original row
# labels are kept, i.e. the index is not reset.
data = data.drop_duplicates()

# Print the distinct values found in each column, one column at a time
for column_name, column_values in data.items():
    print(f"{column_name}:\n{column_values.unique()}\n")
# No odd values were spotted in any column. "Odd" here means strings inside numeric columns (or the reverse), or words that do not belong to the column they appear in.

# One-hot encode categorical columns (drop_first=True omits the first level of each column)
make_dummies = pd.get_dummies(data['make'], prefix='Make', drop_first=True)
model_dummies = pd.get_dummies(data['model'], prefix='Model', drop_first=True)
vehicle_class_dummies = pd.get_dummies(data['vehicle_class'], prefix='Vehicle_Class', drop_first=True)
transmission_dummies = pd.get_dummies(data['transmission'], prefix='Transmission', drop_first=True)
fuel_type_dummies = pd.get_dummies(data['fuel_type'], prefix='Fuel_Type', drop_first=True)
dummy_frames = [make_dummies, model_dummies, vehicle_class_dummies, transmission_dummies, fuel_type_dummies]

# Concatenate dummy variables back to the original DataFrame.
# `data` is overwritten here, so running this cell a second time would append
# the same dummy columns again and leave duplicate column labels. Any dummy
# columns left over from a previous run are therefore removed first, which
# makes the cell safe to re-run. The raw categorical columns are kept.
dummy_column_names = [name for frame in dummy_frames for name in frame.columns]
base_columns = data.loc[:, ~data.columns.isin(dummy_column_names)]
data = pd.concat([base_columns, *dummy_frames], axis=1)
data.head()

In [None]:
# Baseline model: use every column except the target and the raw categorical
# columns (their one-hot encoded counterparts stay in the feature matrix).
# Everything is coerced to numeric; values that cannot be parsed become NaN.
X = (
    data
    .drop(columns=['co2', 'make', 'model', 'vehicle_class', 'transmission', 'fuel_type'])
    .apply(pd.to_numeric, errors='coerce')
)
y = pd.to_numeric(data['co2'], errors='coerce')

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares regression (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

In [None]:
# Correlation matrix
# Restrict the heatmap to the measured numeric columns, selected by name.
# Selecting by dtype is not reliable at this point: depending on the pandas
# version, the one-hot columns added earlier are bool or uint8, and uint8
# counts as np.number, which would put every dummy column on the heatmap.
numeric_columns = [
    'engine_size',
    'cylinders',
    'fuel_cons_city',
    'fuel_cons_hwy',
    'fuel_cons_comb',
    'mpg_fuel_cons_comb',
    'co2',
]
numeric_data = data[numeric_columns]
# Fill missing values with the column mean (rebinding, so `data` itself is untouched)
numeric_data = numeric_data.fillna(numeric_data.mean())
# Pairwise correlation between the numeric columns
numeric_corr = numeric_data.corr()
plt.figure(figsize=(9, 9))
sns.heatmap(numeric_corr, cbar=True, square=True, annot=True, fmt='.2f', cmap='Blues')
plt.title('Correlation Heatmap')
plt.show()
# Every column is correlated with co2. Only mpg_fuel_cons_comb is negatively correlated; the rest are positively correlated.
# mpg_fuel_cons_comb and fuel_cons_comb measure the same thing in different units, so only the one with the higher correlation is used: fuel_cons_comb (L/100 km).

In [None]:
def plot_mean_co2(mean_co2, label):
    """Draw one bar chart of mean CO2 emissions per category.

    Parameters
    ----------
    mean_co2 : pandas.Series
        Mean CO2 emissions indexed by category value.
    label : str
        Human-readable category name, used in the title and the x-axis label.
    """
    # Sort once and reuse the result for both axes
    ordered = mean_co2.sort_values()
    plt.figure(figsize=(12, 6))
    sns.barplot(x=ordered.index, y=ordered.values)
    plt.title(f'Mean CO2 Emissions by {label}')
    plt.xlabel(label)
    plt.ylabel('Mean CO2 Emissions')
    plt.xticks(rotation=90)
    plt.show()


# Group data by categories and calculate mean CO2 emissions
make_co2_mean = data.groupby('make')['co2'].mean()
vehicle_class_co2_mean = data.groupby('vehicle_class')['co2'].mean()
fuel_type_co2_mean = data.groupby('fuel_type')['co2'].mean()
engine_size_co2_mean = data.groupby('engine_size')['co2'].mean()
cylinders_co2_mean = data.groupby('cylinders')['co2'].mean()
transmission_co2_mean = data.groupby('transmission')['co2'].mean()

# One figure per grouping, in the same order as before
for mean_co2, label in [
    (make_co2_mean, 'Make'),
    (vehicle_class_co2_mean, 'Vehicle Class'),
    (fuel_type_co2_mean, 'Fuel Type'),
    (engine_size_co2_mean, 'Engine Size'),
    (cylinders_co2_mean, 'Cylinders'),
    (transmission_co2_mean, 'Transmission'),
]:
    plot_mean_co2(mean_co2, label)

# Observations from the charts:
# - CO2 emissions also differ by Make, Vehicle Class and Transmission
# - Fuel Type has a small effect on CO2 emissions
# - Engine Size and Cylinders clearly increase CO2 emissions

In [None]:
# Second model: a hand-picked set of numeric features instead of every column
selected_features = ['engine_size', 'cylinders', 'fuel_cons_comb', 'fuel_cons_city', 'fuel_cons_hwy']

# Coerce features and target to numeric; values that cannot be parsed become NaN
X_new = data[selected_features].apply(pd.to_numeric, errors='coerce')
y_new = pd.to_numeric(data['co2'], errors='coerce')

# Same 80/20 split settings and seed as the baseline model
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new,
    y_new,
    test_size=0.2,
    random_state=42,
)

# Fit the linear regression on the reduced feature set (fit() returns the estimator)
model_new = LinearRegression().fit(X_train_new, y_train_new)

# Score the held-out rows
y_pred_new = model_new.predict(X_test_new)
mse_new = mean_squared_error(y_test_new, y_pred_new)
r2_new = r2_score(y_test_new, y_pred_new)

print(f'New Model - Mean Squared Error: {mse_new}')
print(f'New Model - R-squared: {r2_new}')

In [None]:
# Random Forest on the selected features: 100 trees, fixed seed for repeatable results.
# fit() returns the estimator, so construction and training are chained.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train_new, y_train_new)

# Predict the held-out rows and score them with the same metrics as the linear models
rf_y_pred = rf_model.predict(X_test_new)
rf_mse = mean_squared_error(y_test_new, rf_y_pred)
rf_r2 = r2_score(y_test_new, rf_y_pred)

print(f'Random Forest Model - Mean Squared Error: {rf_mse}')
print(f'Random Forest Model - R-squared: {rf_r2}')

In [None]:
# K-nearest-neighbours regression on the selected features.
# n_neighbors=5 is a starting point and can be tuned.
# NOTE(review): the features reach KNN exactly as they come out of the split,
# with no scaling step; distance-based models are sensitive to feature scale,
# so it may be worth evaluating a standardised variant.
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train_new, y_train_new)

# Predict the held-out rows and score them with the same metrics as the other models
knn_y_pred = knn_model.predict(X_test_new)
knn_mse = mean_squared_error(y_test_new, knn_y_pred)
knn_r2 = r2_score(y_test_new, knn_y_pred)

print(f'KNN Model - Mean Squared Error: {knn_mse}')
print(f'KNN Model - R-squared: {knn_r2}')