Q1


In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive/ so the dataset stored there can be read by path.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import pandas as pd

# Location of the fuel-efficiency CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/Colab Notebooks/Dleete_this/car_fuel_efficiency.csv'

# Load the dataset; empty and single-space cells are parsed as missing values (NaN).
df = pd.read_csv(
    filepath_or_buffer=file_path,
    na_values=['', ' '],
)

In [3]:
# Q1: count the missing values in each column used for modelling.
cols_of_interest = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
df_subset = df.loc[:, cols_of_interest]

# Per-column NaN totals.
missing_counts = df_subset.isna().sum()
print(missing_counts)

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64


Q2

In [4]:
# Q2: median of the 'horsepower' column (NaN entries are skipped by pandas' default).
horsepower_values = df['horsepower']
median_horsepower = horsepower_values.median()
print(median_horsepower)

149.0


Q3

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

Creating training, testing, and validation sets

In [6]:
# Keep only the modelling columns, as an independent copy of `df`.
df_filtered = df.loc[:, cols_of_interest].copy()

# Shuffle every row reproducibly (fixed seed) and renumber the index from 0.
df_shuffled = (
    df_filtered
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_shuffled

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,220,144.0,2535.887591,2009,16.642943
1,160,141.0,2741.170484,2019,16.298377
2,230,155.0,2471.880237,2017,18.591822
3,150,206.0,3748.164469,2015,11.818843
4,300,111.0,2135.716359,2006,19.402209
...,...,...,...,...,...
9699,210,163.0,1972.029124,2011,19.961672
9700,160,126.0,3011.588014,2009,14.651056
9701,290,187.0,2440.508039,2019,18.404435
9702,260,129.0,1865.404480,2019,20.502460


In [7]:
# First split: 60% of the shuffled rows become the training set, 40% are held back.
df_train, df_temp = train_test_split(df_shuffled, test_size=0.4, random_state=42)

# Second split: halve the held-back rows into the test and validation sets.
df_test, df_val = train_test_split(df_temp, test_size=0.5, random_state=42)

In [8]:
# Display the training split produced by the first train_test_split call.
df_train

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
2155,120,98.0,2152.640871,2008,18.830833
8916,190,140.0,2905.379446,2019,14.829684
3274,170,96.0,3782.290474,2014,10.316109
6307,240,137.0,3640.501744,2007,12.173032
8186,240,182.0,3003.041960,2022,15.141689
...,...,...,...,...,...
5734,220,210.0,3283.814417,2002,13.392067
5191,260,146.0,2515.437193,2016,17.176426
5390,240,194.0,2878.383283,2002,16.275557
860,170,175.0,2616.554012,2005,17.393379


In [9]:
# Display the test split (first half of the held-back rows).
df_test

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
1683,190,186.0,2955.506105,2021,14.370227
4267,210,120.0,3709.433973,2013,9.945553
1926,160,154.0,2876.536456,2018,15.719309
2933,220,181.0,3280.190782,2012,13.960268
334,230,175.0,2626.784941,2003,17.302281
...,...,...,...,...,...
2110,160,179.0,3791.505088,2008,12.175115
1530,190,162.0,2299.328013,2022,18.330736
3616,190,162.0,2039.200382,2019,20.333055
7799,180,151.0,2517.988344,2012,17.480960


In [10]:
# Display the validation split (second half of the held-back rows).
df_val

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
3973,220,181.0,2563.027777,2014,17.411819
1018,170,,3258.493827,2004,14.245590
2145,140,120.0,2059.906745,2022,19.820801
5338,170,215.0,2702.545113,2016,17.277785
3905,210,137.0,3417.292228,2016,12.757799
...,...,...,...,...,...
8329,180,166.0,3212.530163,2012,14.024731
7759,290,205.0,3403.401496,2019,13.868607
2006,230,137.0,2495.147187,2019,17.502574
4864,270,136.0,3919.577527,2017,9.837325


In [11]:
# Split each frame into a feature matrix (X) and the target vector (y).
# Training split
X_train_raw = df_train.drop(columns=['fuel_efficiency_mpg']).copy()
y_train = df_train.loc[:, 'fuel_efficiency_mpg'].copy()

# Validation split
X_val_raw = df_val.drop(columns=['fuel_efficiency_mpg']).copy()
y_val = df_val.loc[:, 'fuel_efficiency_mpg'].copy()

In [12]:
# Report how many 'horsepower' values are missing in the training features before any imputation.
print(f"NaNs in training set 'horsepower' before imputation: {X_train_raw['horsepower'].isnull().sum()}")

NaNs in training set 'horsepower' before imputation: 420


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Accumulator for the validation RMSE of each imputation strategy (label -> score).
results = dict()

# Name of the feature column that contains missing values.
missing_col = 'horsepower'

In [14]:
# Q3: compare two ways of imputing the missing values.
# Each variant fits a plain LinearRegression on the training split and is
# scored by RMSE on the validation split; the score is stored in `results`
# rounded to 2 decimals.

# Strategy 1: Impute with 0
X_train_zero = X_train_raw.fillna(0)
X_val_zero = X_val_raw.fillna(0)

# Train Model
model_zero = LinearRegression()
model_zero.fit(X_train_zero, y_train)

# Predict and Evaluate on Validation Set
y_val_pred_zero = model_zero.predict(X_val_zero)
rmse_zero = np.sqrt(mean_squared_error(y_val, y_val_pred_zero))
results['Fill with 0'] = round(rmse_zero, 2)


# Strategy 2: Impute with Training Mean
# The mean is taken from the training split only and reused for validation,
# so no validation data influences the imputed value.
train_mean_hp = X_train_raw[missing_col].mean()

# Restrict the fill to `missing_col`: a bare fillna(train_mean_hp) would also
# write the horsepower mean into NaNs of any other feature column.
X_train_mean = X_train_raw.fillna({missing_col: train_mean_hp})
X_val_mean = X_val_raw.fillna({missing_col: train_mean_hp})

# Train Model
model_mean = LinearRegression()
model_mean.fit(X_train_mean, y_train)

# Predict and Evaluate on Validation Set
y_val_pred_mean = model_mean.predict(X_val_mean)
rmse_mean = np.sqrt(mean_squared_error(y_val, y_val_pred_mean))
results['Fill with Training Mean'] = round(rmse_mean, 2)

In [15]:
# Comparison: print the validation RMSE recorded for each imputation strategy

for strategy in results:
    rmse = results[strategy]
    print(f"{strategy}: {rmse}")

Fill with 0: 0.52
Fill with Training Mean: 0.46


Q4

In [16]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [17]:
# Q4 setup: work on fresh copies of the raw feature frames.
X_train, X_val = X_train_raw.copy(), X_val_raw.copy()

# Regularization strengths to try, and a dict to hold the RMSE obtained for each.
alpha_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = dict()

# Column that contains the missing values.
missing_col = 'horsepower'

In [18]:
# Replace missing values with 0 in both splits
X_train_imputed, X_val_imputed = X_train.fillna(0), X_val.fillna(0)

# Standardize Data (needed for regularization):
# the scaler's statistics are learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)

# Apply the same training-derived scaling to the validation split
X_val_scaled = scaler.transform(X_val_imputed)

In [19]:
# Fit one Ridge model per regularization strength (r) and record its validation RMSE

for alpha in alpha_values:
    # fit() returns the estimator itself, so construction and training can be chained
    model_ridge = Ridge(alpha=alpha, random_state=42).fit(X_train_scaled, y_train)
    y_val_pred = model_ridge.predict(X_val_scaled)

    # RMSE = square root of the mean squared error, stored rounded to 2 decimals
    mse = mean_squared_error(y_val, y_val_pred)
    rmse = np.sqrt(mse)
    rmse_scores[alpha] = round(rmse, 2)

    print(f"Alpha (r) = {alpha:<5}: RMSE = {rmse_scores[alpha]}")

Alpha (r) = 0    : RMSE = 0.52
Alpha (r) = 0.01 : RMSE = 0.52
Alpha (r) = 0.1  : RMSE = 0.52
Alpha (r) = 1    : RMSE = 0.52
Alpha (r) = 5    : RMSE = 0.52
Alpha (r) = 10   : RMSE = 0.52
Alpha (r) = 100  : RMSE = 0.52


Q5

In [22]:
# Q5 setup: target and features taken from the filtered (not pre-shuffled) frame.
y = df_filtered['fuel_efficiency_mpg'].copy()
X_raw = df_filtered.drop(columns=['fuel_efficiency_mpg']).copy()

# Seeds 0..9: one train/validation split is evaluated per seed
seeds = list(range(10))
rmse_scores = []

In [23]:
# Q5: measure how much the validation RMSE depends on the random split.
# For every seed: split 60/20/20, fill NaN with 0, fit LinearRegression and
# score it on the validation part. The spread of the scores is then reported.

# Start from an empty list so that re-running this cell does not keep
# appending to the scores collected by a previous run.
rmse_scores = []

for seed in seeds:
    # Split Data: 60% train, then the remaining 40% is halved; the first half
    # is used for validation and the second half is discarded here.
    # Seed-specific names are used so the Q3/Q4 variables (X_train_raw,
    # y_train, X_val_raw, y_val, ...) are not overwritten by this loop.
    X_train_seed, X_holdout_seed, y_train_seed, y_holdout_seed = train_test_split(
        X_raw, y, test_size=0.4, random_state=seed
    )
    X_val_seed, _, y_val_seed, _ = train_test_split(
        X_holdout_seed, y_holdout_seed, test_size=0.5, random_state=seed
    )

    # Fill NAs with 0
    X_train_seed_imputed = X_train_seed.fillna(0)
    X_val_seed_imputed = X_val_seed.fillna(0)

    # Train Model
    model_seed = LinearRegression()
    model_seed.fit(X_train_seed_imputed, y_train_seed)

    # Evaluate on Validation Set
    y_val_pred_seed = model_seed.predict(X_val_seed_imputed)
    rmse_seed = np.sqrt(mean_squared_error(y_val_seed, y_val_pred_seed))
    rmse_scores.append(rmse_seed)

    print(f"Seed {seed}: RMSE = {rmse_seed:.4f}")

# Calculate Standard Deviation of all RMSE scores (np.std default: population std, ddof=0)
std_of_scores = np.std(rmse_scores)
rounded_std = round(std_of_scores, 3)

print(f"RMSE Scores Collected: {np.round(rmse_scores, 4)}")
print(f"Standard Deviation of all RMSE scores: {rounded_std}")

Seed 0: RMSE = 0.5180
Seed 1: RMSE = 0.5171
Seed 2: RMSE = 0.5198
Seed 3: RMSE = 0.5217
Seed 4: RMSE = 0.5114
Seed 5: RMSE = 0.5151
Seed 6: RMSE = 0.5146
Seed 7: RMSE = 0.5357
Seed 8: RMSE = 0.5185
Seed 9: RMSE = 0.5198
RMSE Scores Collected: [0.518  0.5171 0.5198 0.5217 0.5114 0.5151 0.5146 0.5357 0.5185 0.5198]
Standard Deviation of all RMSE scores: 0.006


Q6

In [24]:
# Q6 setup: fixed split seed, plus target and features from the filtered frame.
SEED = 9
y = df_filtered['fuel_efficiency_mpg']
X_raw = df_filtered.drop(columns=['fuel_efficiency_mpg'])

In [25]:
# Hold out 20% of the rows as the test set; the other 80% is used for training
X_train_val_raw, X_test_raw, y_train_val, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=SEED)

# Fill NA with 0 in both parts
X_test_imputed = X_test_raw.fillna(0)
X_train_val_imputed = X_train_val_raw.fillna(0)

# Train model with r=0.001
ALPHA = 0.001
model = Ridge(alpha=ALPHA, random_state=SEED).fit(X_train_val_imputed, y_train_val)

# Evaluate on Test set: RMSE is the square root of the mean squared error
y_test_pred = model.predict(X_test_imputed)
test_mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(test_mse)

# Round the result to 3 decimals for reporting
rounded_rmse = round(rmse, 3)

print(f"Test Set RMSE (r={ALPHA}, Imputation=0): {rounded_rmse}")

Test Set RMSE (r=0.001, Imputation=0): 0.521
