In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('car_fuel_efficiency.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


In [3]:
# Prepare the dataset - select only the required columns
required_columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df_filtered = df[required_columns].copy()

print("Filtered dataset shape:", df_filtered.shape)
print("\nFiltered dataset info:")
df_filtered.info()


Filtered dataset shape: (9704, 5)

Filtered dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   horsepower           8996 non-null   float64
 2   vehicle_weight       9704 non-null   float64
 3   model_year           9704 non-null   int64  
 4   fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(3), int64(2)
memory usage: 379.2 KB


In [4]:
# Display the first few rows of the filtered dataset
print("First 10 rows of filtered dataset:")
df_filtered.head(10)


First 10 rows of filtered dataset:


Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369
5,190,,2484.883986,2008,17.271818
6,240,127.0,3006.542287,2012,13.210412
7,150,239.0,3638.65778,2020,12.848884
8,250,174.0,2714.21931,2016,16.823554
9,150,123.0,3509.036569,2005,12.298355


In [6]:
# Question 1: Which column has missing values?

# Check for missing values in the filtered dataset
print("Missing values in filtered dataset:")
missing_values = df_filtered.isnull().sum()
print(missing_values)

print("\nAnswer to Question 1:")
for col in required_columns:
    if missing_values[col] > 0:
        print(f"✓ {col} has {missing_values[col]} missing values")
    else:
        print(f"✗ {col} has no missing values")


Missing values in filtered dataset:
engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

Answer to Question 1:
✗ engine_displacement has no missing values
✓ horsepower has 708 missing values
✗ vehicle_weight has no missing values
✗ model_year has no missing values
✗ fuel_efficiency_mpg has no missing values


In [7]:
# Question 2: What's the median (50% percentile) for variable 'horsepower'?
print("Question 2: Median horsepower")
print("=" * 40)

# Calculate the median horsepower
median_horsepower = df_filtered['horsepower'].median()
print(f"Median horsepower: {median_horsepower}")

# Also show some statistics for context
print(f"\nHorsepower statistics:")
print(f"Count: {df_filtered['horsepower'].count()}")
print(f"Mean: {df_filtered['horsepower'].mean():.2f}")
print(f"Median: {median_horsepower}")
print(f"Min: {df_filtered['horsepower'].min()}")
print(f"Max: {df_filtered['horsepower'].max()}")



Question 2: Median horsepower
Median horsepower: 149.0

Horsepower statistics:
Count: 8996
Mean: 149.66
Median: 149.0
Min: 37.0
Max: 271.0


In [8]:
# Question 3: Deal with missing values and compare RMSE
print("Question 3: Missing values handling")
print("=" * 50)

# First, let's prepare the train/val/test split as mentioned in the homework
# Shuffle the dataset with seed 42
np.random.seed(42)
n = len(df_filtered)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Create shuffled indices
idx = np.arange(n)
np.random.shuffle(idx)

# Split the data
df_train = df_filtered.iloc[idx[:n_train]].copy()
df_val = df_filtered.iloc[idx[n_train:n_train+n_val]].copy()
df_test = df_filtered.iloc[idx[n_train+n_val:]].copy()

# Reset indices
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

print(f"Train set: {len(df_train)} samples")
print(f"Validation set: {len(df_val)} samples")
print(f"Test set: {len(df_test)} samples")


Question 3: Missing values handling
Train set: 5824 samples
Validation set: 1940 samples
Test set: 1940 samples


In [9]:
# Define the linear regression training function (from the lessons)
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

# Define RMSE function
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

# Prepare features (excluding target variable)
features = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target = 'fuel_efficiency_mpg'

# Get target variables
y_train = df_train[target].values
y_val = df_val[target].values


In [10]:
# Option 1: Fill missing values with 0
print("Option 1: Fill missing values with 0")
print("-" * 40)

# Prepare training data with 0s
X_train_zero = df_train[features].fillna(0).values
X_val_zero = df_val[features].fillna(0).values

# Train model
w0_zero, w_zero = train_linear_regression(X_train_zero, y_train)

# Make predictions
y_pred_zero = w0_zero + X_val_zero.dot(w_zero)

# Calculate RMSE
rmse_zero = rmse(y_val, y_pred_zero)
print(f"RMSE (fill with 0): {rmse_zero:.2f}")


Option 1: Fill missing values with 0
----------------------------------------
RMSE (fill with 0): 0.52


In [11]:
# Option 2: Fill missing values with mean (calculated from training data only)
print("\nOption 2: Fill missing values with mean")
print("-" * 40)

# Calculate mean from training data only (as specified in homework)
horsepower_mean = df_train['horsepower'].mean()
print(f"Mean horsepower from training data: {horsepower_mean:.2f}")

# Prepare training data with mean
X_train_mean = df_train[features].copy()
X_train_mean['horsepower'] = X_train_mean['horsepower'].fillna(horsepower_mean)
X_train_mean = X_train_mean.values

# Prepare validation data with same mean
X_val_mean = df_val[features].copy()
X_val_mean['horsepower'] = X_val_mean['horsepower'].fillna(horsepower_mean)
X_val_mean = X_val_mean.values

# Train model
w0_mean, w_mean = train_linear_regression(X_train_mean, y_train)

# Make predictions
y_pred_mean = w0_mean + X_val_mean.dot(w_mean)

# Calculate RMSE
rmse_mean = rmse(y_val, y_pred_mean)
print(f"RMSE (fill with mean): {rmse_mean:.2f}")



Option 2: Fill missing values with mean
----------------------------------------
Mean horsepower from training data: 149.54
RMSE (fill with mean): 0.46


In [13]:
# Compare the results
print("\nComparison:")
print("=" * 30)
print(f"RMSE with 0:     {rmse_zero:.2f}")
print(f"RMSE with mean: {rmse_mean:.2f}")

if rmse_zero < rmse_mean:
    print("\n✓ Option 1 (fill with 0) gives better RMSE")
    answer = "With 0"
elif rmse_mean < rmse_zero:
    print("\n✓ Option 2 (fill with mean) gives better RMSE")
    answer = "With mean"
else:
    print("\n✓ Both options are equally good")
    answer = "Both are equally good"

print(f"\nAnswer to Question 3: {answer}")




Comparison:
RMSE with 0:     0.52
RMSE with mean: 0.46

✓ Option 2 (fill with mean) gives better RMSE

Answer to Question 3: With mean


In [15]:
# Question 4: Regularized linear regression
print("Question 4: Regularized linear regression")
print("=" * 50)

# Define regularized linear regression training function
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    
    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])  # Add regularization
    
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

# Fill missing values with 0 (as specified in question)
X_train_reg = df_train[features].fillna(0).values
X_val_reg = df_val[features].fillna(0).values




Question 4: Regularized linear regression


In [16]:
# Test different values of r
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = []

for r in r_values:
    # Train model with regularization
    w0, w = train_linear_regression_reg(X_train_reg, y_train, r=r)
    
    # Make predictions
    y_pred = w0 + X_val_reg.dot(w)
    
    # Calculate RMSE
    score = rmse(y_val, y_pred)
    rmse_scores.append(score)
    
    print(f"r = {r:6.2f} | RMSE = {score:.2f}")

# Find the best r (lowest RMSE)
best_idx = np.argmin(rmse_scores)
best_r = r_values[best_idx]
best_rmse = rmse_scores[best_idx]

print(f"\nBest regularization parameter:")
print(f"r = {best_r} gives RMSE = {best_rmse:.2f}")


r =   0.00 | RMSE = 0.52
r =   0.01 | RMSE = 0.52
r =   0.10 | RMSE = 0.52
r =   1.00 | RMSE = 0.52
r =   5.00 | RMSE = 0.52
r =  10.00 | RMSE = 0.52
r = 100.00 | RMSE = 0.52

Best regularization parameter:
r = 0.01 gives RMSE = 0.52


In [17]:


print(f"\nAnswer to Question 4: {best_r}")

# If there are multiple options with the same RMSE, select the smallest r
# (This is handled by the order in r_values and argmin selecting the first occurrence)
if best_r in [0, 0.01, 1, 10, 100]:
    print(f"✓ {best_r} is one of the homework options")
else:
    print(f"⚠ {best_r} is not in the homework options - select the closest one")



Answer to Question 4: 0.01
✓ 0.01 is one of the homework options


In [18]:
# Question 5: Testing different seed values
print("Question 5: Testing different seed values")
print("=" * 50)

# Test different seed values
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores_seeds = []

print("Testing different seeds...")
print("-" * 30)

for seed in seeds:
    # Set the seed
    np.random.seed(seed)
    
    # Split the data with this seed
    n = len(df_filtered)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test
    
    # Create shuffled indices
    idx = np.arange(n)
    np.random.shuffle(idx)
    
    # Split the data
    df_train_seed = df_filtered.iloc[idx[:n_train]].copy()
    df_val_seed = df_filtered.iloc[idx[n_train:n_train+n_val]].copy()
    
    # Reset indices
    df_train_seed = df_train_seed.reset_index(drop=True)
    df_val_seed = df_val_seed.reset_index(drop=True)
    
    # Prepare data (fill missing values with 0)
    X_train_seed = df_train_seed[features].fillna(0).values
    X_val_seed = df_val_seed[features].fillna(0).values
    y_train_seed = df_train_seed[target].values
    y_val_seed = df_val_seed[target].values
    
    # Train model without regularization
    w0, w = train_linear_regression(X_train_seed, y_train_seed)
    
    # Make predictions
    y_pred_seed = w0 + X_val_seed.dot(w)
    
    # Calculate RMSE
    score = rmse(y_val_seed, y_pred_seed)
    rmse_scores_seeds.append(score)
    
    print(f"Seed {seed:2d}: RMSE = {score:.3f}")

print(f"\nRMSE scores for all seeds: {[round(score, 3) for score in rmse_scores_seeds]}")


Question 5: Testing different seed values
Testing different seeds...
------------------------------
Seed  0: RMSE = 0.521
Seed  1: RMSE = 0.521
Seed  2: RMSE = 0.523
Seed  3: RMSE = 0.516
Seed  4: RMSE = 0.511
Seed  5: RMSE = 0.528
Seed  6: RMSE = 0.531
Seed  7: RMSE = 0.509
Seed  8: RMSE = 0.515
Seed  9: RMSE = 0.513

RMSE scores for all seeds: [np.float64(0.521), np.float64(0.521), np.float64(0.523), np.float64(0.516), np.float64(0.511), np.float64(0.528), np.float64(0.531), np.float64(0.509), np.float64(0.515), np.float64(0.513)]


In [19]:
# Calculate standard deviation
std_rmse = np.std(rmse_scores_seeds)
print(f"\nStandard deviation of RMSE scores: {std_rmse:.3f}")

# Find the closest option
options = [0.001, 0.006, 0.060, 0.600]
closest_option = min(options, key=lambda x: abs(x - std_rmse))
print(f"\nClosest option to {std_rmse:.3f}: {closest_option}")

print(f"\nAnswer to Question 5: {closest_option}")



Standard deviation of RMSE scores: 0.007

Closest option to 0.007: 0.006

Answer to Question 5: 0.006


In [20]:
# Question 6: Final model training and testing
print("Question 6: Final model on test dataset")
print("=" * 50)

# Split the dataset with seed 9 (as specified)
np.random.seed(9)
n = len(df_filtered)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Create shuffled indices
idx = np.arange(n)
np.random.shuffle(idx)

# Split the data
df_train_final = df_filtered.iloc[idx[:n_train]].copy()
df_val_final = df_filtered.iloc[idx[n_train:n_train+n_val]].copy()
df_test_final = df_filtered.iloc[idx[n_train+n_val:]].copy()

# Reset indices
df_train_final = df_train_final.reset_index(drop=True)
df_val_final = df_val_final.reset_index(drop=True)
df_test_final = df_test_final.reset_index(drop=True)

print(f"Train set: {len(df_train_final)} samples")
print(f"Validation set: {len(df_val_final)} samples")
print(f"Test set: {len(df_test_final)} samples")


Question 6: Final model on test dataset
Train set: 5824 samples
Validation set: 1940 samples
Test set: 1940 samples


In [21]:
# Combine train and validation datasets
print("Combining train and validation datasets...")
df_full_train = pd.concat([df_train_final, df_val_final])
df_full_train = df_full_train.reset_index(drop=True)

print(f"Combined train+val set: {len(df_full_train)} samples")

# Prepare the combined training data
X_full_train = df_full_train[features].fillna(0).values
y_full_train = df_full_train[target].values

# Prepare test data
X_test_final = df_test_final[features].fillna(0).values
y_test_final = df_test_final[target].values

print("Training final model with r=0.001...")


Combining train and validation datasets...
Combined train+val set: 7764 samples
Training final model with r=0.001...


In [22]:
# Train the final model with r=0.001
w0_final, w_final = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

# Make predictions on test set
y_pred_test = w0_final + X_test_final.dot(w_final)

# Calculate RMSE on test set
rmse_test = rmse(y_test_final, y_pred_test)

print(f"RMSE on test dataset: {rmse_test:.3f}")

# Find the closest option
options = [0.15, 0.515, 5.15, 51.5]
closest_option = min(options, key=lambda x: abs(x - rmse_test))
print(f"\nClosest option to {rmse_test:.3f}: {closest_option}")

print(f"\nAnswer to Question 6: {closest_option}")


RMSE on test dataset: 0.516

Closest option to 0.516: 0.515

Answer to Question 6: 0.515
