In [1]:
import pandas as pd
import numpy as np

# Columns kept for this exercise: four features plus the regression target.
columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]

# Read the CSV and immediately narrow it down to the columns above.
df = pd.read_csv('/home/regent/Downloads/DataTalk/01/car_fuel_efficiency.csv')[columns]

# Report how many NaNs each column contains.
print("Missing values in each column:")
print(df.isna().sum())

# Seeded 80/20 shuffle split: the first n_train shuffled positions are used
# for training, the remainder for validation.
np.random.seed(42)
n = len(df)
n_val = int(0.2 * n)
n_train = n - n_val

idx = np.arange(n)
np.random.shuffle(idx)
train_idx, val_idx = idx[:n_train], idx[n_train:]

df_train = df.iloc[train_idx].copy()
df_val = df.iloc[val_idx].copy()

print(f"\nTraining set size: {len(df_train)}")
print(f"Validation set size: {len(df_val)}")

# Linear regression implementation from lessons
def train_linear_regression(X, y):
    """Fit ordinary least squares through the normal equations.

    A column of ones is prepended to ``X`` so that the first learned weight
    acts as the intercept.

    Args:
        X: NumPy array of features, one row per sample.
        y: NumPy array of targets, one entry per row of ``X``.

    Returns:
        Tuple ``(w0, w)``: the intercept and the array of remaining weights,
        ordered like the columns of ``X``.

    Raises:
        numpy.linalg.LinAlgError: if ``X^T X`` is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    # Solve (X^T X) w = X^T y directly rather than forming the explicit
    # inverse: same solution, but fewer floating-point operations and less
    # round-off amplification when X^T X is poorly conditioned.
    w = np.linalg.solve(XTX, X.T.dot(y))
    return w[0], w[1:]

def rmse(y, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y: array of true target values.
        y_pred: array of predicted values, same shape as ``y``.
    """
    squared_residuals = (y_pred - y) ** 2
    return np.sqrt(squared_residuals.mean())

def prepare_data(df, feature_columns, target_column, fill_method='zero', train_means=None):
    """Impute missing values and split a frame into features and target.

    The input frame is not modified; imputation is applied to a new frame.
    Note that imputation runs over the whole frame, so NaNs in the target
    column are filled as well.

    Args:
        df: pandas DataFrame holding the feature and target columns.
        feature_columns: list of column names used to build ``X``.
        target_column: name of the column returned as ``y``.
        fill_method: ``'zero'`` replaces NaNs with 0; ``'mean'`` replaces
            them with per-column means.
        train_means: per-column fill values for ``fill_method='mean'``.
            When omitted, the means of the numeric columns of ``df`` itself
            are used. Pass the training-set means when preparing a
            validation frame.

    Returns:
        Tuple ``(X, y, train_means)``: the feature array, the target array,
        and the means used for imputation (``None`` for ``'zero'`` unless
        the caller supplied a value).

    Raises:
        ValueError: if ``fill_method`` is neither ``'zero'`` nor ``'mean'``.
    """
    if fill_method == 'zero':
        df_processed = df.fillna(0)
    elif fill_method == 'mean':
        if train_means is None:
            # numeric_only keeps this working when the frame also carries
            # non-numeric columns, which have no mean.
            train_means = df.mean(numeric_only=True)
        df_processed = df.fillna(train_means)
    else:
        # Previously an unknown method silently returned un-imputed data,
        # leaving NaNs to surface later inside the regression.
        raise ValueError(
            f"Unknown fill_method {fill_method!r}; expected 'zero' or 'mean'"
        )

    X = df_processed[feature_columns].values
    y = df_processed[target_column].values

    return X, y, train_means

# Option 1: Fill with 0
print("\n=== Option 1: Fill with 0 ===")
feature_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target_col = 'fuel_efficiency_mpg'

# Impute NaNs with 0 in both splits.
X_train_zero, y_train_zero, _ = prepare_data(df_train, feature_cols, target_col, 'zero')
X_val_zero, y_val_zero, _ = prepare_data(df_val, feature_cols, target_col, 'zero')

# Train model
w0_zero, w_zero = train_linear_regression(X_train_zero, y_train_zero)

# Predict on the validation set: prepend the bias column and multiply by
# the full weight vector [w0, w].
ones_val = np.ones(X_val_zero.shape[0])
X_val_with_ones_zero = np.column_stack([ones_val, X_val_zero])
y_pred_zero = X_val_with_ones_zero.dot(np.concatenate([[w0_zero], w_zero]))

# Calculate RMSE
rmse_zero = rmse(y_val_zero, y_pred_zero)
print(f"RMSE (fill with 0): {rmse_zero:.2f}")

# Option 2: Fill with mean (using training data means)
print("\n=== Option 2: Fill with mean ===")
# Means come from the training split only, so no validation statistics leak
# into the imputation.
train_means = df_train[feature_cols].mean()

# Pass the same training means to both calls so the two splits are
# demonstrably imputed with identical values.
X_train_mean, y_train_mean, _ = prepare_data(df_train, feature_cols, target_col, 'mean', train_means)
X_val_mean, y_val_mean, _ = prepare_data(df_val, feature_cols, target_col, 'mean', train_means)

# Train model
w0_mean, w_mean = train_linear_regression(X_train_mean, y_train_mean)

# Make predictions on validation set
ones_val = np.ones(X_val_mean.shape[0])
X_val_with_ones_mean = np.column_stack([ones_val, X_val_mean])
y_pred_mean = X_val_with_ones_mean.dot(np.concatenate([[w0_mean], w_mean]))

# Calculate RMSE
rmse_mean = rmse(y_val_mean, y_pred_mean)
print(f"RMSE (fill with mean): {rmse_mean:.2f}")

# Compare results on the rounded values that are actually reported, so the
# verdict can never contradict the two numbers printed just above it.
rmse_zero_rounded = round(rmse_zero, 2)
rmse_mean_rounded = round(rmse_mean, 2)

print("\n=== Comparison ===")
print(f"RMSE (fill with 0): {rmse_zero_rounded}")
print(f"RMSE (fill with mean): {rmse_mean_rounded}")

if rmse_zero_rounded < rmse_mean_rounded:
    print("Better option: Fill with 0")
elif rmse_zero_rounded > rmse_mean_rounded:
    print("Better option: Fill with mean")
else:
    # A tie used to be reported as "mean is better".
    print("Both methods give the same RMSE")

Missing values in each column:
engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

Training set size: 7764
Validation set size: 1940

=== Option 1: Fill with 0 ===
RMSE (fill with 0): 0.52

=== Option 2: Fill with mean ===
RMSE (fill with mean): 0.46

=== Comparison ===
RMSE (fill with 0): 0.52
RMSE (fill with mean): 0.46
Better option: Fill with mean


In [2]:
import pandas as pd
import numpy as np

# Load the CSV and keep the four features plus the regression target.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable data directory.
df = pd.read_csv('/home/regent/Downloads/DataTalk/01/car_fuel_efficiency.csv')
columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df = df[columns]

print("Dataset shape:", df.shape)
print("\nMissing values:")
print(df.isnull().sum())

# Seeded 80/20 shuffle split.
np.random.seed(42)
n = len(df)
n_val = int(0.2 * n)
n_train = n - n_val
idx = np.arange(n)
np.random.shuffle(idx)

# Slice by an explicit boundary instead of `idx[:-n_val]` / `idx[-n_val:]`:
# with n_val == 0 (fewer than 5 rows) the negative-index form yields an
# empty training set and puts every row into validation.
df_train = df.iloc[idx[:n_train]].copy()
df_val = df.iloc[idx[n_train:]].copy()

print(f"\nTraining: {len(df_train)}, Validation: {len(df_val)}")

# Linear regression functions
def linear_regression(X, y):
    """Fit ordinary least squares through the normal equations.

    Args:
        X: NumPy array of features, one row per sample. A column of ones is
            prepended internally for the intercept.
        y: NumPy array of targets, one entry per row of ``X``.

    Returns:
        Weight vector ``w`` whose first entry is the intercept, followed by
        one weight per column of ``X``.

    Raises:
        numpy.linalg.LinAlgError: if ``X^T X`` is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T @ X
    # Solve (X^T X) w = X^T y instead of multiplying by an explicit inverse;
    # this is cheaper and loses less precision on ill-conditioned inputs.
    w = np.linalg.solve(XTX, X.T @ y)
    return w

def predict(X, w):
    """Return predictions for ``X`` given weights ``w`` (intercept first).

    A column of ones is prepended to ``X`` so that ``w[0]`` acts as the bias.
    """
    bias_column = np.ones(X.shape[0])
    design_matrix = np.column_stack((bias_column, X))
    return design_matrix @ w

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    residuals = y_true - y_pred
    mean_squared_error = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error)

# Names shared by both experiments below.
feature_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
banner = "=" * 50

# ---- Method 1: impute NaNs with 0 ----
print("\n" + banner, "METHOD 1: Filling missing values with 0", banner, sep="\n")

# Feature matrices with NaNs replaced by 0; targets are taken as-is.
X_train_zero = df_train[feature_cols].fillna(0).values
y_train_zero = df_train['fuel_efficiency_mpg'].values

X_val_zero = df_val[feature_cols].fillna(0).values
y_val_zero = df_val['fuel_efficiency_mpg'].values

# Fit on the training split, score on the validation split.
w_zero = linear_regression(X_train_zero, y_train_zero)
y_pred_zero = predict(X_val_zero, w_zero)
rmse_zero = rmse(y_val_zero, y_pred_zero)

print(f"RMSE (fill with 0): {rmse_zero:.2f}")

# ---- Method 2: impute NaNs with training-set means ----
print(
    "\n" + banner,
    "METHOD 2: Filling missing values with mean from training data",
    banner,
    sep="\n",
)

# The means are computed on the training split only and reused for the
# validation split.
train_means = df_train[feature_cols].mean()

print("Training means:")
print(train_means)

X_train_mean = df_train[feature_cols].fillna(train_means).values
y_train_mean = df_train['fuel_efficiency_mpg'].values

X_val_mean = df_val[feature_cols].fillna(train_means).values
y_val_mean = df_val['fuel_efficiency_mpg'].values

w_mean = linear_regression(X_train_mean, y_train_mean)
y_pred_mean = predict(X_val_mean, w_mean)
rmse_mean = rmse(y_val_mean, y_pred_mean)

print(f"RMSE (fill with mean): {rmse_mean:.2f}")

# ---- Final comparison on values rounded to two decimals ----
rounded_zero = round(rmse_zero, 2)
rounded_mean = round(rmse_mean, 2)

print("\n" + banner, "FINAL COMPARISON", banner, sep="\n")
print(f"RMSE (fill with 0): {rounded_zero}")
print(f"RMSE (fill with mean): {rounded_mean}")

if rounded_zero < rounded_mean:
    print("\n✓ Better option: Fill with 0")
elif rounded_zero > rounded_mean:
    print("\n✓ Better option: Fill with mean")
else:
    print("\n✓ Both methods give the same RMSE")

Dataset shape: (9704, 5)

Missing values:
engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

Training: 7764, Validation: 1940

METHOD 1: Filling missing values with 0
RMSE (fill with 0): 0.52

METHOD 2: Filling missing values with mean from training data
Training means:
engine_displacement     199.781041
horsepower              149.667037
vehicle_weight         3002.950706
model_year             2011.412416
dtype: float64
RMSE (fill with mean): 0.46

FINAL COMPARISON
RMSE (fill with 0): 0.52
RMSE (fill with mean): 0.46

✓ Better option: Fill with mean


In [3]:
import pandas as pd
import numpy as np

# Columns kept for this exercise: four features plus the regression target.
columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]

# Read the CSV and narrow it down to the columns above.
df = pd.read_csv('/home/regent/Downloads/DataTalk/01/car_fuel_efficiency.csv')[columns]

# Per-column NaN counts.
missing_counts = df.isna().sum()
print("Missing values in each column:")
print(missing_counts)

# Names of the columns that contain at least one NaN.
columns_with_missing = [name for name, count in missing_counts.items() if count > 0]
print(f"\nColumns with missing values: {columns_with_missing}")

# Seeded 80/20 shuffle split: the first n_train shuffled positions go to
# training, the rest to validation.
np.random.seed(42)
n = len(df)
n_val = int(0.2 * n)
n_train = n - n_val

idx = np.arange(n)
np.random.shuffle(idx)
train_idx, val_idx = idx[:n_train], idx[n_train:]

df_train = df.iloc[train_idx].copy()
df_val = df.iloc[val_idx].copy()

print(f"\nTraining set size: {len(df_train)}")
print(f"Validation set size: {len(df_val)}")

# Linear regression implementation
def train_linear_regression(X, y):
    """Fit ordinary least squares through the normal equations.

    A column of ones is prepended to ``X`` so that the first learned weight
    acts as the intercept.

    Args:
        X: NumPy array of features, one row per sample.
        y: NumPy array of targets, one entry per row of ``X``.

    Returns:
        Tuple ``(w0, w)``: the intercept and the array of remaining weights,
        ordered like the columns of ``X``.

    Raises:
        numpy.linalg.LinAlgError: if ``X^T X`` is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    # Solving the linear system avoids materialising (X^T X)^-1, which is
    # both slower and less accurate when the matrix is poorly conditioned.
    w = np.linalg.solve(XTX, X.T.dot(y))
    return w[0], w[1:]

def rmse(y, y_pred):
    """Root-mean-squared error of ``y_pred`` against the true values ``y``."""
    return np.sqrt(((y_pred - y) ** 2).mean())

def prepare_data(df, feature_columns, fill_method='zero', train_means=None,
                 missing_columns=None, target_column='fuel_efficiency_mpg'):
    """Impute selected columns and split a frame into features and target.

    Only the columns named in ``missing_columns`` (and present in ``df``)
    are imputed; every other column is passed through untouched. The input
    frame is not modified.

    Args:
        df: pandas DataFrame holding the feature and target columns.
        feature_columns: list of column names used to build ``X``.
        fill_method: ``'zero'`` fills NaNs with 0; ``'mean'`` fills them
            with per-column means.
        train_means: mapping ``column -> fill value`` for
            ``fill_method='mean'``. When omitted, the means are computed
            from ``df`` itself. Pass the training-set means when preparing
            a validation frame.
        missing_columns: names of the columns to impute. Defaults to the
            module-level ``columns_with_missing`` list, which was the only
            option before this parameter existed.
        target_column: name of the column returned as ``y``.

    Returns:
        Tuple ``(X, y, train_means)``: the feature array, the target array,
        and the mapping of means used (``None`` for ``'zero'`` unless the
        caller supplied one).

    Raises:
        ValueError: if ``fill_method`` is neither ``'zero'`` nor ``'mean'``.
        KeyError: if a supplied ``train_means`` lacks a column to impute.
    """
    if missing_columns is None:
        # Backward-compatible fallback to the notebook-level global.
        missing_columns = columns_with_missing
    cols_to_fill = [col for col in missing_columns if col in df.columns]

    if fill_method == 'zero':
        fill_values = {col: 0 for col in cols_to_fill}
    elif fill_method == 'mean':
        if train_means is None:
            train_means = {col: df[col].mean() for col in cols_to_fill}
        fill_values = {col: train_means[col] for col in cols_to_fill}
    else:
        # Previously an unknown method silently returned un-imputed data.
        raise ValueError(
            f"Unknown fill_method {fill_method!r}; expected 'zero' or 'mean'"
        )

    df_processed = df.copy()
    for col in cols_to_fill:
        df_processed[col] = df_processed[col].fillna(fill_values[col])

    X = df_processed[feature_columns].values
    y = df_processed[target_column].values

    return X, y, train_means

# Feature columns (all except target)
feature_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

# Option 1: Fill only missing columns with 0
print("\n=== Option 1: Fill only missing columns with 0 ===")
X_train_zero, y_train_zero, _ = prepare_data(df_train, feature_cols, 'zero')
X_val_zero, y_val_zero, _ = prepare_data(df_val, feature_cols, 'zero')

# Train model
w0_zero, w_zero = train_linear_regression(X_train_zero, y_train_zero)

# Predict on the validation set: prepend the bias column and multiply by
# the full weight vector [w0, w].
ones_val = np.ones(X_val_zero.shape[0])
X_val_with_ones_zero = np.column_stack([ones_val, X_val_zero])
y_pred_zero = X_val_with_ones_zero.dot(np.concatenate([[w0_zero], w_zero]))

# Calculate RMSE
rmse_zero = rmse(y_val_zero, y_pred_zero)
print(f"RMSE (fill missing with 0): {rmse_zero:.2f}")

# Option 2: Fill only missing columns with mean (using training data means)
print("\n=== Option 2: Fill only missing columns with mean ===")
# Means come from the training split only, and only for the columns that
# actually contain NaNs.
train_means_dict = {
    col: df_train[col].mean()
    for col in columns_with_missing
    if col in df_train.columns
}

print("Training means for columns with missing values:")
for col, mean_val in train_means_dict.items():
    print(f"  {col}: {mean_val:.2f}")

# Pass the same training means to both calls so both splits are
# demonstrably imputed with identical values.
X_train_mean, y_train_mean, _ = prepare_data(df_train, feature_cols, 'mean', train_means_dict)
X_val_mean, y_val_mean, _ = prepare_data(df_val, feature_cols, 'mean', train_means_dict)

# Train model
w0_mean, w_mean = train_linear_regression(X_train_mean, y_train_mean)

# Make predictions on validation set
ones_val = np.ones(X_val_mean.shape[0])
X_val_with_ones_mean = np.column_stack([ones_val, X_val_mean])
y_pred_mean = X_val_with_ones_mean.dot(np.concatenate([[w0_mean], w_mean]))

# Calculate RMSE
rmse_mean = rmse(y_val_mean, y_pred_mean)
print(f"RMSE (fill missing with mean): {rmse_mean:.2f}")

# Compare results on the rounded values that are reported.
rmse_zero_rounded = round(rmse_zero, 2)
rmse_mean_rounded = round(rmse_mean, 2)

print("\n=== Final Comparison ===")
print(f"RMSE (fill missing with 0): {rmse_zero_rounded}")
print(f"RMSE (fill missing with mean): {rmse_mean_rounded}")

if rmse_zero_rounded < rmse_mean_rounded:
    print("✓ Better option: Fill missing values with 0")
elif rmse_zero_rounded > rmse_mean_rounded:
    print("✓ Better option: Fill missing values with mean")
else:
    # A tie used to fall into the "mean is better" branch.
    print("✓ Both methods give the same RMSE")

Missing values in each column:
engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

Columns with missing values: ['horsepower']

Training set size: 7764
Validation set size: 1940

=== Option 1: Fill only missing columns with 0 ===
RMSE (fill missing with 0): 0.52

=== Option 2: Fill only missing columns with mean ===
Training means for columns with missing values:
  horsepower: 149.67
RMSE (fill missing with mean): 0.46

=== Final Comparison ===
RMSE (fill missing with 0): 0.52
RMSE (fill missing with mean): 0.46
✓ Better option: Fill missing values with mean


In [4]:
import pandas as pd
import numpy as np

# Load the CSV and keep the four features plus the regression target.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable data directory.
df = pd.read_csv('/home/regent/Downloads/DataTalk/01/car_fuel_efficiency.csv')
columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df = df[columns]

print("Missing values analysis:")
print(df.isnull().sum())

# Identify which columns actually have missing values
missing_cols = df.columns[df.isnull().any()].tolist()
print(f"\nColumns with missing values: {missing_cols}")

# Seeded 80/20 shuffle split.
np.random.seed(42)
n_val = int(0.2 * len(df))
n_train = len(df) - n_val
idx = np.arange(len(df))
np.random.shuffle(idx)

# Slice by an explicit boundary instead of `idx[:-n_val]` / `idx[-n_val:]`:
# with n_val == 0 (fewer than 5 rows) the negative-index form yields an
# empty training set and puts every row into validation.
df_train = df.iloc[idx[:n_train]].copy()
df_val = df.iloc[idx[n_train:]].copy()

# Linear regression functions
def linear_regression(X, y):
    """Fit ordinary least squares through the normal equations.

    Args:
        X: NumPy array of features, one row per sample. A column of ones is
            prepended internally for the intercept.
        y: NumPy array of targets, one entry per row of ``X``.

    Returns:
        Weight vector whose first entry is the intercept, followed by one
        weight per column of ``X``.

    Raises:
        numpy.linalg.LinAlgError: if ``X^T X`` is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T @ X
    # Solve the system directly; forming the explicit inverse costs more
    # and amplifies round-off when X^T X is ill-conditioned.
    return np.linalg.solve(XTX, X.T @ y)

def predict(X, w):
    """Apply the weight vector ``w`` (intercept first) to the rows of ``X``."""
    with_bias = np.column_stack((np.ones(X.shape[0]), X))
    return with_bias @ w

def rmse(y_true, y_pred):
    """Root-mean-squared error between true and predicted values."""
    squared_errors = (y_true - y_pred) ** 2
    return np.sqrt(squared_errors.mean())

# Feature columns and target shared by both experiments (previously the
# feature list was spelled out four times).
feature_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target_col = 'fuel_efficiency_mpg'

# Method 1: Fill only missing columns with 0
print("\nMETHOD 1: Fill only missing columns with 0")
df_train_zero = df_train.copy()
df_val_zero = df_val.copy()

# Only fill columns that have missing values
for col in missing_cols:
    df_train_zero[col] = df_train_zero[col].fillna(0)
    df_val_zero[col] = df_val_zero[col].fillna(0)

X_train_zero = df_train_zero[feature_cols].values
y_train_zero = df_train_zero[target_col].values
X_val_zero = df_val_zero[feature_cols].values
y_val_zero = df_val_zero[target_col].values

# Fit on the training split, score on the validation split.
w_zero = linear_regression(X_train_zero, y_train_zero)
y_pred_zero = predict(X_val_zero, w_zero)
rmse_zero = rmse(y_val_zero, y_pred_zero)

# Method 2: Fill only missing columns with mean from training data
print("\nMETHOD 2: Fill only missing columns with mean from training data")
df_train_mean = df_train.copy()
df_val_mean = df_val.copy()

# Means come from the training split only and are reused for validation,
# so no validation statistics leak into the imputation.
train_means = {col: df_train[col].mean() for col in missing_cols}

# Fill only columns with missing values
for col in missing_cols:
    df_train_mean[col] = df_train_mean[col].fillna(train_means[col])
    df_val_mean[col] = df_val_mean[col].fillna(train_means[col])

X_train_mean = df_train_mean[feature_cols].values
y_train_mean = df_train_mean[target_col].values
X_val_mean = df_val_mean[feature_cols].values
y_val_mean = df_val_mean[target_col].values

w_mean = linear_regression(X_train_mean, y_train_mean)
y_pred_mean = predict(X_val_mean, w_mean)
rmse_mean = rmse(y_val_mean, y_pred_mean)

# Results, compared on the rounded values that are reported.
rmse_zero_rounded = round(rmse_zero, 2)
rmse_mean_rounded = round(rmse_mean, 2)

print("\nFINAL RESULTS:")
print(f"RMSE (fill missing with 0): {rmse_zero_rounded}")
print(f"RMSE (fill missing with mean): {rmse_mean_rounded}")

if rmse_zero_rounded < rmse_mean_rounded:
    print("✓ Better option: Fill missing values with 0")
elif rmse_zero_rounded > rmse_mean_rounded:
    print("✓ Better option: Fill missing values with mean")
else:
    # A tie used to fall into the "mean is better" branch.
    print("✓ Both methods give the same RMSE")

Missing values analysis:
engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

Columns with missing values: ['horsepower']

METHOD 1: Fill only missing columns with 0

METHOD 2: Fill only missing columns with mean from training data

FINAL RESULTS:
RMSE (fill missing with 0): 0.52
RMSE (fill missing with mean): 0.46
✓ Better option: Fill missing values with mean
