In [1]:
# Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Download the laptops dataset, skipping the fetch when the CSV already exists
# locally so that re-running the notebook does not download it again.
if not os.path.exists('../data/laptops.csv'):
    # `!` hands the command to the system shell, so `wget` must be installed;
    # `-P ../data` tells wget to save the file under ../data.
    !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv -P ../data

In [3]:
# Load the downloaded CSV into a DataFrame and preview its first rows
df = pd.read_csv('../data/laptops.csv')
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [4]:
# Normalize column names: lowercase them and replace spaces with underscores
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

In [5]:
# Keep only the three predictor columns plus the target (final_price)
subset_cols = [
    'ram',
    'storage',
    'screen',
    'final_price',
]
df_subset = df.loc[:, subset_cols]
df_subset.head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


In [6]:
# Question 1: There's one column with missing values. What is it?
# Count NaNs per column, then keep only the columns that have at least one.
missing_vals = df_subset.isna().sum()
missing_vals.loc[missing_vals.gt(0)]  # 'screen' column

screen    4
dtype: int64

In [7]:
# Question 2: What's the median (50th percentile) for variable 'ram'?
# Ask for the median directly rather than computing the whole describe() summary.
df_subset['ram'].median()

16.0

In [8]:
# Code to shuffle and split the dataset into train/valid/test sets (60-20-20 split)

def prepare_datasets(seed, df=None):
    """Shuffle a DataFrame and split it 60/20/20 into train/valid/test sets.

    Args:
        seed: Value passed to ``np.random.seed`` before shuffling the row
            order. Note that this reseeds NumPy's global random state.
        df: DataFrame that contains a 'final_price' column. When omitted,
            the notebook-level ``df_subset`` is used, which keeps existing
            ``prepare_datasets(seed=...)`` calls working unchanged.

    Returns:
        Tuple ``(df_train, df_valid, df_test, y_train, y_valid, y_test)``.
        The three frames hold the features only ('final_price' removed) and
        carry a fresh 0..n-1 index; the three ``y_*`` values are NumPy arrays
        with the matching 'final_price' targets. The input frame is not
        modified.
    """
    if df is None:
        df = df_subset

    # Validation and test each take 20% (rounded down); train takes the rest
    n = df.shape[0]
    n_valid = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_valid - n_test

    # Seed the random number generator and shuffle the row positions
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    # Consecutive slices of the shuffled positions: train, valid, test
    row_groups = (
        idx[:n_train],
        idx[n_train:n_train + n_valid],
        idx[n_train + n_valid:],
    )

    frames = []
    targets = []
    for rows in row_groups:
        part = df.iloc[rows].reset_index(drop=True)
        # Pull out the target first, then drop it from the feature frame
        targets.append(part['final_price'].values)
        frames.append(part.drop(columns='final_price'))

    return tuple(frames + targets)

In [9]:
# Code to train a linear regression model

# Function for training linear regression model with regularization
def train_linear_regression(X, y, r=0.0):
    """Fit a linear regression with the (optionally regularized) normal equation.

    Args:
        X: 2-D feature matrix with one row per sample.
        y: Target values, one per row of X.
        r: Amount added to every diagonal entry of the Gram matrix before it
            is inverted. Because the bias column is part of that matrix, the
            bias term is regularized together with the weights.

    Returns:
        Tuple ``(bias, weights)``: the intercept and the per-feature weights.
    """
    # Prepend a column of ones so the bias is learned as an ordinary weight
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    # Gram matrix with r added on the diagonal, which stabilizes the inverse
    gram = design.T.dot(design)
    gram_inv = np.linalg.inv(gram + r * np.eye(design.shape[1]))

    # Parameter estimate via the left-inverse of the design matrix
    params = gram_inv.dot(design.T).dot(y)
    bias, weights = params[0], params[1:]
    return (bias, weights)

# Linear regression hypothesis function
def linear_regression_model(X, w0, w):
    """Return the linear-regression predictions ``X.dot(w) + w0``.

    Args:
        X: Feature matrix with one row per sample.
        w0: Bias (intercept) added to every prediction.
        w: Weight vector with one entry per column of X.
    """
    weighted_sum = X.dot(w)
    return weighted_sum + w0

# RMSE
def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y: Array of true target values.
        y_pred: Array of predicted values, compatible in shape with y.
    """
    residuals = y - y_pred
    return np.sqrt(np.square(residuals).mean())

In [10]:
# For questions 3 and 4: generate train, valid, and test sets with seed=42.
# Note: the cells for questions 5 and 6 rebind these same names using other seeds.
df_train, df_valid, df_test, y_train, y_valid, y_test = prepare_datasets(seed=42)

In [11]:
# Question 3:
# We need to deal with missing values for the column from Q1.
# We have two options: fill it with 0 or with the mean of this variable.
# Try both options. For each, train a linear regression model without regularization using the code from the lessons.
# For computing the mean, use the training set only!
# Use the validation dataset to evaluate the models and compare the RMSE of each option.
# Round the RMSE scores to 2 decimal digits using round(score, 2)
# Which option gives better RMSE?

# Options: With 0, With mean, Both are equally good

In [12]:
# Question 3a: Impute missing values with zero and train model

# Zero-filled copies of each split, converted to NumPy feature matrices
X_train_zero = df_train.fillna(0).to_numpy()
X_valid_zero = df_valid.fillna(0).to_numpy()
X_test_zero = df_test.fillna(0).to_numpy()

# Fit without regularization, then score on the validation split
w0, w = train_linear_regression(X_train_zero, y_train)
y_pred = linear_regression_model(X_valid_zero, w0, w)
round(rmse(y_valid, y_pred), 2)

597.36

In [13]:
# Question 3b: Impute missing values with mean and train model

# Mean of the 'screen' column, taken from the training split only
mean_value = df_train['screen'].mean()

# Imputed copies of each split; the original frames are left untouched
df_train_mean = df_train.assign(screen=df_train['screen'].fillna(mean_value))
df_valid_mean = df_valid.assign(screen=df_valid['screen'].fillna(mean_value))
df_test_mean = df_test.assign(screen=df_test['screen'].fillna(mean_value))

# NumPy feature matrices for the model
X_train_mean = df_train_mean.to_numpy()
X_valid_mean = df_valid_mean.to_numpy()
X_test_mean = df_test_mean.to_numpy()

# Fit without regularization, then score on the validation split
w0, w = train_linear_regression(X_train_mean, y_train)
y_pred = linear_regression_model(X_valid_mean, w0, w)
round(rmse(y_valid, y_pred), 2)

600.27

In [14]:
# Question 4:
# Now let's train a regularized linear regression.
# For this question, fill the NAs with 0.
# Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
# Use RMSE to evaluate the model on the validation dataset.
# Round the RMSE scores to 2 decimal digits.
# Which r gives the best RMSE?
# If there are multiple options, select the smallest r.

# Options: 0, 0.01, 1, 10, 100

# All values of r that we want to evaluate
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
combinations = []

# Train one model per candidate r on the zero-imputed matrices built in the
# question 3a cell, and record the rounded validation RMSE for each.
for r in r_values:
    w0, w = train_linear_regression(X_train_zero, y_train, r=r)
    y_pred = linear_regression_model(X_valid_zero, w0, w)
    score = round(rmse(y_valid, y_pred), 2)  # evaluated once, reused for both uses
    combinations.append([r, score])
    print('r: %s\trmse:%s' % (r, score))

# Only the r values offered as answer options are eligible (0.1 and 5 are not),
# so the reported optimum is the best among the options, not among all r_values.
choices = [0, 0.01, 1, 10, 100]
filtered_combinations = [pair for pair in combinations if pair[0] in choices]

# Minimize the RMSE and break ties explicitly in favour of the smallest r,
# as the question requires, instead of relying on list order and a stable sort.
r_opt = min(filtered_combinations, key=lambda pair: (pair[1], pair[0]))[0]
print('Optimal r: %s' % r_opt)

r: 0	rmse:597.36
r: 0.01	rmse:597.36
r: 0.1	rmse:597.35
r: 1	rmse:597.21
r: 5	rmse:597.01
r: 10	rmse:597.06
r: 100	rmse:597.9
Optimal r: 10


In [15]:
# Question 5
# We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
# Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
# For each seed, do the train/validation/test split with 60%/20%/20% distribution.
# Fill the missing values with 0 and train a model without regularization.
# For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
# What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
# Round the result to 3 decimal digits (round(std, 3))
# What's the value of std? 19.176 29.176 39.176 49.176

# All seed values that we want to evaluate
seed_candidates = list(range(10))
validation_scores = []
for seed in seed_candidates:
    # Fresh 60/20/20 split for this seed
    df_train, df_valid, df_test, y_train, y_valid, y_test = prepare_datasets(seed=seed)

    # Zero-imputed feature matrices for the train and validation splits
    X_train_zero = df_train.fillna(0).to_numpy()
    X_valid_zero = df_valid.fillna(0).to_numpy()

    # Unregularized fit, scored on the validation split (RMSE left unrounded)
    w0, w = train_linear_regression(X_train_zero, y_train)
    y_pred = linear_regression_model(X_valid_zero, w0, w)
    validation_scores.append(rmse(y_valid, y_pred))

# Spread of the validation RMSE across the seeds
round(np.std(validation_scores), 3)

29.176

In [16]:
# Question 6
# Split the dataset like previously, use seed 9.
# Combine train and validation datasets.
# Fill the missing values with 0 and train a model with r=0.001.
# What's the RMSE on the test dataset?
# Options: 598.60 608.60 618.60 628.60

# Generate train, valid, and test sets with seed=9
df_train, df_valid, df_test, y_train, y_valid, y_test = prepare_datasets(seed=9)

# Combine train and valid sets to create the full training set
df_full_train = pd.concat([df_train, df_valid]).fillna(0).reset_index(drop=True)
X_full_train = df_full_train.values
y_full_train = np.concatenate([y_train, y_valid])

# Create test feature matrix
X_test_zero = df_test.fillna(0, inplace=False).values

# Train with r=0.001 as the question specifies (the function's default is
# r=0.0, so r must be passed explicitly), then compute the test RMSE
w0, w = train_linear_regression(X_full_train, y_full_train, r=0.001)
y_pred = linear_regression_model(X_test_zero, w0, w)
round(rmse(y_test, y_pred), 2)

608.61