In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [21]:
# Load the dataset from the URL
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
df = pd.read_csv(url)
df

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.00
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.00
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.00
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.00
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01
...,...,...,...,...,...,...,...,...,...,...,...,...
2155,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3060,17.3,No,2699.99
2156,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3070,17.3,No,2899.99
2157,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,32,1000,SSD,RTX 3080,17.3,No,3399.99
2158,Razer Book 13 Intel Evo Core i7-1165G7/16GB/1T...,Refurbished,Razer,Book,Intel Evo Core i7,16,1000,SSD,,13.4,Yes,1899.99


In [23]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
df

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.00
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.00
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.00
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.00
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01
...,...,...,...,...,...,...,...,...,...,...,...,...
2155,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3060,17.3,No,2699.99
2156,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3070,17.3,No,2899.99
2157,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,32,1000,SSD,RTX 3080,17.3,No,3399.99
2158,Razer Book 13 Intel Evo Core i7-1165G7/16GB/1T...,Refurbished,Razer,Book,Intel Evo Core i7,16,1000,SSD,,13.4,Yes,1899.99


In [25]:
columns = ['ram', 'storage', 'screen', 'final_price']
subset_data = df[columns]
subset_data

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.00
1,8,256,15.6,299.00
2,8,256,15.6,789.00
3,16,1000,15.6,1199.00
4,16,512,15.6,669.01
...,...,...,...,...
2155,16,1000,17.3,2699.99
2156,16,1000,17.3,2899.99
2157,32,1000,17.3,3399.99
2158,16,1000,13.4,1899.99


### Question 1: There's one column with missing values. What is it?

* `'ram'`
* `'storage'`
* `'screen'`
* `'final_price'`

In [26]:
# Check for missing values
missing_values = subset_data.isnull().sum()
missing_values

ram            0
storage        0
screen         4
final_price    0
dtype: int64

### Question 2:  What's the median (50% percentile) for variable `'ram'`?

- 8
- 16
- 24
- 32


In [27]:
subset_data['ram'].median()

16.0

### Prepare and split the dataset

* Shuffle the dataset (the filtered one you created above), use seed `42`.
* Split your data in train/val/test sets, with 60%/20%/20% distribution.

In [29]:
# Step 1: Shuffle the dataset using seed 42
data_shuffled = subset_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 2: Split the dataset into 60% train, 20% validation, and 20% test sets
train_df, temp_data = train_test_split(data_shuffled, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_data, test_size=0.5, random_state=42)


In [30]:
# Display the shapes of the datasets
print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

Train shape: (1296, 4)
Validation shape: (432, 4)
Test shape: (432, 4)


### Question 3

* We need to deal with missing values for the column from Q1.
* We have two options: fill it with 0 or with the mean of this variable.
* Try both options. For each, train a linear regression model without regularization using the code from the lessons.
* For computing the mean, use the training only!
* Use the validation dataset to evaluate the models and compare the RMSE of each option.
* Round the RMSE scores to 2 decimal digits using `round(score, 2)`
* Which option gives better RMSE?

Options:

- With 0
- With mean
- Both are equally good

In [34]:
# Function to train a linear regression model and compute RMSE
def train_and_evaluate(train_data, val_data):
    # Define features and target
    X_train = train_data[['ram', 'storage', 'screen']]
    y_train = train_data['final_price']
    
    X_val = val_data[['ram', 'storage', 'screen']]
    y_val = val_data['final_price']
    
    # Train the linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_val)
    
    # Compute RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return round(rmse, 2)


In [35]:
# Option 1: Fill missing values with 0
train_df_zero = train_df.fillna(0)
val_df_zero = val_df.fillna(0)

rmse_zero = train_and_evaluate(train_df_zero, val_df_zero)

In [36]:

# Option 2: Fill missing values with the mean
mean_storage = train_df['storage'].mean()
train_df_mean = train_df.fillna(mean_storage)
val_df_mean = val_df.fillna(mean_storage)

rmse_mean = train_and_evaluate(train_df_mean, val_df_mean)

# Output RMSE results
print("RMSE with 0:", rmse_zero)
print("RMSE with mean:", rmse_mean)

RMSE with 0: 675.08
RMSE with mean: 675.42


In [37]:
if rmse_zero < rmse_mean:
    print("Filling with 0 gives better RMSE.")
elif rmse_zero > rmse_mean:
    print("Filling with mean gives better RMSE.")
else:
    print("Both options are equally good.")

Filling with 0 gives better RMSE.


### Question 4

* Now let's train a regularized linear regression.
* For this question, fill the NAs with 0. 
* Try different values of `r` from this list: `[0, 0.01, 0.1, 1, 5, 10, 100]`.
* Use RMSE to evaluate the model on the validation dataset.
* Round the RMSE scores to 2 decimal digits.
* Which `r` gives the best RMSE?

If there are multiple options, select the smallest `r`.

Options:

- 0
- 0.01
- 1
- 10
- 100

In [38]:
from sklearn.linear_model import Ridge

# Fill missing values with 0
train_df_zero = train_df.fillna(0)
val_df_zero = val_df.fillna(0)

# Prepare features and target
X_train = train_df_zero[['ram', 'storage', 'screen']]
y_train = train_df_zero['final_price']

X_val = val_df_zero[['ram', 'storage', 'screen']]
y_val = val_df_zero['final_price']

# Define a function to train Ridge regression and compute RMSE
def evaluate_ridge_model(r):
    model = Ridge(alpha=r)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return round(rmse, 2)


In [40]:
# List of r values to try
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_results = {}

# Evaluate RMSE for each value of r
for r in r_values:
    rmse = evaluate_ridge_model(r)
    rmse_results[r] = rmse
    print(f"RMSE with r={r}: {rmse}")

# Find the best r with the smallest RMSE
best_r = min(rmse_results, key=rmse_results.get)
best_rmse = rmse_results[best_r]




RMSE with r=0: 675.08
RMSE with r=0.01: 675.08
RMSE with r=0.1: 675.08
RMSE with r=1: 675.08
RMSE with r=5: 675.08
RMSE with r=10: 675.08
RMSE with r=100: 675.01


In [41]:
print(f"The best r is {best_r} with RMSE: {best_rmse}")

The best r is 100 with RMSE: 675.01


### Question 5 

* We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
* Try different seed values: `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]`.
* For each seed, do the train/validation/test split with 60%/20%/20% distribution.
* Fill the missing values with 0 and train a model without regularization.
* For each seed, evaluate the model on the validation dataset and collect the RMSE scores. 
* What's the standard deviation of all the scores? To compute the standard deviation, use `np.std`.
* Round the result to 3 decimal digits (`round(std, 3)`)

What's the value of std?

- 19.176
- 29.176
- 39.176
- 49.176

In [42]:
def get_rmse_for_seed(seed):
    # Shuffle and split the data with the given seed
    train_df, temp_df = train_test_split(df, train_size=0.6, random_state=seed)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)
    
    # Fill missing values with 0
    train_df_zero = train_df.fillna(0)
    val_df_zero = val_df.fillna(0)
    
    # Prepare features and target
    X_train = train_df_zero[['ram', 'storage', 'screen']]
    y_train = train_df_zero['final_price']
    
    X_val = val_df_zero[['ram', 'storage', 'screen']]
    y_val = val_df_zero['final_price']
    
    # Train linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predict and calculate RMSE
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    return round(rmse, 2)


In [43]:
# List of seeds
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores = []

# Collect RMSE for each seed
for seed in seeds:
    rmse = get_rmse_for_seed(seed)
    rmse_scores.append(rmse)
    print(f"RMSE for seed {seed}: {rmse}")

# Calculate the standard deviation of RMSE scores
std = np.std(rmse_scores)
std_rounded = round(std, 3)

print(f"Standard deviation of RMSE scores: {std_rounded}")


RMSE for seed 0: 594.97
RMSE for seed 1: 566.04
RMSE for seed 2: 634.21
RMSE for seed 3: 571.4
RMSE for seed 4: 618.23
RMSE for seed 5: 534.85
RMSE for seed 6: 604.49
RMSE for seed 7: 666.14
RMSE for seed 8: 586.24
RMSE for seed 9: 622.51
Standard deviation of RMSE scores: 35.825


### Question 6

* Split the dataset like previously, use seed 9.
* Combine train and validation datasets.
* Fill the missing values with 0 and train a model with `r=0.001`. 
* What's the RMSE on the test dataset?

Options:

- 598.60
- 608.60
- 618.60
- 628.60

In [44]:
# Split the data using seed 9
train_df, temp_df = train_test_split(df, train_size=0.6, random_state=9)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=9)

# Combine train and validation sets
combined_df = pd.concat([train_df, val_df])
combined_df

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
105,ASUS VivoBook F1605PA-MB143 Intel Core i7-1137...,New,Asus,VivoBook,Intel Core i7,8,512,SSD,,16.0,No,709.00
1291,Acer Extensa 15 EX215-53G-56MT Intel Core i5-1...,New,Acer,Extensa,Intel Core i5,8,256,SSD,MX 330,15.6,No,689.08
490,Gigabyte AERO 14 OLED BMF-72ESBB4SH Intel Core...,New,Gigabyte,Aero,Intel Core i7,16,1000,SSD,RTX 4050,14.0,No,1969.01
279,Lenovo Chromebook IdeaPad 3 CB 14IGL05 Intel C...,New,Lenovo,IdeaPad,Intel Celeron,8,64,eMMC,,14.0,No,391.00
211,ASUS Chromebook Vibe CX34 Flip CX3401FBA-N9003...,New,Asus,Chromebook,Intel Core i5,8,256,SSD,,14.0,Yes,899.00
...,...,...,...,...,...,...,...,...,...,...,...,...
2049,MSI GS66 Stealth 10UE-260ES Intel Core i7-1087...,Refurbished,MSI,Stealth,Intel Core i7,32,1000,SSD,RTX 3060,15.6,No,1659.24
881,HP ProBook 640 G8 Intel Core i5-1135G7/8GB/256...,New,HP,ProBook,Intel Core i5,8,256,SSD,,14.0,No,764.34
476,ASUS ROG Strix Scar 16 2023 G634JZ-N4004 Intel...,New,Asus,ROG,Intel Core i9,32,1000,SSD,RTX 4080,16.0,No,3099.00
712,Denver Electronics NBD-14105SSDES Intel Celero...,New,Denver,Electronics,Intel Celeron,4,256,SSD,,14.0,No,329.95


In [45]:
# Fill missing values with 0
combined_df_zero = combined_df.fillna(0)
test_df_zero = test_df.fillna(0)

# Prepare features and target for combined (train + val) and test sets
X_train_combined = combined_df_zero[['ram', 'storage', 'screen']]
y_train_combined = combined_df_zero['final_price']

X_test = test_df_zero[['ram', 'storage', 'screen']]
y_test = test_df_zero['final_price']

# Train Ridge regression model with r = 0.001
model = Ridge(alpha=0.001)
model.fit(X_train_combined, y_train_combined)

# Predict on the test set
y_pred = model.predict(X_test)

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse_rounded = round(rmse, 2)

print(f"RMSE on the test set: {rmse_rounded}")


RMSE on the test set: 602.43
