In [69]:
import pandas as pd
import numpy as np

In [70]:
df = pd.read_csv('../data/laptops.csv')
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


### Preparing the dataset   
normalize the names of the columns:

In [71]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

Now, instead of 'Final Price', we have 'final_price'.   
   
Next, use only the following columns:   
   
'ram',   
'storage',   
'screen',   
'final_price'   


In [72]:
new_df = df[['ram', 'storage', 'screen', 'final_price']]

### EDA   
Look at the final_price variable. Does it have a long tail?   

In [73]:
new_df.head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


### Question 1   
There's one column with missing values. What is it?   
   
'ram'   
'storage'   
'screen'   
'final_price'   

In [74]:
new_df.isna().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

### Question 2   
What's the median (50% percentile) for variable 'ram'?   
   
8   
16   
24   
32   

In [75]:
median_ram = new_df['ram'].median()
quantile_ram = new_df['ram'].quantile(0.5)
print(median_ram)
print(quantile_ram)

16.0
16.0


### Prepare and split the dataset

* Shuffle the dataset (the filtered one you created above), use seed `42`.
* Split your data in train/val/test sets, with 60%/20%/20% distribution.

Use the same code as in the lectures

In [76]:
# df_shuffled = new_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [82]:
np.random.seed(2)

n = len(new_df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = new_df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [83]:
n

2160

In [84]:
n_train, n_val, n_test

(1296, 432, 432)

In [87]:
len(df_train), len(df_val), len(df_test)

(1296, 432, 432)

### Question 3

* We need to deal with missing values for the column from Q1.
* We have two options: fill it with 0 or with the mean of this variable.
* Try both options. For each, train a linear regression model without regularization using the code from the lessons.
* For computing the mean, use the training only!
* Use the validation dataset to evaluate the models and compare the RMSE of each option.
* Round the RMSE scores to 2 decimal digits using `round(score, 2)`
* Which option gives better RMSE?

Options:

- With 0
- With mean
- Both are equally good

In [88]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ram          2160 non-null   int64  
 1   storage      2160 non-null   int64  
 2   screen       2156 non-null   float64
 3   final_price  2160 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 67.6 KB


In [89]:
new_df.iloc[[0, 10, 3, 5]]

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
10,8,256,15.6,349.0
3,16,1000,15.6,1199.0
5,32,1000,17.3,1699.0


In [90]:
zro_df = new_df
mean_df = new_df

In [91]:
zro_df.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [92]:
mean_df.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [93]:
#fill null value with zero value
zro_df['screen'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  zro_df['screen'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zro_df['screen'].fillna(0, inplace=True)


In [94]:
#fill null value with mean value
screen_mean = mean_df['screen'].mean()
mean_df['screen'].fillna(screen_mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  mean_df['screen'].fillna(screen_mean, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mean_df['screen'].fillna(screen_mean, inplace=True)


In [118]:
def linear_regression(xi, w0, w):
    n = len(xi)
    pred = w0
    for j in range(n):
        pred = pred + w[j] * xi[j]
    return pred

In [119]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [127]:
#zero

In [120]:
np.random.seed(2)

n = len(zro_df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = new_df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [130]:
# w0 = 10.0 
# w = [2.0, 3.0, 4.0]

w0 = 7.17
w = [0.01, 0.04, 0.002]

X_train = df_train[['ram', 'storage', 'screen']]
y_train = df_train['final_price']

X_val = df_val[['ram', 'storage', 'screen']]
y_val = df_val['final_price']

y_pred = [linear_regression(xi, w0, w) for xi in X_val.values]
score = rmse(y_val, y_pred)
print("RMSE (fill with 0):", round(score, 2))

RMSE (fill with 0): 1552.23


In [126]:
#mean

In [128]:
np.random.seed(2)

n = len(mean_df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = new_df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [131]:
# w0 = 10.0 
# w = [2.0, 3.0, 4.0]

w0 = 7.17
w = [0.01, 0.04, 0.002]

X_train = df_train[['ram', 'storage', 'screen']]
y_train = df_train['final_price']

X_val = df_val[['ram', 'storage', 'screen']]
y_val = df_val['final_price']

y_pred = [linear_regression(xi, w0, w) for xi in X_val.values]
score = rmse(y_val, y_pred)
print("RMSE (fill with mean):", round(score, 2))

RMSE (fill with mean): 1552.23


### Question 4

* Now let's train a regularized linear regression.
* For this question, fill the NAs with 0. 
* Try different values of `r` from this list: `[0, 0.01, 0.1, 1, 5, 10, 100]`.
* Use RMSE to evaluate the model on the validation dataset.
* Round the RMSE scores to 2 decimal digits.
* Which `r` gives the best RMSE?

If there are multiple options, select the smallest `r`.

Options:

- 0
- 0.01
- 1
- 10
- 100

In [134]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]


In [135]:
df_train['screen'].fillna(0, inplace=True)
df_val['screen'].fillna(0, inplace=True)
df_test['screen'].fillna(0, inplace=True)


rs = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = []

for r in rs:
    X_train = df_train[['ram', 'storage', 'screen']]
    y_train = df_train['final_price']

    X_val = df_val[['ram', 'storage', 'screen']]
    y_val = df_val['final_price']

    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    y_pred = [w0 + np.dot(xi, w) for xi in X_val.values]
    score = rmse(y_val, y_pred)
    rmse_scores.append(round(score, 2))

for i, r in enumerate(rs):
    print(f"RMSE (r={r}): {rmse_scores[i]}")

best_rmse = min(rmse_scores)
best_r = rs[rmse_scores.index(best_rmse)]

print(f"Best RMSE: {best_rmse} (r={best_r})")

RMSE (r=0): 576.1
RMSE (r=0.01): 576.1
RMSE (r=0.1): 576.12
RMSE (r=1): 576.3
RMSE (r=5): 577.22
RMSE (r=10): 578.1
RMSE (r=100): 580.89
Best RMSE: 576.1 (r=0)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['screen'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_val['screen'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always be

### Question 5 

* We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
* Try different seed values: `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]`.
* For each seed, do the train/validation/test split with 60%/20%/20% distribution.
* Fill the missing values with 0 and train a model without regularization.
* For each seed, evaluate the model on the validation dataset and collect the RMSE scores. 
* What's the standard deviation of all the scores? To compute the standard deviation, use `np.std`.
* Round the result to 3 decimal digits (`round(std, 3)`)

What's the value of std?

- 19.176
- 29.176
- 39.176
- 49.176

> Note: Standard deviation shows how different the values are.
> If it's low, then all values are approximately the same.
> If it's high, the values are different. 
> If standard deviation of scores is low, then our model is *stable*.

In [133]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores = []

for seed in seeds:
    np.random.seed(seed)
    n = len(new_df)
    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = new_df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    # Fill missing values with 0
    df_train['screen'].fillna(0, inplace=True)
    df_val['screen'].fillna(0, inplace=True)
    df_test['screen'].fillna(0, inplace=True)

    # Train a linear regression model without regularization
    X_train = df_train[['ram', 'storage', 'screen']]
    y_train = df_train['final_price']

    X_val = df_val[['ram', 'storage', 'screen']]
    y_val = df_val['final_price']

    w0 = 10.0  # example value for w0
    w = [2.0, 3.0, 4.0]  # example values for w

    y_pred = [linear_regression(xi, w0, w) for xi in X_val.values]
    score = rmse(y_val, y_pred)
    rmse_scores.append(score)

std = np.std(rmse_scores)
print("Standard deviation of RMSE scores:", round(std, 3))

Standard deviation of RMSE scores: 44.438


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['screen'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_val['screen'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always be

### Question 6

* Split the dataset like previously, use seed 9.
* Combine train and validation datasets.
* Fill the missing values with 0 and train a model with `r=0.001`. 
* What's the RMSE on the test dataset?

Options:

- 598.60
- 608.60
- 618.60
- 628.60

In [137]:
np.random.seed(9)

n = len(new_df)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = new_df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

# Combine train and validation datasets
df_combined = pd.concat([df_train, df_val])

# Fill missing values with 0
df_combined['screen'].fillna(0, inplace=True)
df_test['screen'].fillna(0, inplace=True)

# Train a regularized linear regression model with r=0.001
X_combined = df_combined[['ram', 'storage', 'screen']]
y_combined = df_combined['final_price']

w0, w = train_linear_regression_reg(X_combined, y_combined, r=0.001)

# Make predictions on the test dataset
X_test = df_test[['ram', 'storage', 'screen']]
y_test = df_test['final_price']

y_pred = [w0 + np.dot(xi, w) for xi in X_test.values]
score = rmse(y_test, y_pred)

print("RMSE on the test dataset:", round(score, 2))

RMSE on the test dataset: 608.61


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_combined['screen'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['screen'].fillna(0, inplace=True)
