In [21]:
import pandas as pd 
import numpy as np
import seaborn as sns
%matplotlib inline

In [22]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv')
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


## Prepare the dataset

In [23]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [24]:
df2 = df[['ram', 'storage', 'screen', 'final_price']]

## Q1 : Missing values

In [25]:
df2.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

## Q2 : Median for RAM

In [26]:
df2['ram'].median()

np.float64(16.0)

## Q3 : Filling NAs

### Split the data into train, test, and validation

In [27]:
np.random.seed(42)

n = len(df2)

n_train = int(0.6*n)
n_test = int(0.2*n)
n_val = n-(n_train+n_test) 

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df2.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [28]:
y_train_orig = df_train.final_price.values
y_val_orig = df_val.final_price.values
y_test_orig = df_test.final_price.values

y_train = np.log1p(df_train.final_price.values)
y_val = np.log1p(df_val.final_price.values)
y_test = np.log1p(df_test.final_price.values)

del df_train['final_price']
del df_val['final_price']
del df_test['final_price']

In [29]:
# function to train the linear regression model 
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [30]:
features_used = ['ram', 'storage', 'screen']

In [31]:
def prepare_X(df, zero_fill:bool):
    df_num = df[features_used]
    if zero_fill:
        df_num = df_num.fillna(0)
        X = df_num.values
    else: 
        mean_fill = df_train.screen.mean()
        df_num['screen'] = df_num['screen'].fillna(mean_fill)
        X = df_num.values
    return X

In [32]:
def rmse(y1, y2):
    error = y2 - y1
    mse = (error ** 2).mean()
    return np.sqrt(mse)

### Method 1

Fill the missing values with 0

In [33]:
X_train1 = prepare_X(df_train, True)
w_01, w1 = train_linear_regression(X_train1, y_train)
X_val1 = prepare_X(df_val, True)
y_pred1 = w_01 + X_val1.dot(w1)

In [34]:
rmse1 = round(rmse(y_val, y_pred1), 2)
rmse1

np.float64(0.43)

### Method 2 

FIll the missing value with mean 

In [35]:
X_train2 = prepare_X(df_train, False)
w_02, w2 = train_linear_regression(X_train2, y_train)
X_val2 = prepare_X(df_val, False)
y_pred2 = w_02 + X_val2.dot(w2)

In [36]:
rmse2 = round(rmse(y_val, y_pred2), 2)
rmse1

np.float64(0.43)

## Q4 : Regularization 

In [37]:
def train_linear_regression_reg(X, y, r):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [38]:
# fill the missing values with 0
X_train4 = prepare_X(df_train, True)
X_val4 = prepare_X(df_val, True)

In [44]:
r_value = []
rmse_value = []
for r_val in [0, 0.01, 0.1, 1, 5, 10, 100]:
    r_value.append(r_val)
    w_04, w4 = train_linear_regression_reg(X_train4, y_train, r=r_val)
    y_pred4 = w_04 + X_val4.dot(w4)
    rmse_val = np.round(rmse(y_val, y_pred4), 2)
    rmse_value.append(rmse_val)
q4_df = pd.DataFrame({'Value_r' : r_value, 'RMSE' : rmse_value})
q4_df

Unnamed: 0,Value_r,RMSE
0,0.0,0.43
1,0.01,0.43
2,0.1,0.43
3,1.0,0.43
4,5.0,0.46
5,10.0,0.51
6,100.0,0.67


## Q5 : RMSE spread

In [50]:
q5_rmse = []

for s in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    np.random.seed(s)
    np.random.shuffle(idx)
    df_q5 = df2.iloc[idx]
    df_trainq5 = df_q5.iloc[:n_train].copy()
    df_valq5 = df_q5.iloc[n_train:n_train+n_val].copy()
    df_testq5 = df_q5.iloc[n_train+n_val:].copy()

    y_train_q5 = df_trainq5.final_price.values
    y_val_q5 = df_valq5.final_price.values
    y_test_q5 = df_testq5.final_price.values

    y_trainq5 = np.log1p(df_trainq5.final_price.values)
    y_valq5 = np.log1p(df_valq5.final_price.values)
    y_testq5 = np.log1p(df_testq5.final_price.values)

    del df_trainq5['final_price']
    del df_valq5['final_price']
    del df_testq5['final_price']

    X_trainq5 = prepare_X(df_trainq5, True)
    w_0q5, wq5 = train_linear_regression(X_trainq5, y_trainq5)
    X_valq5 = prepare_X(df_valq5, True)
    y_predq5 = w_0q5 + X_valq5.dot(wq5)
    rmseq5 = rmse(y_val_q5, y_predq5)
    q5_rmse.append(rmseq5)

print(np.round(np.std(q5_rmse), 3))

35.684


## Q6 : RMSE on Test

In [55]:
np.random.seed(9)
np.random.shuffle(idx)
df_q6 = df[['ram', 'storage', 'screen', 'final_price']].iloc[idx]
df_trainq6 = df_q6.iloc[:n_train].copy()
df_valq6 = df_q6.iloc[n_train:n_train+n_val].copy()
df_testq6 = df_q6.iloc[n_train+n_val:].copy()
df_comb = pd.concat([df_trainq6, df_valq6])

y_comb = df_comb.final_price.values
y_test_q6 = df_testq6.final_price.values

y_comb = np.log1p(df_comb.final_price.values)
y_testq6 = np.log1p(df_testq6.final_price.values)

del df_comb['final_price']
del df_testq6['final_price']

X_comb = prepare_X(df_comb, True)
w_0q6, wq6 = train_linear_regression_reg(X_comb, y_comb, r=0.001)
y_predq6 = w_0q6 + df_testq6.dot(wq6)
rmseq6 = np.round(rmse(y_predq6, y_testq6))
print(rmseq6)


0.0
