In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

import time

%matplotlib inline

In [2]:
df = pd.read_csv("../data/cleaned_data.csv")

In [3]:
columns_to_drop = ["title", "link", "description", "weight", "hdd_gb", "ssd_gb", "graphic_card"]

In [4]:
df.drop(columns=columns_to_drop, inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   price               302 non-null    float64
 1   brand               302 non-null    object 
 2   resolution          302 non-null    object 
 3   screen_size         302 non-null    float64
 4   level_1_model       302 non-null    object 
 5   processor           302 non-null    object 
 6   processor_speed     302 non-null    float64
 7   ram                 302 non-null    float64
 8   warranty            302 non-null    int64  
 9   hdd_exist           302 non-null    int64  
 10  ssd_exist           302 non-null    int64  
 11  graphic_card_exist  302 non-null    int64  
 12  storage             302 non-null    float64
dtypes: float64(5), int64(4), object(4)
memory usage: 30.8+ KB


In [6]:
encoded_df = df.copy()
encoded_df.head(10)

Unnamed: 0,price,brand,resolution,screen_size,level_1_model,processor,processor_speed,ram,warranty,hdd_exist,ssd_exist,graphic_card_exist,storage
0,1230.0,apple,wqxga,13.3,macbook pro,intel core i5-4278u,2.6,16.0,1,0,1,0,256.0
1,630.0,dell,fhd,14.0,latitude,intel core i5-6300u,2.4,8.0,1,0,1,0,256.0
2,370.0,dell,hd+,14.0,latitude,intel core i5-4300u,1.9,8.0,1,1,0,0,500.0
3,310.0,dell,sxga,14.0,inpiron,intel core i5-450m,2.4,4.0,1,1,0,0,500.0
4,395.0,dell,sxga,14.0,inpiron,intel core i5-4210u,1.7,4.0,1,1,0,0,500.0
5,315.0,dell,hd,14.0,vostro,intel core i5-3230m,2.6,4.0,1,1,0,0,500.0
6,310.0,lenovo,hd,14.0,thinkpad,intel core i5-3320m,2.6,4.0,1,0,0,0,0.0
7,420.0,dell,hd,13.3,latitude,intel core i5-5200u,2.2,8.0,0,0,1,1,500.0
8,170.0,apple,hd,13.0,macbook air,intel core i5-5350u,1.8,8.0,0,0,0,0,0.0
9,1472.12,dell,hd,14.0,latitude,intel core i7-4650u,2.0,16.0,0,0,1,0,1024.0


## One-Hot Encoding

In [7]:
columns_to_encode = ["brand", "level_1_model"]
for column_name in columns_to_encode:
    dummies = pd.get_dummies(encoded_df[column_name]).rename(columns=lambda x: column_name + "_" + str(x))
    encoded_df = pd.concat([encoded_df, dummies], axis=1)
    encoded_df = encoded_df.drop(column_name, 1)
encoded_df.head(10)

Unnamed: 0,price,resolution,screen_size,processor,processor_speed,ram,warranty,hdd_exist,ssd_exist,graphic_card_exist,...,level_1_model_travelmate,level_1_model_travelmate p248-m,level_1_model_vivobook,level_1_model_vostro,level_1_model_x201,level_1_model_xg15-v2,level_1_model_xps,level_1_model_yoga,level_1_model_zbook,level_1_model_zenbook
0,1230.0,wqxga,13.3,intel core i5-4278u,2.6,16.0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,630.0,fhd,14.0,intel core i5-6300u,2.4,8.0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,370.0,hd+,14.0,intel core i5-4300u,1.9,8.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,310.0,sxga,14.0,intel core i5-450m,2.4,4.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,395.0,sxga,14.0,intel core i5-4210u,1.7,4.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,315.0,hd,14.0,intel core i5-3230m,2.6,4.0,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
6,310.0,hd,14.0,intel core i5-3320m,2.6,4.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,420.0,hd,13.3,intel core i5-5200u,2.2,8.0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
8,170.0,hd,13.0,intel core i5-5350u,1.8,8.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1472.12,hd,14.0,intel core i7-4650u,2.0,16.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Label Encoding

In [None]:
encoded_df["resolution"].unique()

In [None]:
["hd"] = 4
["wqxga"] = 
["fhd"] = 
["hd+"] = 5
["sxga"] = 3
["wxga+"] = 2
["qhd+"] = 
["xga"] = 1
["vga"] = 0

#wqxga, Widescreen Quad Extended Graphics Array (WQXGA): 2560 x 1600
#fhd, Full High Definition (1080p): 1920 x 1080 
#hd+,High Definition Plus (900p): 1600 x 900
#sxga,Super Extended Graphics Array (SXGA): 1280 x 1024 
#wxga+,Widescreen Extended Graphics Array Plus (WXGA+): 1440 x 900
#qhd+, : 3200 x 1800
#xga: 1024 x 768
#vga: 800 x 600

In [8]:
columns_with_cont_values = ["screen_size", "processor_speed", "ram", "storage"]

## Min-Max Normalization

In [9]:
min_max_df = encoded_df.copy()
min_max_df[columns_with_cont_values] = \
    (min_max_df[columns_with_cont_values] - min_max_df[columns_with_cont_values].min()) / \
    (min_max_df[columns_with_cont_values].max() - min_max_df[columns_with_cont_values].min())
min_max_df.head(10)

Unnamed: 0,price,resolution,screen_size,processor,processor_speed,ram,warranty,hdd_exist,ssd_exist,graphic_card_exist,...,level_1_model_travelmate,level_1_model_travelmate p248-m,level_1_model_vivobook,level_1_model_vostro,level_1_model_x201,level_1_model_xg15-v2,level_1_model_xps,level_1_model_yoga,level_1_model_zbook,level_1_model_zenbook
0,1230.0,wqxga,0.402985,intel core i5-4278u,0.538462,0.249634,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,630.0,fhd,0.507463,intel core i5-6300u,0.494505,0.124573,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,370.0,hd+,0.507463,intel core i5-4300u,0.384615,0.124573,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,310.0,sxga,0.507463,intel core i5-450m,0.494505,0.062042,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,395.0,sxga,0.507463,intel core i5-4210u,0.340659,0.062042,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,315.0,hd,0.507463,intel core i5-3230m,0.538462,0.062042,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
6,310.0,hd,0.507463,intel core i5-3320m,0.538462,0.062042,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,420.0,hd,0.402985,intel core i5-5200u,0.450549,0.124573,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
8,170.0,hd,0.358209,intel core i5-5350u,0.362637,0.124573,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1472.12,hd,0.507463,intel core i7-4650u,0.406593,0.249634,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Z-Scoring

In [10]:
z_scored_df = encoded_df.copy()
z_scored_df[columns_with_cont_values] = \
    (z_scored_df[columns_with_cont_values] - z_scored_df[columns_with_cont_values].mean()) / z_scored_df[columns_with_cont_values].std()
z_scored_df.head(10)

Unnamed: 0,price,resolution,screen_size,processor,processor_speed,ram,warranty,hdd_exist,ssd_exist,graphic_card_exist,...,level_1_model_travelmate,level_1_model_travelmate p248-m,level_1_model_vivobook,level_1_model_vostro,level_1_model_x201,level_1_model_xg15-v2,level_1_model_xps,level_1_model_yoga,level_1_model_zbook,level_1_model_zenbook
0,1230.0,wqxga,-0.509677,intel core i5-4278u,0.606441,1.102829,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,630.0,fhd,0.092928,intel core i5-6300u,0.274446,-0.169528,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,370.0,hd+,0.092928,intel core i5-4300u,-0.555543,-0.169528,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,310.0,sxga,0.092928,intel core i5-450m,0.274446,-0.805707,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,395.0,sxga,0.092928,intel core i5-4210u,-0.887538,-0.805707,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,315.0,hd,0.092928,intel core i5-3230m,0.606441,-0.805707,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
6,310.0,hd,0.092928,intel core i5-3320m,0.606441,-0.805707,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,420.0,hd,-0.509677,intel core i5-5200u,-0.05755,-0.169528,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
8,170.0,hd,-0.767936,intel core i5-5350u,-0.72154,-0.169528,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1472.12,hd,0.092928,intel core i7-4650u,-0.389545,1.102829,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Data Analysis

### Models

In [11]:
# Polynomial Regression
def poly_reg(data, deg, cv):
    start = time.time()
    
    data_input = data.drop("price" ,axis=1)
    data_output = data["price"]
    x_train, x_test, y_train, y_test = train_test_split(data_input, data_output, test_size=0.33, random_state=42)

    #model_poly_reg = PolynomialFeatures(degree=deg)
    model_poly_reg = Pipeline([('polynomial',PolynomialFeatures(degree=deg)),('modal',LinearRegression())])
    model_poly_reg.fit(x_train, y_train)
    y_pred_l = model_poly_reg.predict(x_test)
    
    mae_l = mean_absolute_error(y_test, y_pred_l)
    print("\nMAE for Poly Regression degree %d is: %.0f" % (deg, mae_l))
    #print("For the train set: %.0f" %mae_l_train)

    cdf_l = r2_score(y_test, y_pred_l)
    print('R-squared for Poly Regression degree %d: %.2f' % (deg, cdf_l))
    
    if (cv == 1):
        cv = ShuffleSplit(n_splits=3, test_size=0.33, random_state=42)
        # model_poly_reg_cv = PolynomialFeatures(degree=deg)
        model_poly_reg_cv = Pipeline([('polynomial',PolynomialFeatures(degree=deg)),('modal',LinearRegression())])
        scores_lr = cross_val_score(model_poly_reg_cv, data_input, data_output, cv=cv, scoring='neg_mean_absolute_error')
        scores_lr = - scores_lr
        print(scores_lr)
        print("CV MAE degree %d : %0.2f (+/- %0.2f)" % (deg, scores_lr.mean(), scores_lr.std() * 2))
    
    
    print('PR Time = %.2f'%(time.time() - start))

In [12]:
# Random Forest
def random_f(data, version, cv):
    start = time.time()
    
    data_input = data.drop("price", axis=1)
    data_output = data["price"]
    x_train, x_test, y_train, y_test = train_test_split(data_input, data_output, test_size=0.33, random_state=42)

    model_Forest = RandomForestRegressor()
    model_Forest.fit(x_train, y_train)
    y_pred_f = model_Forest.predict(x_test)
    #y_pred_f_train = model_Forest.predict(x_train)
    
    mae_f = mean_absolute_error(y_test, y_pred_f)
    #mae_f_train = mean_absolute_error(y_train, y_pred_f_train)
    print("\nMAE for Random Forest is: %.0f"%mae_f)
    #print("For the train set: %.0f" %mae_f_train)
    
    cdf_f = r2_score(y_test, y_pred_f)
    print('R-squared for Random Forest: %.2f' % cdf_f)

    if (version == 1):
        importances = model_Forest.feature_importances_
        indices = np.argsort(importances)[::-1]
        columns = np.array(list(data_input))
        return importances
        
        # Print the feature ranking
        print("\nFeature ranking:")
        
        for f in range(x_train.shape[1]):
            print("%d. %s (%f)" % (f + 1, columns[indices[f]], importances[indices[f]]))
        
    if (cv == 1):
        cv = ShuffleSplit(n_splits=3, test_size=0.33, random_state=42)
        scores_rf = cross_val_score(model_Forest, data_input, data_output, cv=cv, scoring='neg_mean_absolute_error')
        scores_rf = - scores_rf
        print(scores_rf)
        print("CV MAE: %0.2f (+/- %0.2f)" % (scores_rf.mean(), scores_rf.std() * 2))
    
    print('RF Time = %.2f'%(time.time() - start))

### Analysis

In [13]:
min_max_df.drop(columns=["processor", "resolution"], inplace=True)
z_scored_df.drop(columns=["processor", "resolution"], inplace=True)

In [14]:
poly_reg(data=min_max_df, deg=1, cv=1)
poly_reg(data=min_max_df, deg=2, cv=1)
poly_reg(data=min_max_df, deg=3, cv=1)
random_f(data=min_max_df, version=0, cv=1)


MAE for Poly Regression degree 1 is: 39695977033959
R-squared for Poly Regression degree 1: -61789877709215981633536.00
[3.96959770e+13 2.39797276e+12 2.45979382e+14]
CV MAE degree 1 : 96024443846544.12 (+/- 214243772880489.81)
PR Time = 0.15

MAE for Poly Regression degree 2 is: 32600314829431
R-squared for Poly Regression degree 2: -23896006805820811509760.00
[3.26003148e+13 7.58627817e+12 5.58281821e+12]
CV MAE degree 2 : 15256470403107.15 (+/- 24582387601754.86)
PR Time = 0.60

MAE for Poly Regression degree 3 is: 664706180878742
R-squared for Poly Regression degree 3: -9472483239265029429657600.00
[6.64706181e+14 3.65366827e+14 2.82215480e+14]
CV MAE degree 3 : 437429496208702.06 (+/- 328509990385614.06)
PR Time = 12.30

MAE for Random Forest is: 232
R-squared for Random Forest: 0.61
[243.03591431 296.0250281  229.16382568]
CV MAE: 256.07 (+/- 57.62)
RF Time = 0.76


In [15]:
poly_reg(data=min_max_df, deg=1, cv=0)
poly_reg(data=min_max_df, deg=2, cv=0)
poly_reg(data=min_max_df, deg=3, cv=0)
random_f(data=min_max_df, version=0, cv=0)


MAE for Poly Regression degree 1 is: 39695977033959
R-squared for Poly Regression degree 1: -61789877709215981633536.00
PR Time = 0.03

MAE for Poly Regression degree 2 is: 32600314829431
R-squared for Poly Regression degree 2: -23896006805820811509760.00
PR Time = 0.14

MAE for Poly Regression degree 3 is: 664706180878742
R-squared for Poly Regression degree 3: -9472483239265029429657600.00
PR Time = 3.02

MAE for Random Forest is: 240
R-squared for Random Forest: 0.57
RF Time = 0.24
