In [1]:
import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

import time

In [2]:
df = pd.read_csv("../data/cleaned_data.csv")

In [3]:
columns_to_drop = ["title", "link", "description", "weight", "processor", "hdd_gb", "ssd_gb", "graphic_card"]
df.drop(columns=columns_to_drop, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   price               302 non-null    float64
 1   brand               302 non-null    object 
 2   resolution          302 non-null    object 
 3   screen_size         302 non-null    float64
 4   level_1_model       302 non-null    object 
 5   processor_speed     302 non-null    float64
 6   ram                 302 non-null    float64
 7   warranty            302 non-null    int64  
 8   hdd_exist           302 non-null    int64  
 9   ssd_exist           302 non-null    int64  
 10  graphic_card_exist  302 non-null    int64  
 11  storage             302 non-null    float64
dtypes: float64(5), int64(4), object(3)
memory usage: 28.4+ KB


In [4]:
df.head(10)

Unnamed: 0,price,brand,resolution,screen_size,level_1_model,processor_speed,ram,warranty,hdd_exist,ssd_exist,graphic_card_exist,storage
0,1230.0,apple,wqxga,13.3,macbook pro,2.6,16.0,1,0,1,0,256.0
1,630.0,dell,fhd,14.0,latitude,2.4,8.0,1,0,1,0,256.0
2,370.0,dell,hd+,14.0,latitude,1.9,8.0,1,1,0,0,500.0
3,310.0,dell,sxga,14.0,inpiron,2.4,4.0,1,1,0,0,500.0
4,395.0,dell,sxga,14.0,inpiron,1.7,4.0,1,1,0,0,500.0
5,315.0,dell,hd,14.0,vostro,2.6,4.0,1,1,0,0,500.0
6,310.0,lenovo,hd,14.0,thinkpad,2.6,4.0,1,0,0,0,0.0
7,420.0,dell,hd,13.3,latitude,2.2,8.0,0,0,1,1,500.0
8,170.0,apple,hd,13.0,macbook air,1.8,8.0,0,0,0,0,0.0
9,1472.12,dell,hd,14.0,latitude,2.0,16.0,0,0,1,0,1024.0


# Encoding

In [5]:
encoded_df = df.copy()

## One-Hot Encoding

In [6]:
columns_to_encode = ["brand", "level_1_model"]
for column_name in columns_to_encode:
    dummies = pd.get_dummies(encoded_df[column_name]).rename(columns=lambda x: column_name + "_" + str(x))
    encoded_df = pd.concat([encoded_df, dummies], axis=1)
    encoded_df = encoded_df.drop(column_name, 1)

## Label Encoding

In [7]:
label_encoding = {"vga": 1, "xga": 2, "hd": 3, "wxga+": 4, "sxga": 5, "hd+": 6, "fhd": 7, "fhd+": 8, "qhd": 9, 
                 "wqhd": 10, "wqxga": 11, "qwxga+": 12, "qhd+": 13, "uhd": 14}
encoded_df["resolution"].replace(label_encoding, inplace=True)

In [8]:
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 81 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   price                            302 non-null    float64
 1   resolution                       302 non-null    int64  
 2   screen_size                      302 non-null    float64
 3   processor_speed                  302 non-null    float64
 4   ram                              302 non-null    float64
 5   warranty                         302 non-null    int64  
 6   hdd_exist                        302 non-null    int64  
 7   ssd_exist                        302 non-null    int64  
 8   graphic_card_exist               302 non-null    int64  
 9   storage                          302 non-null    float64
 10  brand_acer                       302 non-null    uint8  
 11  brand_aftershock                 302 non-null    uint8  
 12  brand_apple           

In [9]:
encoded_df.head(10)

Unnamed: 0,price,resolution,screen_size,processor_speed,ram,warranty,hdd_exist,ssd_exist,graphic_card_exist,storage,...,level_1_model_travelmate,level_1_model_travelmate p248-m,level_1_model_vivobook,level_1_model_vostro,level_1_model_x201,level_1_model_xg15-v2,level_1_model_xps,level_1_model_yoga,level_1_model_zbook,level_1_model_zenbook
0,1230.0,11,13.3,2.6,16.0,1,0,1,0,256.0,...,0,0,0,0,0,0,0,0,0,0
1,630.0,7,14.0,2.4,8.0,1,0,1,0,256.0,...,0,0,0,0,0,0,0,0,0,0
2,370.0,6,14.0,1.9,8.0,1,1,0,0,500.0,...,0,0,0,0,0,0,0,0,0,0
3,310.0,5,14.0,2.4,4.0,1,1,0,0,500.0,...,0,0,0,0,0,0,0,0,0,0
4,395.0,5,14.0,1.7,4.0,1,1,0,0,500.0,...,0,0,0,0,0,0,0,0,0,0
5,315.0,3,14.0,2.6,4.0,1,1,0,0,500.0,...,0,0,0,1,0,0,0,0,0,0
6,310.0,3,14.0,2.6,4.0,1,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,420.0,3,13.3,2.2,8.0,0,0,1,1,500.0,...,0,0,0,0,0,0,0,0,0,0
8,170.0,3,13.0,1.8,8.0,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,1472.12,3,14.0,2.0,16.0,0,0,1,0,1024.0,...,0,0,0,0,0,0,0,0,0,0


# Feature Scaling

In [10]:
columns_with_cont_values = ["screen_size", "processor_speed", "ram", "storage"]

## Min-Max Normalization

In [11]:
min_max_df = encoded_df.copy()
min_max_df[columns_with_cont_values] = \
    (min_max_df[columns_with_cont_values] - min_max_df[columns_with_cont_values].min()) / \
    (min_max_df[columns_with_cont_values].max() - min_max_df[columns_with_cont_values].min())
min_max_df.head(10)

Unnamed: 0,price,resolution,screen_size,processor_speed,ram,warranty,hdd_exist,ssd_exist,graphic_card_exist,storage,...,level_1_model_travelmate,level_1_model_travelmate p248-m,level_1_model_vivobook,level_1_model_vostro,level_1_model_x201,level_1_model_xg15-v2,level_1_model_xps,level_1_model_yoga,level_1_model_zbook,level_1_model_zenbook
0,1230.0,11,0.402985,0.538462,0.249634,1,0,1,0,0.125,...,0,0,0,0,0,0,0,0,0,0
1,630.0,7,0.507463,0.494505,0.124573,1,0,1,0,0.125,...,0,0,0,0,0,0,0,0,0,0
2,370.0,6,0.507463,0.384615,0.124573,1,1,0,0,0.244141,...,0,0,0,0,0,0,0,0,0,0
3,310.0,5,0.507463,0.494505,0.062042,1,1,0,0,0.244141,...,0,0,0,0,0,0,0,0,0,0
4,395.0,5,0.507463,0.340659,0.062042,1,1,0,0,0.244141,...,0,0,0,0,0,0,0,0,0,0
5,315.0,3,0.507463,0.538462,0.062042,1,1,0,0,0.244141,...,0,0,0,1,0,0,0,0,0,0
6,310.0,3,0.507463,0.538462,0.062042,1,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,420.0,3,0.402985,0.450549,0.124573,0,0,1,1,0.244141,...,0,0,0,0,0,0,0,0,0,0
8,170.0,3,0.358209,0.362637,0.124573,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,1472.12,3,0.507463,0.406593,0.249634,0,0,1,0,0.5,...,0,0,0,0,0,0,0,0,0,0


## Z-Scoring

In [12]:
z_scored_df = encoded_df.copy()
z_scored_df[columns_with_cont_values] = \
    (z_scored_df[columns_with_cont_values] - z_scored_df[columns_with_cont_values].mean()) / z_scored_df[columns_with_cont_values].std()
z_scored_df.head(10)

Unnamed: 0,price,resolution,screen_size,processor_speed,ram,warranty,hdd_exist,ssd_exist,graphic_card_exist,storage,...,level_1_model_travelmate,level_1_model_travelmate p248-m,level_1_model_vivobook,level_1_model_vostro,level_1_model_x201,level_1_model_xg15-v2,level_1_model_xps,level_1_model_yoga,level_1_model_zbook,level_1_model_zenbook
0,1230.0,11,-0.509677,0.606441,1.102829,1,0,1,0,-0.381302,...,0,0,0,0,0,0,0,0,0,0
1,630.0,7,0.092928,0.274446,-0.169528,1,0,1,0,-0.381302,...,0,0,0,0,0,0,0,0,0,0
2,370.0,6,0.092928,-0.555543,-0.169528,1,1,0,0,0.347485,...,0,0,0,0,0,0,0,0,0,0
3,310.0,5,0.092928,0.274446,-0.805707,1,1,0,0,0.347485,...,0,0,0,0,0,0,0,0,0,0
4,395.0,5,0.092928,-0.887538,-0.805707,1,1,0,0,0.347485,...,0,0,0,0,0,0,0,0,0,0
5,315.0,3,0.092928,0.606441,-0.805707,1,1,0,0,0.347485,...,0,0,0,1,0,0,0,0,0,0
6,310.0,3,0.092928,0.606441,-0.805707,1,0,0,0,-1.145931,...,0,0,0,0,0,0,0,0,0,0
7,420.0,3,-0.509677,-0.05755,-0.169528,0,0,1,1,0.347485,...,0,0,0,0,0,0,0,0,0,0
8,170.0,3,-0.767936,-0.72154,-0.169528,0,0,0,0,-1.145931,...,0,0,0,0,0,0,0,0,0,0
9,1472.12,3,0.092928,-0.389545,1.102829,0,0,1,0,1.912586,...,0,0,0,0,0,0,0,0,0,0


## Data Analysis

### Models

#### Polynomial Regression

Polynomial regression is a form of regression analysis in which the relationship between the independent variable x and the dependent variable y is modelled as an nth degree polynomial in x.

The polynomial regression model

$y_i = \beta_0 + \beta_1 x_i + \beta_2 x_i^2 + ... + \beta_m x_i^m + \varepsilon_i$ ($i = 1, 2, ..., n$)

can be expressed in matrix form in terms of a data matrix $\mathbf {X} $, a predicted result vector $\vec y$, a parameter vector $\beta$, and a vector $\varepsilon$ of random errors. The $i$-th row of $\mathbf {X}$  and $\vec {y}$ will contain the $x$ and $y$ value for the $i$-th data sample. Then the model can be written as a system of linear equations:

$$
\left[ 
\begin{array}{cccc}
y_1 \\ 
y_2 \\
y_3 \\
\vdots \\
y_n
\end{array} 
\right]
=
\left[ 
\begin{array}{cccc}
1 & x_1 & x_1^2 & \dots & x_1^m \\ 
1 & x_2 & x_2^2 & \dots & x_2^m \\
1 & x_3 & x_3^2 & \dots & x_3^m \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
1 & x_n & x_n^2 & \dots & x_n^m
\end{array} 
\right]
\left[ 
\begin{array}{cccc}
\beta_1 \\ 
\beta_2 \\
\beta_3 \\
\vdots \\
\beta_n
\end{array} 
\right]
+
\left[ 
\begin{array}{cccc}
\varepsilon_1 \\ 
\varepsilon_2 \\
\varepsilon_3 \\
\vdots \\
\varepsilon_n
\end{array} 
\right]
$$

which can be rewritten in the following matrix form:

$\vec{y} = \mathbf{X}\vec{\beta} + \vec{\varepsilon}$.

In [13]:
# Polynomial Regression
def poly_reg(x_train, y_train, x_test, y_test, max_deg=3):
    start = time.time()
    param_grid = {
        'polynomial_features__degree': list(range(1, max_deg)),
        'polynomial_features__include_bias': [True, False],
        'linear_regression__fit_intercept': [True, False],
        'linear_regression__normalize': [True, False]}
    
    polynomial_features = PolynomialFeatures()
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    
    gs_poly = GridSearchCV(pipeline, param_grid, scoring='neg_mean_absolute_error', cv=10)
    gs_poly.fit(x_train, y_train)
    
    best_poly_model = gs_poly.best_estimator_
    y_pred = best_poly_model.predict(x_test)
    mae_poly = mean_absolute_error(y_test, y_pred)
    
    print('Best training score: ', -gs_poly.best_score_)
    print(best_poly_model)
    print("\nMean Absolute Error for Poly Regressor is: %.0f" % mae_poly)
    print('Poly Regressor computation time = %.2fs' % (time.time() - start))

#### Random Forest

Random forests or random decision forests are an ensemble learning method that can be used for regression that operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (mean/average prediction) of the individual trees.

The training algorithm for random forests applies the general technique of bootstrap aggregating, or bagging, to tree learners. Given a training set $X = x_1, ..., x_n$ with responses $Y = y_1, ..., y_n$, bagging repeatedly ($B$ times) selects a random sample with replacement of the training set and fits trees to these samples:

For $b = 1, ..., B$:
1. Sample, with replacement, $n$ training examples from $X$, $Y$; call these $X_b$, $Y_b$.
2. Train a classification or regression tree $f_b$ on $X_b$, $Y_b$.

After training, predictions for unseen samples $x'$ can be made by averaging the predictions from all the individual regression trees on $x'$:

$\hat{f} = \frac{1}{B}\sum_{b=1}^{B} f_{b}(x')$

In [14]:
# Random Forest
def random_forest(x_train, y_train, x_test, y_test):
    start = time.time()

    # Number of trees in random forest
    n_estimators = [100, 200, 300, 1000]

    # Number of features to consider at every split
    max_features = [2, 3]
    
    # Maximum number of levels in tree
    max_depth = [80, 90, 100, 110]
    max_depth.append(None)
    
    # Minimum number of samples required to split a node
    min_samples_split = [8, 10, 12]
    
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [3, 4, 5]

    # Method of selecting samples for training each tree
    bootstrap = [True]

    # Create the random grid
    param_grid = {'n_estimators': n_estimators,
                  'max_features': max_features,
                  'max_depth': max_depth,
                  'min_samples_split': min_samples_split,
                  'min_samples_leaf': min_samples_leaf,
                  'bootstrap': bootstrap}
    gs_random_forest = GridSearchCV(RandomForestRegressor(criterion="mae"), param_grid, 
                                    scoring='neg_mean_absolute_error', cv=10)
    gs_random_forest.fit(x_train, y_train)
    
    best_random_forest = gs_random_forest.best_estimator_
    y_pred = best_random_forest.predict(x_test)
    mae_random_forest = mean_absolute_error(y_test, y_pred)
    
    print('Best training score: ', -gs_random_forest.best_score_)
    print(best_random_forest)
    print("\nMean Absolute Error for Random Forest is: %.0f" % mae_random_forest)
    print('Random Forest computation time = %.2fs' % (time.time() - start))

### Analysis

In [15]:
regressors = [poly_reg]
# regressors.append(random_forest)
for regressor in regressors:
    for data in [min_max_df, z_scored_df]:
        data = min_max_df
        data_input = data.drop("price" ,axis=1)
        data_output = data["price"]
        x_train, x_test, y_train, y_test = train_test_split(data_input, data_output, test_size=0.33, random_state=42)
        regressor(x_train, y_train, x_test, y_test)

Best training score:  282.10056256697607
Pipeline(steps=[('polynomial_features',
                 PolynomialFeatures(degree=1, include_bias=False)),
                ('linear_regression',
                 LinearRegression(fit_intercept=False, normalize=True))])

Mean Absolute Error for Poly Regressor is: 249
Poly Regressor computation time = 14.52s
Best training score:  282.10056256697607
Pipeline(steps=[('polynomial_features',
                 PolynomialFeatures(degree=1, include_bias=False)),
                ('linear_regression',
                 LinearRegression(fit_intercept=False, normalize=True))])

Mean Absolute Error for Poly Regressor is: 249
Poly Regressor computation time = 11.41s
