In [14]:
import pandas as pd

In [15]:
# Load the cleaned dataset and report the median of the target column.
# Only a cell's last expression is rendered, so the median is this cell's
# single displayed output.
data = pd.read_csv('cleaned_data.csv')
data['Price'].median()

989.0

In [16]:
## Separate the target (y, dependent variable) from the predictors (X, independent variables)
y = data['Price']
X = data.drop(labels=['Price'], axis=1)

In [17]:

# Summarise every column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Company               1275 non-null   object 
 1   TypeName              1275 non-null   object 
 2   Inches                1275 non-null   float64
 3   CPU_Company           1275 non-null   object 
 4   CPU_Frequency (GHz)   1275 non-null   float64
 5   RAM (GB)              1275 non-null   int64  
 6   GPU_Company           1275 non-null   object 
 7   OpSys                 1275 non-null   object 
 8   Weight (kg)           1275 non-null   float64
 9   screen_x_size         1275 non-null   int64  
 10  screen_y_size         1275 non-null   int64  
 11  Touch_screen          1275 non-null   int64  
 12  IPS                   1275 non-null   int64  
 13  Full_HD               1275 non-null   int64  
 14  4k_Ultra_HD           1275 non-null   int64  
 15  Quad_HD              

In [18]:
## Dtype and number of distinct values for every column
for column in data.columns:
    # nunique() counts distinct non-null values (NaN is excluded by default).
    print(f'{column} - {data[column].dtype} - {data[column].nunique()}')
    

Company - object - 19
TypeName - object - 6
Inches - float64 - 18
CPU_Company - object - 3
CPU_Frequency (GHz) - float64 - 25
RAM (GB) - int64 - 9
GPU_Company - object - 4
OpSys - object - 9
Weight (kg) - float64 - 171
screen_x_size - int64 - 13
screen_y_size - int64 - 10
Touch_screen - int64 - 2
IPS - int64 - 2
Full_HD - int64 - 2
4k_Ultra_HD - int64 - 2
Quad_HD - int64 - 2
Screen_Resolution_NA - int64 - 2
Price - float64 - 791
SSD - int64 - 2
HDD - int64 - 2
Flash_Storage - int64 - 2


In [19]:
## Transformer

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Encoders / scaler used by the column transformer below.
one_hot = OneHotEncoder()
ordinal = OrdinalEncoder()
standard = StandardScaler()

# Route each group of columns to its transformer:
#   - one-hot encoding for the CPU / GPU vendor columns
#   - standard scaling for the numeric measurements and the two-valued flag columns
#   - ordinal integer codes for Company / TypeName / OpSys
# The order of the entries (and of the column lists) fixes the order of the
# output columns, so it is kept stable.
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", one_hot, ['CPU_Company', 'GPU_Company']),
        (
            "StandardScaler",
            standard,
            [
                'Inches', 'CPU_Frequency (GHz)', 'RAM (GB)', 'Weight (kg)',
                'screen_x_size', 'screen_y_size', 'Touch_screen', 'IPS',
                'Full_HD', '4k_Ultra_HD', 'Quad_HD', 'Screen_Resolution_NA',
                'SSD', 'HDD', 'Flash_Storage',
            ],
        ),
        ('OrdinalEncoder', ordinal, ['Company', 'TypeName', 'OpSys']),
    ]
)

# Transform from the raw frame rather than from `X` itself: once this cell has
# run, `X` holds the transformed array, and selecting columns by name from an
# array raises, so `fit_transform(X)` could not be executed a second time.
# NOTE(review): the transformer is fitted on all rows before the train/test
# split, so the scaler statistics include the test rows; consider fitting on
# the training portion only.
X = preprocessor.fit_transform(data.drop(columns=['Price']))




In [20]:
## Train test split

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run. Note the test features are bound to the
# lowercase name `x_test`, unlike `X_train`.
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [21]:
## Modeling Import

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import warnings
import numpy as np

In [22]:
## create an evaluate function to give all metrics after model training

def evaluate_model(true, predicated):
    """Score a set of predictions against the observed targets.

    Args:
        true: Observed target values.
        predicated: Predicted values, aligned element-wise with ``true``.

    Returns:
        Tuple ``(mae, rmse, r2_square)``: mean absolute error, root mean
        squared error and R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicated)
    # Compute the MSE once and take its root, instead of evaluating it twice.
    mse = mean_squared_error(true, predicated)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicated)
    return mae, rmse, r2_square

In [23]:
# Candidate regressors, all with library-default hyperparameters.
# NOTE(review): no random_state is passed to the tree / ensemble estimators,
# so their scores can differ from run to run.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random forest regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
   ## "Catboosting Regressor": CatBoostRegressor(verbose=False)
}

model_list = []  # model names, in training order
r2_list = []     # test-set R2 per model, aligned with model_list

for name, model in models.items():
    model.fit(X_train, y_train) ## train model

    ## Make prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(x_test)

    ## Evaluate train and test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(name)
    model_list.append(name)

    # evaluate_model returns (MAE, RMSE, R2); each label names the metric it prints.
    print("Model performance for training set")
    print("Mean absolute error: {:.4f}".format(model_train_mae))
    print("Root mean squared error: {:.4f}".format(model_train_rmse))
    print("R2 score: {:.4f}".format(model_train_r2))

    print("Model performance for test set")
    print("Mean absolute error: {:.4f}".format(model_test_mae))
    print("Root mean squared error: {:.4f}".format(model_test_rmse))
    print("R2 score: {:.4f}".format(model_test_r2))

    r2_list.append(model_test_r2)

LinearRegression
Model performance for training set
Root mean squared error: 268.5676
Mean squared error: 367.7113
R2 score: 0.7361
Model performance for test set
Root mean squared error: 287.7969
Mean squared error: 391.4296
R2 score: 0.6191
Lasso
Model performance for training set
Root mean squared error: 268.2774
Mean squared error: 367.8261
R2 score: 0.7359
Model performance for test set
Root mean squared error: 286.8202
Mean squared error: 390.3117
R2 score: 0.6213
Ridge
Model performance for training set
Root mean squared error: 268.5272
Mean squared error: 367.7141
R2 score: 0.7361
Model performance for test set
Root mean squared error: 287.7164
Mean squared error: 391.3201
R2 score: 0.6193
K-Neighbors Regressor
Model performance for training set
Root mean squared error: 202.2556
Mean squared error: 307.4457
R2 score: 0.8155
Model performance for test set
Root mean squared error: 248.2664
Mean squared error: 359.8277
R2 score: 0.6781
Decision Tree
Model performance for training 

In [24]:
## Results

# Rank the models by test-set R2, best first.
(
    pd.DataFrame(zip(model_list, r2_list), columns=['Model Name', 'R2 score'])
    .sort_values('R2 score', ascending=False)
)

Unnamed: 0,Model Name,R2 score
6,XGBRegressor,0.82757
5,Random forest regressor,0.823877
3,K-Neighbors Regressor,0.678121
4,Decision Tree,0.661505
1,Lasso,0.621273
2,Ridge,0.619314
0,LinearRegression,0.619101
7,AdaBoostRegressor,0.552833


In [25]:
# Raw (model name, test R2) pairs, in the order the models were trained.
list(zip(model_list, r2_list))

[('LinearRegression', 0.6191006721815764),
 ('Lasso', 0.6212733208740477),
 ('Ridge', 0.6193138477184013),
 ('K-Neighbors Regressor', 0.6781214318069774),
 ('Decision Tree', 0.6615045070308585),
 ('Random forest regressor', 0.8238765517271921),
 ('XGBRegressor', 0.827569923771812),
 ('AdaBoostRegressor', 0.5528326980271303)]

In [28]:
# Number of rows per operating-system label in the OpSys column.
data['OpSys'].value_counts()

OpSys
Windows 10      1048
No OS             66
Linux             58
Windows 7         45
Chrome OS         27
macOS             13
Mac OS X           8
Windows 10 S       8
Android            2
Name: count, dtype: int64

In [32]:
## Therefore XGboost gives better result
from xgboost import XGBRegressor
XGBRegressor = XGBRegressor()
XGBRegressor.fit(X_train, y_train)

In [33]:
## Create a pickle file

import pickle
with open('preprocessor.pkl', 'wb') as file:
    pickle.dump(preprocessor, file)

with open('model.pkl', 'wb') as file:
    pickle.dump(XGBRegressor, file)

In [38]:
# Draw one random row; no random_state is set, so the row differs between runs.
data.sample(1)

Unnamed: 0,Company,TypeName,Inches,CPU_Company,CPU_Frequency (GHz),RAM (GB),GPU_Company,OpSys,Weight (kg),screen_x_size,...,Touch_screen,IPS,Full_HD,4k_Ultra_HD,Quad_HD,Screen_Resolution_NA,Price,SSD,HDD,Flash_Storage
97,Dell,Notebook,15.6,Intel,2.0,4,AMD,Linux,2.2,1920,...,0,0,1,0,0,0,485.0,1,0,0


In [41]:
# Number of rows per operating-system label in the OpSys column.
data['OpSys'].value_counts()

OpSys
Windows 10      1048
No OS             66
Linux             58
Windows 7         45
Chrome OS         27
macOS             13
Mac OS X           8
Windows 10 S       8
Android            2
Name: count, dtype: int64

In [None]:
# Scratch list of the OpSys labels, in the order printed by data['OpSys'].value_counts().
"Windows 10", "No OS", "Linux", "Windows 7", "Chrome OS", "macOS", "Mac OS X", "Windows 10 S", "Android"