In [2]:
import pandas as pd
import numpy as np

In [3]:
laptop = pd.read_csv('laptop_data.csv')

In [4]:
laptop.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [5]:
laptop.isnull().sum()

Unnamed: 0          0
Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
dtype: int64

In [6]:
laptop.drop(columns='Unnamed: 0',inplace=True)

In [7]:
laptop['Ram'] = laptop['Ram'].str.replace('GB','').astype('int32')

In [8]:
laptop['Weight'] = laptop['Weight'].str.replace('kg','').astype('float32')

In [9]:
laptop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1303 non-null   object 
 1   TypeName          1303 non-null   object 
 2   Inches            1303 non-null   float64
 3   ScreenResolution  1303 non-null   object 
 4   Cpu               1303 non-null   object 
 5   Ram               1303 non-null   int32  
 6   Memory            1303 non-null   object 
 7   Gpu               1303 non-null   object 
 8   OpSys             1303 non-null   object 
 9   Weight            1303 non-null   float32
 10  Price             1303 non-null   float64
dtypes: float32(1), float64(2), int32(1), object(7)
memory usage: 101.9+ KB


In [12]:
def convertyr(x):
    for i in x.split(' '):
        if 'x' in i:
            return int(i.split('x')[1])

In [11]:
laptop['x_res'] = laptop['ScreenResolution'].apply(convertxr)

In [13]:
laptop['y_res'] = laptop['ScreenResolution'].apply(convertyr)

In [14]:
laptop['touchscreen'] = laptop['ScreenResolution'].apply(lambda x:1 if 'Touchscreen' in x else 0)

In [15]:
laptop['ipd'] = laptop['ScreenResolution'].apply(lambda x:1 if 'IPS' in x else 0)

In [16]:
laptop.corr()['Price']

Inches         0.068197
Ram            0.743007
Weight         0.210370
Price          1.000000
x_res          0.556529
y_res          0.552809
touchscreen    0.191226
ipd            0.252208
Name: Price, dtype: float64

In [17]:
laptop['ppi'] = (((laptop['x_res']**2+laptop['y_res']**2)**0.5)/laptop['Inches']).astype('float')

In [18]:
laptop.drop(columns=['Inches','ScreenResolution','x_res','y_res'],inplace=True)

In [19]:
laptop['Cpu'] = laptop['Cpu'].apply(lambda x:' '.join(x.split(' ')[0:3]) if 'Intel' in x.split(' ') else ''.join(x.split(' ')[0]))

In [20]:
laptop['Cpu'].unique()

array(['Intel Core i5', 'Intel Core i7', 'AMD', 'Intel Core i3',
       'Intel Core M', 'Intel Atom x5-Z8350', 'Intel Atom x5-Z8300',
       'Intel Celeron Dual', 'Intel Pentium Quad', 'Intel Atom x5-Z8550',
       'Intel Xeon E3-1505M', 'Intel Celeron Quad', 'Intel Xeon E3-1535M',
       'Intel Atom Z8350', 'Intel Pentium Dual', 'Intel Atom X5-Z8350',
       'Samsung'], dtype=object)

In [21]:
def convert_gb(x):
    for i in x.split(' '):
        if 'GB' in i:
            return i.replace('GB','')
        if 'TB' in i:
            return i.replace('TB','')

In [22]:
laptop['Memory'] = laptop['Memory'].apply(convert_gb)

In [23]:
laptop['Gpu'] = laptop['Gpu'].apply(lambda x:x.split(' ')[0])

In [24]:
laptop['OpSys'] = laptop['OpSys'].apply(lambda x:x.split(' ')[0])

## set input/output

In [25]:
x = laptop.drop(columns=['Price'])
y = np.log(laptop['Price'])

In [26]:
x

Unnamed: 0,Company,TypeName,Cpu,Ram,Memory,Gpu,OpSys,Weight,touchscreen,ipd,ppi
0,Apple,Ultrabook,Intel Core i5,8,128,Intel,macOS,1.37,0,1,226.983005
1,Apple,Ultrabook,Intel Core i5,8,128,Intel,macOS,1.34,0,0,127.677940
2,HP,Notebook,Intel Core i5,8,256,Intel,No,1.86,0,0,141.211998
3,Apple,Ultrabook,Intel Core i7,16,512,AMD,macOS,1.83,0,1,220.534624
4,Apple,Ultrabook,Intel Core i5,8,256,Intel,macOS,1.37,0,1,226.983005
...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,2 in 1 Convertible,Intel Core i7,4,128,Intel,Windows,1.80,1,1,157.350512
1299,Lenovo,2 in 1 Convertible,Intel Core i7,16,512,Intel,Windows,1.30,1,1,276.053530
1300,Lenovo,Notebook,Intel Celeron Dual,2,64,Intel,Windows,1.50,0,0,111.935204
1301,HP,Notebook,Intel Core i7,6,1,AMD,Windows,2.19,0,0,100.454670


In [27]:
from sklearn.model_selection import train_test_split

In [28]:
x_train, x_test , y_train , y_test = train_test_split(x,y,test_size=0.15,random_state=2)

In [29]:
x_train

Unnamed: 0,Company,TypeName,Cpu,Ram,Memory,Gpu,OpSys,Weight,touchscreen,ipd,ppi
183,Toshiba,Notebook,Intel Core i5,8,128,Intel,Windows,2.00,0,0,100.454670
699,HP,Notebook,Intel Core i7,4,500,AMD,Windows,2.10,0,1,141.211998
419,Lenovo,Ultrabook,Intel Core i7,8,256,AMD,Windows,1.75,0,1,157.350512
997,Acer,Notebook,Intel Pentium Quad,4,500,Intel,Windows,2.40,0,0,100.454670
502,Lenovo,2 in 1 Convertible,Intel Core i5,8,256,Intel,Windows,1.40,1,1,158.482530
...,...,...,...,...,...,...,...,...,...,...,...
466,Acer,Notebook,Intel Core i3,4,500,Nvidia,Windows,2.20,0,0,100.454670
299,Asus,Ultrabook,Intel Core i7,16,512,Nvidia,Windows,1.63,0,0,141.211998
493,Acer,Notebook,AMD,8,1,AMD,Windows,2.20,0,0,100.454670
527,Lenovo,Notebook,Intel Core i3,8,2,Nvidia,No,2.20,0,0,100.454670


In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error

In [31]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Linear Regression

In [38]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,2,5,6])
],remainder='passthrough')

step2 = LinearRegression()

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print('R2 score = ',r2_score(y_test,y_pred))
print('MAE = ',mean_absolute_error(y_test,y_pred))

R2 score =  0.8317334527151818
MAE =  0.20688774627224168


# Ridge Regression

In [268]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,2,5,6])
],remainder='passthrough')

step2 = Ridge()

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print('R2 score = ',r2_score(y_test,y_pred))
print('MAE = ',mean_absolute_error(y_test,y_pred))

R2 score =  0.8294560770137985
MAE =  0.2070821649780677


# Lasso Regression

In [269]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,2,5,6])
],remainder='passthrough')

step2 = Lasso(alpha=0.001)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print('R2 score = ',r2_score(y_test,y_pred))
print('MAE = ',mean_absolute_error(y_test,y_pred))

R2 score =  0.8132955572876386
MAE =  0.21574928820557382


# KNN

In [270]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,2,5,6])
],remainder='passthrough')

step2 = KNeighborsRegressor(n_neighbors=3)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print('R2 score = ',r2_score(y_test,y_pred))
print('MAE = ',mean_absolute_error(y_test,y_pred))

R2 score =  0.8422936027646528
MAE =  0.18885277993447427


# Decision Tree

In [271]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,2,5,6])
],remainder='passthrough')

step2 = DecisionTreeRegressor(max_depth=8)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print('R2 score = ',r2_score(y_test,y_pred))
print('MAE = ',mean_absolute_error(y_test,y_pred))

R2 score =  0.8281711168975594
MAE =  0.19044179094601504


# SVM

In [272]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,2,5,6])
],remainder='passthrough')

step2 = SVR(kernel='rbf',C=10000,epsilon=0.1)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print('R2 score = ',r2_score(y_test,y_pred))
print('MAE = ',mean_absolute_error(y_test,y_pred))

R2 score =  0.8678159975752746
MAE =  0.17954765812142726


# Random Forest

In [279]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,2,5,6])
],remainder='passthrough')

step2 = RandomForestRegressor(n_estimators=100,
                             random_state=3,
                             max_samples=0.5,
                             max_features=0.75,
                             max_depth=15)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print('R2 score = ',r2_score(y_test,y_pred))
print('MAE = ',mean_absolute_error(y_test,y_pred))

R2 score =  0.8823040467025359
MAE =  0.16430780323354652


# Extras Tree

In [275]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,2,5,6])
],remainder='passthrough')

step2 = ExtraTreesRegressor(n_estimators=100,
                             random_state=3,
                             max_samples=0.5,
                             max_features=0.75,
                             max_depth=15)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print('R2 score = ',r2_score(y_test,y_pred))
print('MAE = ',mean_absolute_error(y_test,y_pred))

R2 score =  0.8676063443964341
MAE =  0.16077538597843355


# Ada Boost

In [276]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,2,5,6])
],remainder='passthrough')

step2 = AdaBoostRegressor(n_estimators=15,learning_rate=1.0)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print('R2 score = ',r2_score(y_test,y_pred))
print('MAE = ',mean_absolute_error(y_test,y_pred))

R2 score =  0.7887003604211325
MAE =  0.23363984620351136


# Xgboost

In [277]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,2,5,6])
],remainder='passthrough')

step2 = XGBRegressor(n_estimators=45,max_depth=5,learning_rate=0.5)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print('R2 score = ',r2_score(y_test,y_pred))
print('MAE = ',mean_absolute_error(y_test,y_pred))

R2 score =  0.8881922143121077
MAE =  0.16236223993073223


# Exporting the model

In [278]:
import pickle

In [280]:
pickle.dump(laptop,open('laptop_data_pkl.pkl','wb'))
pickle.dump(pipe,open('pipe.pkl','wb'))

In [1]:
x.iloc[808].to_list()

NameError: name 'x' is not defined

In [302]:
int(np.exp(pipe.predict(np.array(x.iloc[808].to_list()).reshape(1,11))[0]))



99025