### Imports

In [1]:
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

### Load Data

In [119]:
# Read the raw laptop listings; the first CSV column is used as the row index.
df = pd.read_csv(
    'data/laptop_price_wf.csv',
    encoding = 'latin-1',
    index_col = [0],
)
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Price_euros,Weight_fl
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1339.69,1.37
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,898.94,1.34
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,575.0,1.86
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,2537.45,1.83
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1803.6,1.37


### Function for Preprocessing

In [120]:
# function that preprocesses dataframe
def process_dataframe(input_df, min_category_count = 3):
    """Derive model features from the raw laptop listing frame.

    The caller's frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw listings. Must contain the string columns 'ScreenResolution',
        'Cpu', 'Memory' and 'Gpu', plus 'Product' and 'laptop_ID' (both are
        dropped from the result).
    min_category_count : int, default 3
        A row is kept only if its processed 'Cpu' value and its processed
        'Gpu' value each occur in MORE than this many rows.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        - 'Touchscreen' is 1 when 'ScreenResolution' mentions "touchscreen"
          (case-insensitive), else 0;
        - 'ScreenResolution' is the trailing WIDTHxHEIGHT token (the raw text
          is kept when no such token is found);
        - 'Cpu_brand' / 'Cpu' are the first / second word of the CPU text
          after removing the word "Core";
        - 'Gpu_brand' / 'Gpu' are the first / second word of the GPU text;
        - 'ssd' is 1 when 'Memory' mentions "ssd" or "hybrid"
          (case-insensitive), else 0;
        - 'Memory' is the summed size of all listed drives, with 1TB and 2TB
          counted as 1024 and 2048;
        - 'Product' and 'laptop_ID' are removed.

    Raises
    ------
    KeyError
        If 'Product' or 'laptop_ID' (or any column read above) is missing.
    """
    # Copy first so that new/overwritten columns never leak into the caller's frame.
    out = input_df.copy()

    # screen resolution
    resolution = out['ScreenResolution']
    out['Touchscreen'] = np.where(resolution.str.contains('Touchscreen', case = False, na = False), 1, 0)
    # Pull the full WIDTHxHEIGHT token; a fixed-width slice cuts the leading
    # digit off 9-character resolutions such as 1920x1080.
    extracted = resolution.str.extract(r'(\d+x\d+)\s*$', expand = False)
    out['ScreenResolution'] = extracted.fillna(resolution)

    # cpu: first word is the brand, second word (ignoring "Core") is the family
    cpu_tokens = out['Cpu'].str.replace('Core', '', regex = False).str.split()
    out['Cpu_brand'] = cpu_tokens.str[0]
    out['Cpu'] = cpu_tokens.str[1:2].str.join(sep = ' ')

    # memory
    memory = out['Memory']
    out['ssd'] = np.where(memory.str.contains('ssd|hybrid', case = False, na = False), 1, 0)
    memory_tokens = (
        memory
        .str.replace('1.0', '1', regex = False)   # "1.0TB" -> "1TB"
        .str.replace('1TB', '1024', regex = False)
        .str.replace('2TB', '2048', regex = False)
        .str.replace('GB', '', regex = False)
        .str.split()
    )
    # Sum every purely numeric token, e.g. "128 SSD + 1024 HDD" -> 1152.
    out['Memory'] = memory_tokens.apply(lambda tokens: sum(int(tok) for tok in tokens if tok.isdigit()))

    # gpu: first word is the brand, second word is the family
    gpu_tokens = out['Gpu'].str.split()
    out['Gpu_brand'] = gpu_tokens.str[0]
    out['Gpu'] = gpu_tokens.str[1:2].str.join(sep = ' ')

    # drop columns
    out = out.drop(['Product', 'laptop_ID'], axis = 1)

    # drop rows whose Cpu or Gpu category is too rare
    cpu_counts = out['Cpu'].map(out['Cpu'].value_counts())
    gpu_counts = out['Gpu'].map(out['Gpu'].value_counts())
    mask = (cpu_counts > min_category_count) & (gpu_counts > min_category_count)
    return out[mask]

### Function for Pipeline

In [121]:
# Data Preprocessing
# handle_unknown = 'ignore': a category that shows up only at predict time is
# encoded as an all-zero row instead of raising. Only Cpu and Gpu are filtered
# for rare values upstream, so the other categorical columns can still contain
# values that land exclusively in the test split.
categorical_preprocessing = Pipeline([('ohe', OneHotEncoder(handle_unknown = 'ignore'))])
numerical_preprocessing = Pipeline([('stdscaler', StandardScaler())])

# Applying Transformer
# Categorical columns are one-hot encoded, numeric columns are standardised;
# any column not listed here is dropped (ColumnTransformer's default remainder).
preprocess = ColumnTransformer([
    (
        'categorical_preprocessing',
        categorical_preprocessing,
        ['Company', 'TypeName', 'ScreenResolution', 'Cpu', 'Ram', 'Gpu', 'OpSys', 'Cpu_brand', 'Gpu_brand'],
    ),
    (
        'numerical_preprocessing',
        numerical_preprocessing,
        ['Inches', 'Memory', 'Weight_fl'],
    ),
])

# function that produces performance of pipeline with indicated model
def main(input_df, input_model):
    """Fit preprocessing + ``input_model`` on a train split and print test errors.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Processed listings. Must contain the target column 'Price_euros' and
        every feature column referenced by the module-level ``preprocess``
        transformer.
    input_model : estimator
        Unfitted scikit-learn style regressor; it is fitted in place as the
        final pipeline step.

    Returns
    -------
    None
        MSE, RMSE and MAE on the 25% hold-out split are printed.
    """
    pipeline = Pipeline([
        ('preprocess', preprocess),
        ('model', input_model)
    ])

    # Evaluate the frame that was passed in rather than the notebook-level `df`.
    X = input_df.drop('Price_euros', axis = 1)
    y = input_df['Price_euros']
    # Fixed seed keeps the split identical across models so their errors are comparable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 23)

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    # Compute MSE once and derive RMSE from it.
    mse = mean_squared_error(y_test, y_pred)
    print('MSE is: ' + str(mse))
    print('RMSE is: ' + str(sqrt(mse)))
    print('MAE is: ' + str(mean_absolute_error(y_test, y_pred)))

In [122]:
# Replace the raw frame with its processed version.
# NOTE: this rebinds `df`, and process_dataframe drops 'Product'/'laptop_ID',
# so re-running this cell requires re-running the load cell first.
df = process_dataframe(input_df = df)

In [123]:
# Baseline: ordinary least-squares linear regression with default settings
model = LinearRegression()
main(input_df = df, input_model = model)

MSE is: 106138.2510155169
RMSE is: 325.78866004745606
MAE is: 233.05009895723583


In [124]:
# stochastic gradient descent regression
# SGD shuffles the training data, so fix the seed (same value as the
# train/test split) to make the reported metrics reproducible.
model = SGDRegressor(random_state = 23)
main(df, model)

MSE is: 109990.16509906074
RMSE is: 331.6476520330888
MAE is: 234.96288938195812


In [125]:
# k-nearest-neighbours regression with scikit-learn's default settings
model = KNeighborsRegressor()
main(input_df = df, input_model = model)

MSE is: 103449.16395211112
RMSE is: 321.6351410404515
MAE is: 212.02324074074073


In [126]:
# random forest regression
# Bootstrapping and feature sampling are random, so fix the seed (same value
# as the train/test split) to make the reported metrics reproducible.
model = RandomForestRegressor(random_state = 23)
main(df, model)

MSE is: 93795.70445364439
RMSE is: 306.26084381396913
MAE is: 204.9315895493627


In [127]:
# gradient boosting regression (scikit-learn's GradientBoostingRegressor, not the XGBoost library)
# Seeded (same value as the train/test split) so the reported metrics are reproducible.
model = GradientBoostingRegressor(random_state = 23)
main(df, model)

MSE is: 100511.19437052678
RMSE is: 317.0350049608509
MAE is: 218.506737628709


In [128]:
# Support vector regression with scikit-learn's default hyperparameters.
# NOTE(review): the errors printed for this model are far larger than for the
# others; presumably the defaults do not suit an unscaled euro-valued target.
# Confirm by tuning C/epsilon or scaling the target.
model = SVR()
main(input_df = df, input_model = model)

MSE is: 507171.6591860877
RMSE is: 712.1598550789616
MAE is: 510.0649214806689
