### Imports

In [161]:
import pandas as pd
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

### Load Data

In [21]:
df = pd.read_csv('data/laptop_price_wf.csv', encoding = 'latin-1', index_col = [0])
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Price_euros,Weight_fl
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1339.69,1.37
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,898.94,1.34
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,575.0,1.86
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,2537.45,1.83
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1803.6,1.37


### Function for Modeling

In [151]:
# custom transformer that applies functino to DataFrame
class DataframeFunctionTransformer():
    def __init__(self, func):
        self.func = func
    
    def transform(self, input_df, **transform_params):
        return self.func(input_df)

    def fit(self, X, y = None, **fit_params):
        return self

# function that engineers features
def process_dataframe(input_df):
    input_df['Touchscreen'] = np.where(input_df['ScreenResolution'].str.contains('Touchscreen', case = False, na = False), 1, 0)
    input_df['ScreenResolution'] = input_df.apply(lambda x: x['ScreenResolution'][-8:], axis = 1)
    input_df['ScreenResolution'] = input_df['ScreenResolution'].str.replace(' ', '')
    input_df['Cpu'] = input_df['Cpu'].str.split().str[:1].str.join(sep = ' ')
    input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')
    input_df['Gpu'] = input_df['Gpu'].str.split().str[:1].str.join(sep = ' ')
    input_df = input_df.drop(['Product', 'laptop_ID'], axis = 1)
    return input_df

# Data Preprocessing
categorical_preprocessing = Pipeline([('ohe', OneHotEncoder())])
numerical_preprocessing = Pipeline([('stdscaler', StandardScaler())])

# Applying Transformer 
preprocess = ColumnTransformer([
    ('categorical_preprocessing', categorical_preprocessing, ['Company', 'TypeName', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys']),
    ('numerical_preprocessing', numerical_preprocessing, ['Inches', 'Weight_fl'])
])

# function that produces performance of pipeline with indicated model
def main(input_df, input_model):
    pipeline = Pipeline([
        ('features', DataframeFunctionTransformer(process_dataframe)),
        ('preprocess', preprocess),
        ('model', input_model)
    ])

    X = df.drop('Price_euros', axis = 1)
    y = df['Price_euros']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    print('MSE is: ' + str(mean_squared_error(y_test, y_pred)))
    print('RMSE is: ' + str(sqrt(mean_squared_error(y_test, y_pred))))
    print('MAE is: ' + str(mean_absolute_error(y_test, y_pred)))

In [152]:
# linear regression
model = LinearRegression()
main(df, model)

MSE is: 84489.24970622444
RMSE is: 290.6703454193847
MAE is: 216.8889500811403


  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')
  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')


In [154]:
# stochastic gradient descent regression
model = SGDRegressor()
main(df, model)

MSE is: 81113.39652638645
RMSE is: 284.8041371300397
MAE is: 219.48312918148105


  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')
  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')


In [156]:
# K-nearest neighbors regression
model = KNeighborsRegressor()
main(df, model)

MSE is: 75631.56536842944
RMSE is: 275.01193677444155
MAE is: 200.88870552147236


  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')
  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')


In [158]:
# random forest regression
model = RandomForestRegressor()
main(df, model)

  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')


MSE is: 64845.674762401424
RMSE is: 254.6481391300581
MAE is: 180.46111406502752


  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')


In [160]:
# xgboost regression
model = GradientBoostingRegressor()
main(df, model)

MSE is: 77241.22692574473
RMSE is: 277.9230593630991
MAE is: 202.03064983546813


  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')
  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')


In [162]:
# support vector machine regression
model = SVR()
main(df, model)

MSE is: 375230.794603882
RMSE is: 612.5608497152606
MAE is: 463.883540142214


  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')
  input_df['Memory'] = input_df['Memory'].str.replace('1.0', '1')
