In [257]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [258]:
df1=pd.read_csv("missing_value_imputation.csv")
df=df1.copy()

In [259]:
df.drop(columns=["hdd","thickness_num","weight_num","weight", 'usb2', 'ppi_type',"everyday_use","performance","vga","multi_card_reader","quality_type","antiglare","fingerprint_sensor","ethernet","hdmi","display_port","usb3"],inplace=True)

In [260]:
df.head()

Unnamed: 0,brand,price,thickness,screen_size,ppi,threads,ram,touch_screen,cores,battery_capacity,...,processor_gen,processor_brand,processor_model,graphics_brand,graphics_capacity,graphics_model,business,gaming,ssd,popularity
0,Acer,25990,medium,14,157,4.0,8.0,0,2.0,45.5,...,11.0,intel,i3,intel,0.0,Integrated,0,0,512.0,popular
1,Wings,34990,medium,15,141,8.0,8.0,0,4.0,17.85,...,11.0,intel,i5,intel,0.0,Integrated,1,0,512.0,regular
2,MSI,49990,medium,15,141,12.0,16.0,0,8.0,51.0,...,12.0,intel,i5,intel,4.0,Integrated,0,0,512.0,regular
3,Acer,79745,thick,15,141,12.0,8.0,0,8.0,86.0,...,13.0,intel,i5,nvidia,6.0,rtx4050,0,1,512.0,regular
4,Acer,35990,medium,15,141,12.0,16.0,0,6.0,36.0,...,5.0,amd,5,intel,0.0,Integrated,0,0,512.0,regular


In [261]:
X=df.drop("price",axis=1)
y=df["price"]
y_transformed=np.log1p(y)

In [284]:
ohe_columns=["brand","graphics_brand","processor_brand","touch_screen","backlit","business","gaming"]
oe_columns=["thickness","typec","screen_size","processor_gen","processor_model","popularity","graphics_model"]
std=["ppi","battery_capacity","ssd","threads","ram","cores","graphics_capacity","battery_cell"]

In [285]:
all_columns=ohe_columns+oe_columns

In [286]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), std),
        ('cat', OrdinalEncoder(), all_columns)#oe_columns),
        #('cat1',OneHotEncoder(drop="first",sparse_output=False),ohe_columns)
    ], 
    remainder='passthrough'
)

In [287]:
transformed_data=preprocessor.fit_transform(X)

In [288]:
transformed_df = pd.DataFrame(transformed_data, columns=preprocessor.get_feature_names_out())

In [289]:
transformed_df.shape

(798, 23)

In [290]:
X_train, X_test, y_train, y_test = train_test_split(transformed_df,y_transformed,test_size=0.2,random_state=42)

In [291]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}


In [292]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=43)
    scores = cross_val_score(model, transformed_df, y_transformed, cv=5, scoring='r2')
    
    output.append(scores.mean())
    
    model.fit(X_train,y_train)
    
    y_pred = model.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [293]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [294]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.886412,10343.568141
5,random forest,0.883824,10942.557228
7,gradient boosting,0.897446,11126.152853
9,xgboost,0.897101,12205.465259
1,svr,0.85246,12706.777508
0,linear_reg,0.842483,13428.40301
2,ridge,0.842333,13447.435784
8,adaboost,0.812071,13811.086438
4,decision tree,0.78556,16269.626775
3,LASSO,0.22169,26834.82439
