In [28]:
import pandas as pd
import os
import numpy as np
import yaml
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders import CatBoostEncoder
import lazypredict
from lazypredict.Supervised import LazyRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import all_estimators
from sklearn.base import RegressorMixin
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from yellowbrick.regressor import ResidualsPlot
from sklearn.pipeline import Pipeline

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
#random seed
random.seed(42)
config_path = os.path.join(os.getcwd(), 'config\config.yaml')

In [29]:
with open(config_path) as f:
    config = yaml.safe_load(f)

In [30]:
df = pd.read_csv("data_tidy.csv")
df.shape

(12513, 28)

In [31]:
#lowercase if object
df = df.apply(lambda x: x.astype(str).str.lower() if x.dtype == "object" else x)

Drop null những MISC_Price_Euro có giá trị null

In [32]:
#drop na in "MISC_Price_Euro"
df = df.dropna(subset=["MISC_Price_Euro"])
df.shape

(8438, 28)

In [33]:
X = df.drop(["MISC_Price_Euro"], axis=1)
X = X[config['variables']]
y = df["MISC_Price_Euro"]


In [34]:
# # X['resolution'] = DISPLAY_Resolution_Width*DISPLAY_Resolution_Height
# X['resolution'] = X['DISPLAY_Resolution_Width']*X['DISPLAY_Resolution_Height']
# #drop
# X = X.drop(["DISPLAY_Resolution_Width", "DISPLAY_Resolution_Height"], axis=1)

#### Catboost Enc features

In [35]:
cat_enc_cols = config["cat_enc_cols"]
cat_enc_cols

['Brand',
 'NETWORK_Technology',
 'NETWORK_2G_bands',
 'BATTERY_Type',
 'MAIN_CAM_1_Module',
 'MAIN_CAM_1_Video',
 'DISPLAY_Type',
 'PLATFORM_OS',
 'MEMORY_Card_slot']

In [36]:
cat_enc_pipe = Pipeline([
    ('enc', CatBoostEncoder())
])
for col in cat_enc_cols:
    X[col] = cat_enc_pipe.fit_transform(X[col], y)



#### Enc for labels 

In [37]:
# df.FEATURES_Sensors = df.FEATURES_Sensors.str.lower()
# def find_unique_values(series):
#     unique_values = set()
#     for value in series:
#         if isinstance(value, str):  # Only process string values
#             sensors = value.split(', ')
#             for sensor in sensors:
#                 unique_values.add(sensor)
#     return list(unique_values)

# sensors = find_unique_values(df.FEATURES_Sensors)
# sensors = sorted(sensors)
# len(sensors)

In [38]:
# # Split the 'FEATURES_Sensors' column into separate sensors
# df_sensors = df['FEATURES_Sensors'].str.split(', ', expand=True).stack()
# #lowercase

# # Create a new DataFrame with the individual sensors and the corresponding 'MISC_Price_Euro' values
# df_individual_sensors = df.loc[df_sensors.index.get_level_values(0), ['MISC_Price_Euro']].copy()
# df_individual_sensors['Sensor'] = df_sensors.values

# # Calculate the mean 'MISC_Price_Euro' for each sensor
# sensor_means = df_individual_sensors.groupby('Sensor')['MISC_Price_Euro'].mean()

# # Now sensor_means is a Series where the index is the sensor name and the value is the mean 'MISC_Price_Euro'
# sensor_means

In [39]:
def calculate_sensor_means(df, column):
    # Split the column into separate sensors
    df[column] = df[column].str.lower()
    df_sensors = df[column].str.split(', ', expand=True).stack()
    # lowercase all values
    df_sensors = df_sensors.str.lower()
    # Create a new DataFrame with the individual sensors and the corresponding 'MISC_Price_Euro' values
    df_individual_sensors = df.loc[df_sensors.index.get_level_values(0), ['MISC_Price_Euro']].copy()
    df_individual_sensors['Sensor'] = df_sensors.values

    # Calculate the mean 'MISC_Price_Euro' for each sensor
    sensor_means = df_individual_sensors.groupby('Sensor')['MISC_Price_Euro'].mean()

    # Replace the values in the column with the mean 'MISC_Price_Euro' for each sensor
    df[column] = df[column].apply(lambda x: np.mean([sensor_means.get(i, 0) for i in str(x).split(', ')]))
    #fill
    df[column].replace(0, df[column].mean(), inplace=True)
    #add to X
    X[column] = df[column]
    return df

# Usage:
df = calculate_sensor_means(df, 'NETWORK_Speed')
df = calculate_sensor_means(df, 'FEATURES_Sensors')

In [40]:
X.isnull().sum()

Brand                           0
NETWORK_Technology              0
NETWORK_2G_bands                0
NETWORK_Speed                   0
LAUNCH_Announced               11
BODY_Weight                   544
BODY_Length                   197
BODY_Width                    196
BODY_Thickness                167
FEATURES_Sensors                0
DISPLAY_Type                    0
DISPLAY_Size                   90
PLATFORM_OS                     0
MEMORY_Card_slot                0
MEMORY_Internal_rom           561
MEMORY_Internal_ram          1783
BATTERY_Type                    0
BATTERY_Capacity              261
MAIN_CAM_1_Module               0
MAIN_CAM_1_Video                0
DISPLAY_Resolution_Width       13
DISPLAY_Resolution_Height      12
dtype: int64

#### FILLIN

In [41]:
from sklearn.impute import KNNImputer

In [42]:
# Specify the columns to impute
columns_to_impute = ['BODY_Weight', 'BODY_Length', 'BODY_Width', 'BODY_Thickness', 
                     'MEMORY_Internal_rom', 'MEMORY_Internal_ram', 'LAUNCH_Announced', 
                     'DISPLAY_Resolution_Width', 'DISPLAY_Resolution_Height', 'DISPLAY_Size', 'BATTERY_Capacity']

# Create the imputer
imputer = KNNImputer(n_neighbors=3)

# Apply the imputer to the specified columns
X_filled = imputer.fit_transform(X[columns_to_impute])

# Update the original DataFrame with the imputed values
X[columns_to_impute] = X_filled

#### drop outlier

In [43]:
#corr toward y 
corr = X.corrwith(y).sort_values(ascending=False)
corr

Brand                        0.62
DISPLAY_Type                 0.35
MEMORY_Internal_rom          0.35
MAIN_CAM_1_Video             0.33
DISPLAY_Resolution_Width     0.31
NETWORK_Speed                0.29
NETWORK_Technology           0.29
FEATURES_Sensors             0.29
MEMORY_Card_slot             0.29
DISPLAY_Resolution_Height    0.28
PLATFORM_OS                  0.27
DISPLAY_Size                 0.19
MEMORY_Internal_ram          0.19
BATTERY_Capacity             0.18
NETWORK_2G_bands             0.16
BODY_Weight                  0.16
LAUNCH_Announced             0.16
BODY_Width                   0.14
BODY_Length                  0.14
BATTERY_Type                 0.14
MAIN_CAM_1_Module           -0.02
BODY_Thickness              -0.14
dtype: float64

In [44]:
#check skewness
X.skew()
#preprocess for skew

Brand                       15.22
NETWORK_Technology           1.74
NETWORK_2G_bands             0.52
NETWORK_Speed                1.24
LAUNCH_Announced            -0.12
BODY_Weight                  3.78
BODY_Length                  0.90
BODY_Width                   2.64
BODY_Thickness               1.44
FEATURES_Sensors            -0.41
DISPLAY_Type                 1.55
DISPLAY_Size                 0.47
PLATFORM_OS                 26.30
MEMORY_Card_slot             1.82
MEMORY_Internal_rom          2.37
MEMORY_Internal_ram          5.89
BATTERY_Type                 0.30
BATTERY_Capacity             1.44
MAIN_CAM_1_Module           -4.06
MAIN_CAM_1_Video             1.77
DISPLAY_Resolution_Width     0.89
DISPLAY_Resolution_Height    0.27
dtype: float64

In [45]:
#check skewness if high, log transform
for col in X.columns:
    if X[col].skew() > 1:
        X[col] = np.log1p(X[col])

#### test w/ lazy_predict

In [46]:
#standardScaler
from sklearn.preprocessing import StandardScaler

In [47]:
scaler = StandardScaler()
X = scaler.fit_transform(X,y)
y_scaled = scaler.fit_transform(y.values.reshape(-1,1))
X_scaled = pd.DataFrame(X, columns=config['variables'])
X_scaled.head()

Unnamed: 0,Brand,NETWORK_Technology,NETWORK_2G_bands,NETWORK_Speed,LAUNCH_Announced,BODY_Weight,BODY_Length,BODY_Width,BODY_Thickness,FEATURES_Sensors,DISPLAY_Type,DISPLAY_Size,PLATFORM_OS,MEMORY_Card_slot,MEMORY_Internal_rom,MEMORY_Internal_ram,BATTERY_Type,BATTERY_Capacity,MAIN_CAM_1_Module,MAIN_CAM_1_Video,DISPLAY_Resolution_Width,DISPLAY_Resolution_Height
0,-0.07,0.22,0.05,-1.24,0.55,2.66,2.51,3.08,-0.13,-0.02,0.2,2.2,0.41,0.22,0.47,0.61,-0.04,0.9,0.23,0.18,2.03,0.94
1,0.37,0.22,0.05,-0.39,0.14,1.1,1.28,1.26,-0.31,0.01,0.6,0.94,0.41,0.72,0.47,0.31,-0.04,0.51,0.23,0.18,0.04,0.0
2,0.08,-0.03,0.05,0.26,0.14,0.2,0.29,0.28,-0.66,0.29,0.34,0.25,0.14,0.39,0.47,0.48,0.91,0.77,0.23,-0.08,0.92,0.78
3,0.09,0.13,0.22,0.2,0.14,-0.42,0.07,0.13,-0.66,0.01,0.35,0.01,0.31,0.41,0.04,0.01,-0.54,-0.23,0.23,0.18,0.04,0.0
4,-0.12,0.64,0.81,-1.24,0.14,2.11,3.06,2.97,-0.5,-0.02,0.15,2.38,0.09,0.16,0.68,1.21,-0.99,1.33,0.23,0.09,2.96,-0.09


In [48]:
regressors = config['regressors']
#removed regressors are those not in regressors 
removed_regressors = [est[0] for est in all_estimators() if (est[0] not in regressors)]

regressor_list = [
    est
    for est in all_estimators()
    if (issubclass(est[1], RegressorMixin) and (est[0] not in removed_regressors))
]
def lazy_eval(X_train, y_train, X_test, y_test):
    reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None,predictions=False, regressors = regressor_list)
    models, predictions = reg.fit(X_train, X_test, y_train, y_test)
    return models, predictions
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.3, random_state=0)

In [49]:
models, predictions = lazy_eval(X_train, y_train, X_test, y_test)
models

'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 9/9 [00:31<00:00,  3.46s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.53,0.54,0.63,5.91
RandomForestRegressor,0.53,0.53,0.63,17.59
GradientBoostingRegressor,0.47,0.48,0.67,4.31
KNeighborsRegressor,0.46,0.46,0.68,0.4
SVR,0.35,0.36,0.74,1.7
DecisionTreeRegressor,0.33,0.34,0.75,0.26
LinearRegression,0.24,0.24,0.81,0.03
SGDRegressor,0.23,0.23,0.81,0.02
AdaBoostRegressor,-0.37,-0.36,1.08,0.89


In [50]:
from sklearn.preprocessing import PolynomialFeatures
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

#### tuning

#### ExtraTree 

In [51]:
# Define the pipeline
pipe = Pipeline([
    ('transform', PolynomialFeatures()),
    ('reg', ExtraTreesRegressor())
])

# Define the function to optimize
def evaluate(degree, n_estimators):
    pipe.set_params(transform__degree=int(degree), reg__n_estimators=int(n_estimators))
    return np.mean(cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2'))

# Define the bounds of the parameters
param_bounds = {
    'degree': (1, 2),
    'n_estimators': (50, 200),
}

# Create the Bayesian Optimization object
optimizer = BayesianOptimization(
    f=evaluate,
    pbounds=param_bounds,
    random_state=1,
)

# Perform the optimization
optimizer.maximize(init_points=10, n_iter=10)

# Get the best parameters
best_params = optimizer.max['params']
best_params['degree'] = int(best_params['degree'])
best_params['n_estimators'] = int(best_params['n_estimators'])
print(best_params)

|   iter    |  target   |  degree   | n_esti... |
-------------------------------------------------


KeyboardInterrupt: 

In [52]:
# Define the pipeline
pipe = Pipeline([
    ('transform', PolynomialFeatures(1)),
    ('reg', ExtraTreesRegressor(105))
])

# Fit the model
pipe.fit(X_train, y_train)

# Evaluate on the test set
r2_linear = r2_score(y_test, pipe.predict(X_test))
mse_linear = mean_squared_error(y_test, pipe.predict(X_test))
print(f"R^2: {r2_linear:.4f}")
print(f"MSE: {mse_linear:.4f}")

R^2: 0.5463
MSE: 0.3899


# linear regression model

In [53]:
from sklearn.linear_model import LinearRegression

# Define the pipeline
pipe = Pipeline([
    ('transform', PolynomialFeatures()),
    ('reg', LinearRegression())
])

# Define the function to optimize
def evaluate(degree, fit_intercept):
    pipe.set_params(transform__degree=int(degree), reg__fit_intercept=bool(fit_intercept))
    return np.mean(cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2'))

# Define the bounds of the parameters
param_bounds = {
    'degree': (1, 3),
    'fit_intercept': (0, 1),  # 0 for False, 1 for True
}

# Create the Bayesian Optimization object
optimizer = BayesianOptimization(
    f=evaluate,
    pbounds=param_bounds,
    random_state=1,
)

# Perform the optimization
optimizer.maximize(init_points=10, n_iter=8)

# Get the best parameters
best_params = optimizer.max['params']
best_params['degree'] = int(best_params['degree'])
best_params['fit_intercept'] = bool(int(best_params['fit_intercept']))

|   iter    |  target   |  degree   | fit_in... |
-------------------------------------------------
| [0m1        [0m | [0m0.2534   [0m | [0m1.834    [0m | [0m0.7203   [0m |
| [0m2        [0m | [0m0.2534   [0m | [0m1.0      [0m | [0m0.3023   [0m |
| [0m3        [0m | [0m0.2534   [0m | [0m1.294    [0m | [0m0.09234  [0m |
| [0m4        [0m | [0m0.2534   [0m | [0m1.373    [0m | [0m0.3456   [0m |
| [0m5        [0m | [0m0.2534   [0m | [0m1.794    [0m | [0m0.5388   [0m |
| [0m6        [0m | [0m0.2534   [0m | [0m1.838    [0m | [0m0.6852   [0m |
| [0m7        [0m | [0m0.2534   [0m | [0m1.409    [0m | [0m0.8781   [0m |
| [0m8        [0m | [0m0.2534   [0m | [0m1.055    [0m | [0m0.6705   [0m |
| [0m9        [0m | [0m0.2534   [0m | [0m1.835    [0m | [0m0.5587   [0m |
| [0m10       [0m | [0m0.2534   [0m | [0m1.281    [0m | [0m0.1981   [0m |
| [0m11       [0m | [0m-0.07236 [0m | [0m2.994    [0m | [0m0.9939   [0m 

In [54]:
pipe = Pipeline([
    ('transform', PolynomialFeatures(degree=best_params['degree'])),
    ('reg', LinearRegression(fit_intercept=best_params['fit_intercept']))
])

# Fit the model
pipe.fit(X_train, y_train)

# Evaluate on the test set
r2_linear = r2_score(y_test, pipe.predict(X_test))
mse_linear = mean_squared_error(y_test, pipe.predict(X_test))
print(f"R^2: {r2_linear:.4f}")
print(f"MSE: {mse_linear:.4f}")

R^2: 0.2423
MSE: 0.6510


#### KNNREGRESSOR

In [55]:
from sklearn.neighbors import KNeighborsRegressor

# Define the pipeline
pipe = Pipeline([
    ('transform', PolynomialFeatures()),
    ('reg', KNeighborsRegressor())
])

# Define the function to optimize
def evaluate(degree, n_neighbors, p):
    pipe.set_params(transform__degree=int(degree), reg__n_neighbors=int(n_neighbors), reg__p=int(p))
    return np.mean(cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2'))

# Define the bounds of the parameters
param_bounds = {
    'degree': (1, 4),
    'n_neighbors': (1, 10),
    'p': (1, 2),
}

# Create the Bayesian Optimization object
optimizer = BayesianOptimization(
    f=evaluate,
    pbounds=param_bounds,
    random_state=1,
)

# Perform the optimization
optimizer.maximize(init_points=10, n_iter=50)

# Get the best parameters
best_params = optimizer.max['params']
best_params['degree'] = int(best_params['degree'])
best_params['n_neighbors'] = int(best_params['n_neighbors'])
best_params['p'] = int(best_params['p'])

|   iter    |  target   |  degree   | n_neig... |     p     |
-------------------------------------------------------------
| [0m1        [0m | [0m0.3941   [0m | [0m1.834    [0m | [0m7.483    [0m | [0m1.0      [0m |
| [95m2        [0m | [95m0.4487   [0m | [95m1.605    [0m | [95m2.321    [0m | [95m1.092    [0m |
| [0m3        [0m | [0m0.4311   [0m | [0m1.373    [0m | [0m4.11     [0m | [0m1.397    [0m |
| [95m4        [0m | [95m0.4615   [0m | [95m2.078    [0m | [95m4.773    [0m | [95m1.685    [0m |
| [0m5        [0m | [0m0.386    [0m | [0m1.409    [0m | [0m8.903    [0m | [0m1.027    [0m |
| [0m6        [0m | [0m0.4615   [0m | [0m2.341    [0m | [0m4.756    [0m | [0m1.559    [0m |
| [0m7        [0m | [0m0.4487   [0m | [0m1.281    [0m | [0m2.783    [0m | [0m1.801    [0m |
| [0m8        [0m | [0m0.4532   [0m | [0m2.937    [0m | [0m3.821    [0m | [0m1.692    [0m |
| [0m9        [0m | [0m0.3928   [0m | [0m2.75

In [63]:
print(best_params)

{'degree': 1, 'n_neighbors': 4, 'p': 2}


In [64]:
pipe = Pipeline([
    ('transform', PolynomialFeatures(degree=best_params['degree'])),
    ('reg', KNeighborsRegressor(n_neighbors=best_params['n_neighbors'], p=best_params['p']))
])

# Fit the model
pipe.fit(X_train, y_train)

# Evaluate on the test set
r2_knn = r2_score(y_test, pipe.predict(X_test))
mse_knn = mean_squared_error(y_test, pipe.predict(X_test))
print(f"R^2: {r2_knn:.4f}")
print(f"MSE: {mse_knn:.4f}")

R^2: 0.4485
MSE: 0.4738
