In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error

In [None]:
# Load the raw fridge price data. The relative path uses a forward slash so it
# resolves on Windows, macOS and Linux alike; a back-slash separator would
# only be understood on Windows.
df = pd.read_csv('assets/fridge_price.csv')
df

In [None]:
# Summarise the raw frame: column dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics of the raw frame.
df.describe()

In [None]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

In [None]:
# Working frame: the raw data without its last three columns and without any
# row whose 'Price' (the target) is missing. `drop` returns a new frame, so
# `df` itself is left untouched.
df_copy = (
    df
    .drop(columns=df.columns[-3:])
    .dropna(subset='Price')
)
df_copy.info()

In [None]:
# Distinct-value count per column of df_copy, highest cardinality first.
df_copy.nunique().sort_values(ascending=False)

In [None]:
# Missing values still present in each column of df_copy.
df_copy.isna().sum()

In [None]:
# (rows, columns) of df_copy.
df_copy.shape

In [None]:
# Categorical features: every object-dtype column other than the target 'Price'.
cat_cols = [
    name
    for name, dtype in df_copy.dtypes.items()
    if name != 'Price' and pd.api.types.is_object_dtype(dtype)
]
cat_cols

In [None]:
# Numeric features: whatever remains once the categorical columns and the
# target 'Price' are set aside (original column order is kept).
num_cols = [
    name
    for name in df_copy.columns
    if not (name == 'Price' or name in cat_cols)
]
num_cols

In [12]:
# Separate the target from the features, then hold out 20 % of the rows as a
# test split (shuffled, fixed seed for repeatability).
y = df_copy['Price']
X = df_copy.drop(columns='Price')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Step 1 - categorical imputation: gaps in the categorical columns are filled
# with the literal label 'missing'; all other columns pass through unchanged.
# The transformer is named '' so imputed columns come out as '__<col>' and
# passed-through columns as 'remainder__<col>'.
imputer_cat = SimpleImputer(strategy='constant', fill_value='missing')
cat_impute = ColumnTransformer(
    transformers=[('', imputer_cat, cat_cols)],
    remainder='passthrough',
)

# Fit on the training split only; the test split is transformed with the
# already-fitted imputer.
X_train_1 = pd.DataFrame(cat_impute.fit_transform(X_train))
X_train_1.columns = cat_impute.get_feature_names_out()

X_test_1 = pd.DataFrame(cat_impute.transform(X_test))
X_test_1.columns = cat_impute.get_feature_names_out()

X_train_1.isna().sum()

In [None]:
# Step 2 - numeric imputation: gaps in the numeric columns are filled with the
# training-split mean. The numeric columns are addressed by the
# 'remainder__<col>' names they received when they were passed through the
# categorical imputation step.
imputer_num = SimpleImputer(strategy='mean')
num_impute = ColumnTransformer(
    transformers=[
        ('', imputer_num, ['remainder__' + col for col in num_cols]),
    ],
    remainder='passthrough',
)

X_train_2 = pd.DataFrame(num_impute.fit_transform(X_train_1))
X_train_2.columns = num_impute.get_feature_names_out()

X_test_2 = pd.DataFrame(num_impute.transform(X_test_1))
X_test_2.columns = num_impute.get_feature_names_out()

X_train_2.isna().sum()

In [None]:
# Step 3 - scaling: standardise the numeric columns with statistics learned
# from the training split. After the previous step those columns are named
# '__remainder__<col>'; everything else passes through unchanged.
scaler = StandardScaler()
num_scale = ColumnTransformer(
    transformers=[
        ('', scaler, ['__remainder__' + col for col in num_cols]),
    ],
    remainder='passthrough',
)

X_train_3 = pd.DataFrame(num_scale.fit_transform(X_train_2))
X_train_3.columns = num_scale.get_feature_names_out()

X_test_3 = pd.DataFrame(num_scale.transform(X_test_2))
X_test_3.columns = num_scale.get_feature_names_out()

X_train_3.isna().sum()

In [16]:
# Integer-code the high-cardinality 'Model' column.
# The category -> integer mapping is learned from the TRAINING split only and
# then re-used for the test split. Coding each split independently would give
# the same model string different integers in train and test, so the fitted
# estimator would see meaningless values at prediction time.
# Codes start at 1; a test value that never occurred in training gets code 0
# (pandas reports it as -1, which the +1 shift turns into 0).
model_col = 'remainder__remainder____Model'

train_models = X_train_3[model_col].astype('category')
model_categories = train_models.cat.categories

X_train_3[model_col] = train_models.cat.codes + 1
X_test_3[model_col] = pd.Categorical(X_test_3[model_col], categories=model_categories).codes + 1

In [17]:
# Ordinal-encode 'Energy Rating'.
# The category list is taken from the training split in order of first
# appearance. NOTE(review): that order is arbitrary, not an efficiency
# ranking - supply an explicit ordered list if rank should carry meaning.
rating_col = 'remainder__remainder____Energy Rating'

ordinal = OrdinalEncoder(
    categories=[X_train_3[rating_col].unique().tolist()],
    # A rating that only occurs in the test split is encoded as -1 instead of
    # aborting the notebook with an "unknown category" error.
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# Fit on the training split only; the test split must be transformed with the
# already-fitted encoder, never re-fitted.
X_train_3[rating_col] = ordinal.fit_transform(X_train_3[[rating_col]])
X_test_3[rating_col] = ordinal.transform(X_test_3[[rating_col]])

In [None]:
# Inspect the (prefixed) column names of the test frame before one-hot encoding.
X_test_3.columns

In [None]:
# Step 4 - one-hot encode the remaining categorical columns. 'Model' and
# 'Energy Rating' are excluded because they were integer-encoded already.
# drop='first' removes one dummy per feature; sparse_threshold=0 forces a
# dense result so it can be wrapped in a DataFrame.
encoder = OneHotEncoder(drop='first')
cat_encode = ColumnTransformer(
    transformers=[
        (
            'cat_encode',
            encoder,
            [
                'remainder__remainder____' + col
                for col in cat_cols
                if col not in ('Model', 'Energy Rating')
            ],
        ),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

X_train_4 = pd.DataFrame(cat_encode.fit_transform(X_train_3))
X_train_4.columns = cat_encode.get_feature_names_out()

X_test_4 = pd.DataFrame(cat_encode.transform(X_test_3))
X_test_4.columns = cat_encode.get_feature_names_out()

X_test_4.columns

In [20]:
# Cast every column of both design matrices to float.
X_train_4, X_test_4 = (frame.astype(float) for frame in (X_train_4, X_test_4))

In [21]:
def clean_name(df):
    """Return the column names of ``df`` with transformer prefixes removed.

    Each name is split on the double-underscore separator (``'__'``) and only
    the last piece is kept, e.g. ``'remainder__remainder____Model'`` becomes
    ``'Model'``. A name without ``'__'`` is returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    list of str
        The shortened names, in the same order as ``df.columns``. ``df``
        itself is not modified.
    """
    return [col.split('__')[-1] for col in df.columns]

In [None]:
# Replace the stacked transformer prefixes with the bare feature names on
# both design matrices.
for frame in (X_train_4, X_test_4):
    frame.columns = clean_name(frame)

X_train_4

In [None]:
# Gradient-boosted trees: many shallow-learning-rate rounds with row and
# column subsampling.
model = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.002,
    max_depth=80,
    subsample=0.5,
    colsample_bytree=0.8,
    random_state=42,
)

# Both splits are passed as evaluation sets so the per-round metric is logged
# for each of them; no early stopping is configured here.
eval_set = [(X_train_4, y_train), (X_test_4, y_test)]
model.fit(
    X_train_4,
    y_train,
    eval_set=eval_set,
    verbose=1,
)

In [24]:
# Predictions of the boosted model: y_pred for the test split, t_pred for the
# training split.
y_pred, t_pred = (model.predict(split) for split in (X_test_4, X_train_4))

In [None]:
# R^2 of the boosted model as a (test, train) pair.
r2_score(y_test, y_pred), r2_score(y_train, t_pred)

In [None]:
# Mean absolute error of the boosted model as a (test, train) pair.
mean_absolute_error(y_test, y_pred), mean_absolute_error(y_train, t_pred)

In [None]:
# NumPy view of the training design matrix, used as input for the Keras model.
X_train_numpy = X_train_4.to_numpy()
X_train_numpy

In [None]:
# NumPy view of the test design matrix, used as input for the Keras model.
X_test_numpy = X_test_4.to_numpy()
X_test_numpy

In [None]:
import tensorflow.keras as tfk

# Fix the random seed so weight initialisation and shuffling are repeatable.
tfk.utils.set_random_seed(42)

# Small fully-connected regressor: one hidden ReLU layer, one linear output.
# The input width is read from the training matrix instead of being
# hard-coded, so the network keeps working when the number of one-hot
# columns changes (e.g. after a different split or new categories).
model_2 = tfk.Sequential([
    tfk.layers.Dense(128, input_shape=(X_train_numpy.shape[1],), activation='relu'),
    tfk.layers.Dense(1, activation='linear')
])

# Trained directly on mean absolute error, the same metric reported for the
# boosted model.
model_2.compile(optimizer=tfk.optimizers.Adam(learning_rate=0.1), loss='mean_absolute_error')

In [None]:
# Train the network for 200 epochs; the test split is supplied as validation
# data so its loss is reported after every epoch. `r` holds the returned
# training history object.
r = model_2.fit(
    x=X_train_numpy,
    y=y_train,
    epochs=200,
    validation_data=(X_test_numpy, y_test),
)

In [None]:
# R^2 of the neural network on the test split.
r2_score(
    y_true=y_test,
    y_pred=model_2.predict(X_test_numpy),
)