## LIBS

In [1]:
import pandas as pd
import os
import numpy as np
import yaml
import re
from sklearn.ensemble import GradientBoostingRegressor
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import lazypredict
from lazypredict.Supervised import LazyRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import all_estimators
from sklearn.base import RegressorMixin
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
random.seed(42) 
config_path = os.path.join(os.getcwd(), 'config\config.yaml')

In [2]:
with open(config_path) as f:
    config = yaml.safe_load(f)

In [3]:
df = pd.read_csv(os.path.join(config['SavePath'], config['files'] + ".csv"))
df.shape

(12513, 52)

## TARGET VARIABLE 

In [4]:
def convert_to_euro(price):
    price = price.replace("\u2009", "").replace("\t", "").replace(",", "")
    if pd.isnull(price):
        return np.nan
    if price.startswith("About"):
        price = float(price.split(" ")[1]) + round(random.uniform(0, 9), 2)
    else:
        euro_tag = [p for p in price.split("/") if "€" in p]
        if len(euro_tag) > 0:
            price = float(euro_tag[0].split("€")[1])
        else:
            currency_tag = []
            for c in config['conversion_rates'].keys():
                if c in price:
                    currency_tag.append(c)
            if currency_tag:
                number = float(re.findall("\d+", price.split(currency_tag[0])[1])[0])
                price = config['conversion_rates'][currency_tag[0]] * number 
    return price

In [5]:
df = df[~df['MISC_Price'].str.contains('BTC', na=False)]
# df = df.dropna(subset=['MISC_Price'])
df = df.reset_index(drop=True)
df.shape

(12512, 52)

In [6]:
df_na = df[df['MISC_Price'].isna()]
df = df.dropna(subset=['MISC_Price'])
df.shape

(8438, 52)

In [7]:
df['MISC_Price_Euro'] = df['MISC_Price'].apply(convert_to_euro)

In [8]:
df[['Name','MISC_Price', 'MISC_Price_Euro']].head(10)

Unnamed: 0,Name,MISC_Price,MISC_Price_Euro
0,Acer Chromebook Tab 10,About 330 EUR,335.75
1,Acer Iconia Talk S,About 170 EUR,170.23
2,Acer Liquid Z6 Plus,About 250 EUR,252.48
3,Acer Liquid Z6,About 120 EUR,122.01
4,Acer Iconia Tab 10 A3-A40,About 230 EUR,236.63
5,Acer Liquid X2,About 230 EUR,236.09
7,Acer Liquid Zest Plus,About 200 EUR,208.03
8,Acer Liquid Zest,About 110 EUR,110.78
9,Acer Predator 8,About 350 EUR,353.8
10,Acer Liquid Jade Primo,About 220 EUR,220.27


## Other variables

In [9]:
df.info(
    null_counts=True
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8438 entries, 0 to 12493
Data columns (total 53 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Brand                  8438 non-null   object 
 1   url                    8438 non-null   object 
 2   imgUrl                 8438 non-null   object 
 3   Name                   8438 non-null   object 
 4   NETWORK_Technology     8438 non-null   object 
 5   NETWORK_2G_bands       8438 non-null   object 
 6   NETWORK_3G_bands       6874 non-null   object 
 7   NETWORK_4G_bands       4609 non-null   object 
 8   NETWORK_5G_bands       1045 non-null   object 
 9   NETWORK_GPRS           1837 non-null   object 
 10  NETWORK_EDGE           1829 non-null   object 
 11  NETWORK_Speed          6843 non-null   object 
 12  LAUNCH_Announced       8436 non-null   object 
 13  LAUNCH_Status          8438 non-null   object 
 14  BODY_Dimensions        8434 non-null   object 
 15  BOD

In [10]:
df.drop(['MISC_Price'], axis=1, inplace=True)

### Brand, url and Name

In [11]:
# #label encoding for Brand, drop url, imgUrl, Name
# from sklearn.preprocessing import LabelEncoder
# enc_brand = LabelEncoder()
# df['Brand'] = enc_brand.fit_transform(df['Brand'])
# df = df.drop(['url', 'imgUrl', 'Name'], axis=1)

#### Người ta bảo brand hay bị giá ảo -> test liền target encoding


In [12]:
from sklearn.preprocessing import TargetEncoder
enc_brand = TargetEncoder()
df['Brand'] = enc_brand.fit_transform(np.array(df['Brand']).reshape(-1, 1), df['MISC_Price_Euro'])
df_na['Brand'] = enc_brand.transform(np.array(df_na['Brand']).reshape(-1, 1))
df = df.drop(['url', 'imgUrl', 'Name'], axis=1)
df_na = df_na.drop(['url', 'imgUrl', 'Name'], axis=1)

### Network_2345G

In [13]:
df[['NETWORK_2G_bands', 'NETWORK_3G_bands', 'NETWORK_4G_bands', 'NETWORK_5G_bands']].head(10)

Unnamed: 0,NETWORK_2G_bands,NETWORK_3G_bands,NETWORK_4G_bands,NETWORK_5G_bands
0,,,,
1,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2,HSDPA 850 / 1900 / 2100,"1, 3, 7, 8, 20",
2,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 (dual-SIM model only),HSDPA,LTE (unspecified),
3,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 (dual-SIM model only),HSDPA,LTE (unspecified),
4,,,,
5,GSM 850 / 900 / 1800 / 1900,"HSDPA 900 / 1900 / 2100 - Europe, Taiwan",LTE 800 / 1800 / 2100 / 2600 - Europe,
7,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 (dual-SIM model only),HSDPA,LTE (unspecified),
8,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 (dual-SIM model only),HSDPA 850 / 900 / 1900 / 2100,LTE (unspecified),
9,,,,
10,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2,HSDPA 850 / 900 / 1900 / 2100 - Europe/ Taiwan,LTE 800 / 1800 / 2100 / 2600 - Europe,


In [14]:
print("2G number of unique values: ", df['NETWORK_2G_bands'].nunique())
print("3G number of unique values: ", df['NETWORK_3G_bands'].nunique())
print("4G number of unique values: ", df['NETWORK_4G_bands'].nunique())
print("5G number of unique values: ", df['NETWORK_5G_bands'].nunique())

2G number of unique values:  199
3G number of unique values:  588
4G number of unique values:  1458
5G number of unique values:  402


#### Target Encoding 

In [15]:
from category_encoders import TargetEncoder
enc_network = TargetEncoder()
df[['NETWORK_2G_bands', 'NETWORK_3G_bands', 'NETWORK_4G_bands', 'NETWORK_5G_bands']] = enc_network.fit_transform(df[['NETWORK_2G_bands', 'NETWORK_3G_bands', 'NETWORK_4G_bands', 'NETWORK_5G_bands']], df['MISC_Price_Euro'])
df_na[['NETWORK_2G_bands', 'NETWORK_3G_bands', 'NETWORK_4G_bands', 'NETWORK_5G_bands']] = enc_network.transform(df_na[['NETWORK_2G_bands', 'NETWORK_3G_bands', 'NETWORK_4G_bands', 'NETWORK_5G_bands']]) 

In [16]:
df[['NETWORK_2G_bands', 'NETWORK_3G_bands', 'NETWORK_4G_bands', 'NETWORK_5G_bands']].head(10)

Unnamed: 0,NETWORK_2G_bands,NETWORK_3G_bands,NETWORK_4G_bands,NETWORK_5G_bands
0,302.11,140.86,157.74,307.45
1,473.71,203.31,172.1,307.45
2,264.91,483.19,517.84,307.45
3,264.91,483.19,517.84,307.45
4,302.11,140.86,157.74,307.45
5,236.17,312.89,310.73,307.45
7,264.91,483.19,517.84,307.45
8,264.91,375.86,517.84,307.45
9,302.11,140.86,157.74,307.45
10,473.71,310.83,310.73,307.45


### Network GPRS EDGE SPEED

In [17]:
df[['NETWORK_GPRS','NETWORK_EDGE']].head(10)

Unnamed: 0,NETWORK_GPRS,NETWORK_EDGE
0,No,No
1,,
2,Yes,Yes
3,Yes,Yes
4,No,No
5,,
7,Yes,Yes
8,,
9,No,No
10,,


In [18]:
df[['NETWORK_GPRS','NETWORK_EDGE']].nunique()

NETWORK_GPRS    21
NETWORK_EDGE    22
dtype: int64

In [19]:
#using same label encoding for GPRS and EDGE
from sklearn.preprocessing import LabelEncoder
enc_gprs = LabelEncoder()
df['NETWORK_GPRS'] = enc_gprs.fit_transform(df['NETWORK_GPRS'])
df['NETWORK_EDGE'] = enc_gprs.fit_transform(df['NETWORK_EDGE'])
# df_na['NETWORK_GPRS'] = enc_gprs.transform(df_na['NETWORK_GPRS'])
# df_na['NETWORK_EDGE'] = enc_gprs.transform(df_na['NETWORK_EDGE'])

In [20]:
df[['NETWORK_Speed', 'NETWORK_Technology']].head(10)

Unnamed: 0,NETWORK_Speed,NETWORK_Technology
0,,No cellular connectivity
1,"HSPA 42.2/11.5 Mbps, LTE Cat4 150/50 Mbps",GSM / HSPA / LTE
2,"HSPA 42.2/5.76 Mbps, LTE Cat4 150/50 Mbps",GSM / HSPA / LTE
3,"HSPA, LTE",GSM / HSPA / LTE
4,,No cellular connectivity
5,"HSPA 42.2/5.76 Mbps, LTE Cat4 150/50 Mbps",GSM / HSPA / LTE
7,"HSPA, LTE",GSM / HSPA / LTE
8,"HSPA, LTE Cat4 150/50 Mbps",GSM / HSPA / LTE
9,,No cellular connectivity
10,"HSPA 42.2/5.76 Mbps, LTE-A (2CA) Cat6 300/50 Mbps",GSM / HSPA / LTE


In [21]:
df['NETWORK_Technology'].unique()

array(['No cellular connectivity', 'GSM / HSPA / LTE', 'GSM / HSPA',
       'GSM', 'GSM / UMTS', 'GSM / CDMA / HSPA / LTE', 'LTE',
       'CDMA / EVDO', 'GSM / HSPA / EVDO / LTE',
       'GSM / CDMA / HSPA / EVDO / LTE / 5G', 'GSM / HSPA / LTE / 5G',
       'GSM / CDMA / HSPA / EVDO / LTE', 'GSM / CDMA / HSPA / EVDO',
       'GSM / CDMA / HSPA / LTE / 5G', 'GSM / LTE', 'HSPA / LTE',
       'GSM / CDMA / EVDO', 'GSM / CDMA / HSPA / CDMA2000 / LTE / 5G',
       'GSM / CDMA / HSPA / CDMA2000 / LTE', 'GSM / CDMA',
       'GSM / HSPA / EVDO / LTE / 5G', 'LTE / 5G',
       'GSM / CDMA / EVDO / LTE', 'GSM / CDMA / HSPA',
       'CDMA / EVDO / LTE', 'CDMA / HSPA / EVDO / LTE',
       'GSM / UMTS / HSPA', 'GSM / HSPA / EVDO', 'CDMA / LTE',
       'CDMA / CDMA2000', 'CDMA', 'HSPA / EVDO', 'CDMA / HSPA',
       'CDMA / HSPA / EVDO', 'GSM / CDMA2000', 'GSM / UMTS / LTE',
       'CDMA / HSPA / LTE', 'GSM / UMTS / HSPA / LTE', 'HSPA',
       'GSM / CDMA / UMTS / EVDO', 'GSM / HSPA / CDMA2000',
     

In [22]:
df['NETWORK_Speed'].unique()

array([nan, 'HSPA 42.2/11.5 Mbps, LTE Cat4 150/50 Mbps',
       'HSPA 42.2/5.76 Mbps, LTE Cat4 150/50 Mbps', 'HSPA, LTE',
       'HSPA, LTE Cat4 150/50 Mbps',
       'HSPA 42.2/5.76 Mbps, LTE-A (2CA) Cat6 300/50 Mbps',
       'HSPA 42.2/5.76 Mbps', 'HSPA 21.1/5.76 Mbps, LTE Cat4 150/50 Mbps',
       'HSPA', 'HSPA 21.1/5.76 Mbps',
       'HSPA 42.2/5.76 Mbps, LTE-A Cat4 150/50 Mbps',
       'HSPA 7.2/5.76 Mbps', 'HSPA 7.2/0.384 Mbps', 'HSPA 14.4/5.76 Mbps',
       'HSPA 7.2/2 Mbps', 'TD-SCDMA', 'HSPA 14.4/2 Mbps',
       'HSPA 3.6/0.384 Mbps', 'HSPA 42.2/11.1 Mbps, LTE Cat4 150/50 Mbps',
       'LTE', 'No', 'HSPA 42.2/11.5 Mbps, LTE-A (2CA) Cat6 300/50 Mbps',
       'HSPA 42.2/5.76 Mbps, LTE Cat4 150/50 Mbps or LTE-A (2CA) Cat6 300/50 Mbps',
       'HSPA 21.1/5.76 Mbps, LTE', 'HSPA 42.2/11.5 Mbps',
       'HSPA 42.2/5.76 Mbps, LTE Cat3 100/50 Mbps',
       'HSPA 21.1/5.76 Mbps, LTE Cat3 100/50 Mbps',
       'EV-DO Rev.A 3.1 Mbps',
       'HSPA 42.2/11.5 Mbps, LTE Cat7 300/100 Mbps',
   

2 thằng sussy này có vẻ ảnh hưởng tới giá nhìu đó mấy ní => target encoding thui

In [23]:
#target encoding for NETWORK_Technology and NETWORK_Speed
enc_network_tech = TargetEncoder()
enc_network_speed = TargetEncoder()
df[['NETWORK_Technology']] = enc_network_tech.fit_transform(df[['NETWORK_Technology']], df['MISC_Price_Euro'])
df[['NETWORK_Speed']] = enc_network_speed.fit_transform(df[['NETWORK_Speed']], df['MISC_Price_Euro'])
df_na[['NETWORK_Technology']] = enc_network_tech.transform(df_na[['NETWORK_Technology']])
df_na[['NETWORK_Speed']] = enc_network_speed.transform(df_na[['NETWORK_Speed']])

### Launch annouced với status ko liên quan lắm (có liên quan thì có lẽ là kiểu dựa vào đó rồi train riêng theo từng thời kì, nhưng mà mình chưa biết làm thế nào để xử lý nên thôi)

### Body dim, weight, build

In [24]:
df[['BODY_Dimensions','BODY_Weight', 'BODY_Build',]].head(10)

Unnamed: 0,BODY_Dimensions,BODY_Weight,BODY_Build
0,238.3 x 172.2 x 9.9 mm (9.38 x 6.78 x 0.39 in),544.3 g (1.20 lb),
1,191.7 x 101 x 9.4 mm (7.55 x 3.98 x 0.37 in),260 g (9.17 oz),
2,153.8 x 75.6 x 8.5 mm (6.06 x 2.98 x 0.33 in),169 g (5.96 oz),
3,145.5 x 72.5 x 8.5 mm (5.73 x 2.85 x 0.33 in),126 g (4.44 oz),
4,259 x 167 x 8.9 mm (10.20 x 6.57 x 0.35 in),-,
5,153.3 x 78.8 x 8.5 mm (6.04 x 3.10 x 0.33 in),166 g (5.86 oz),
7,154 x 77 x 10 mm (6.06 x 3.03 x 0.39 in),-,
8,145.7 x 71.2 x 8.4 mm (5.74 x 2.80 x 0.33 in),125 g (4.41 oz),
9,217.9 x 127 x 8.6 mm (8.58 x 5.0 x 0.34 in),353.8 g (12.49 oz),
10,156.5 x 75.9 x 8.4 mm (6.16 x 2.99 x 0.33 in),150 g (5.29 oz),


In [25]:
df['BODY_Build'].unique()

array([nan, 'Glass front, plastic back, plastic frame',
       'Glass front, plastic frame, plastic back',
       'Glass front (DragonTrail Pro glass), plastic back, plastic frame',
       'Glass front (Asahi Dragontrail), glass back (Asahi Dragontrail), plastic frame',
       'Glass front (Gorilla Glass 4), glass back (Gorilla Glass 4), aluminum frame',
       'Glass front (Corning-made glass), glass back (Corning-made glass), titanium frame (grade 5)',
       'Glass front (Corning-made glass), glass back (Corning-made glass), aluminum frame',
       'Sapphire crystal front, ceramic/sapphire crystal back, titanium frame',
       'Glass front, ceramic/sapphire crystal back, stainless steel frame',
       'Glass front, ceramic/sapphire crystal back, aluminum frame',
       'Glass front, aluminum back, aluminum frame',
       'Glass front (Corning-made glass), glass back (Corning-made glass), stainless steel frame',
       'Glass front, plastic/sapphire crystal back, aluminum frame',
   

In [26]:
# #return indexes of rows with foldable body
# contains_folded = df['BODY_Build'].str.contains("Unfolded")
# #IF value is True, return index of row
# folded_indices = df[contains_folded].index.tolist()
# #calculate mean of BODY_Dimensions and BODY_Weight for foldable phones 
# mean_body_dimensions = df.iloc[folded_indices]['BODY_Dimensions'].mean()
# #replace foldable phones BODY_Dimensions with mean
# df.loc[folded_indices, 'BODY_Dimensions'] = mean_body_dimensions



In [27]:
def body_dim_volume_calc(dimensions):
    # Extract the numerical values from the string
    if pd.isnull(dimensions) or dimensions == '-' or ("thickness" in dimensions):
        return np.nan
    dims = re.findall(r'(\d+\.?\d*)', dimensions)
    # Convert the strings to floats and calculate the volume (2 decimal places)
    volume = float(dims[0]) * float(dims[1]) * float(dims[2])
    volume = round(volume, 2)
    return volume

dim_test = "238.3 x 172.2 x 9.9 mm (9.38 x 6.78 x 0.39 in)"
print(body_dim_volume_calc(dim_test))

406249.07


In [28]:
df['BODY_Dimensions']

0                                            238.3 x 172.2 x 9.9 mm (9.38 x 6.78 x 0.39 in)
1                                              191.7 x 101 x 9.4 mm (7.55 x 3.98 x 0.37 in)
2                                             153.8 x 75.6 x 8.5 mm (6.06 x 2.98 x 0.33 in)
3                                             145.5 x 72.5 x 8.5 mm (5.73 x 2.85 x 0.33 in)
4                                               259 x 167 x 8.9 mm (10.20 x 6.57 x 0.35 in)
5                                             153.3 x 78.8 x 8.5 mm (6.04 x 3.10 x 0.33 in)
7                                                  154 x 77 x 10 mm (6.06 x 3.03 x 0.39 in)
8                                             145.7 x 71.2 x 8.4 mm (5.74 x 2.80 x 0.33 in)
9                                               217.9 x 127 x 8.6 mm (8.58 x 5.0 x 0.34 in)
10                                            156.5 x 75.9 x 8.4 mm (6.16 x 2.99 x 0.33 in)
11                                              136 x 66.5 x 9.6 mm (5.35 x 2.62

## testing w/ models

In [61]:
#print corr of each feature with price
corr = df.corr()
corr.style.background_gradient(cmap='bwr')

Unnamed: 0,Brand,NETWORK_Technology,NETWORK_2G_bands,NETWORK_3G_bands,NETWORK_4G_bands,NETWORK_5G_bands,NETWORK_Speed,MAIN_CAM_1_Module,SELFIE_CAM_2_Module,MISC_Price_Euro
Brand,1.0,0.11664,0.140184,0.154666,0.331785,0.022365,0.110784,0.009265,0.114159,0.287098
NETWORK_Technology,0.11664,1.0,0.522084,0.528282,0.344667,0.204659,0.737117,0.283774,0.664266,0.175992
NETWORK_2G_bands,0.140184,0.522084,1.0,0.448493,0.211796,0.156475,0.396507,0.375425,0.569737,0.154556
NETWORK_3G_bands,0.154666,0.528282,0.448493,1.0,0.264452,0.186111,0.411265,0.313561,0.571125,0.156953
NETWORK_4G_bands,0.331785,0.344667,0.211796,0.264452,1.0,0.059022,0.325057,0.089607,0.215549,0.531329
NETWORK_5G_bands,0.022365,0.204659,0.156475,0.186111,0.059022,1.0,0.120675,0.065841,0.146734,0.08344
NETWORK_Speed,0.110784,0.737117,0.396507,0.411265,0.325057,0.120675,1.0,0.216874,0.49279,0.220872
MAIN_CAM_1_Module,0.009265,0.283774,0.375425,0.313561,0.089607,0.065841,0.216874,1.0,0.444467,0.040833
SELFIE_CAM_2_Module,0.114159,0.664266,0.569737,0.571125,0.215549,0.146734,0.49279,0.444467,1.0,0.117045
MISC_Price_Euro,0.287098,0.175992,0.154556,0.156953,0.531329,0.08344,0.220872,0.040833,0.117045,1.0


NETWORK_GPRS + NETWORK_EDGE khá cùi pắp

In [30]:
regressors = config['regressors']
#removed regressors are those not in regressors 
removed_regressors = [est[0] for est in all_estimators() if (est[0] not in regressors)]

regressor_list = [
    est
    for est in all_estimators()
    if (issubclass(est[1], RegressorMixin) and (est[0] not in removed_regressors))
]

In [31]:
def lazy_eval(df, variables , target, test_size, seed):
    X = df[variables]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
    reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None, regressors = regressor_list)
    models, predictions = reg.fit(X_train, X_test, y_train, y_test)
    return models, predictions

In [32]:
models, predictions = lazy_eval(df, config['variables'], 'MISC_Price_Euro', config['test_size'], 0)

'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 10/10 [00:05<00:00,  1.88it/s]


In [33]:
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GradientBoostingRegressor,0.5,0.5,733.26,0.56
RandomForestRegressor,0.43,0.44,783.0,2.93
KNeighborsRegressor,0.43,0.43,784.76,0.06
ExtraTreesRegressor,0.41,0.41,800.22,1.62
SGDRegressor,0.28,0.28,883.85,0.02
Lasso,0.27,0.28,886.34,0.01
Ridge,0.27,0.28,886.42,0.01
LinearRegression,0.27,0.28,886.43,0.01
ElasticNet,0.26,0.26,894.68,0.04
DecisionTreeRegressor,-0.01,-0.01,1045.65,0.05


In [34]:
predictions

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GradientBoostingRegressor,0.5,0.5,733.26,0.56
RandomForestRegressor,0.43,0.44,783.0,2.93
KNeighborsRegressor,0.43,0.43,784.76,0.06
ExtraTreesRegressor,0.41,0.41,800.22,1.62
SGDRegressor,0.28,0.28,883.85,0.02
Lasso,0.27,0.28,886.34,0.01
Ridge,0.27,0.28,886.42,0.01
LinearRegression,0.27,0.28,886.43,0.01
ElasticNet,0.26,0.26,894.68,0.04
DecisionTreeRegressor,-0.01,-0.01,1045.65,0.05


In [35]:
def target_fill_with_retrain(df, model, variables, target):
    X = df[variables]
    y = df[target]
    model.fit(X, y)
    X_na = df_na[variables]
    y_na = model.predict(X_na)
    df_na[target] = y_na
    df = df.append(df_na)
    return df

model = GradientBoostingRegressor()
df = target_fill_with_retrain(df, model, config['fillna_retrain_variables'], config['target'])
df.shape

(12512, 50)

#### re-train and testing with lazy predict

In [36]:
models_retrain, predictions_retrain = lazy_eval(df, config['variables'], 'MISC_Price_Euro', config['test_size'], 42)
models_retrain

'tuple' object has no attribute '__name__'
Invalid Regressor(s)


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:06<00:00,  1.51it/s]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNeighborsRegressor,0.5,0.5,698.48,0.1
GradientBoostingRegressor,0.49,0.49,702.04,0.71
RandomForestRegressor,0.46,0.46,723.55,3.72
ExtraTreesRegressor,0.46,0.46,727.45,1.94
Ridge,0.31,0.31,822.44,0.01
LinearRegression,0.31,0.31,822.44,0.01
Lasso,0.31,0.31,822.5,0.01
SGDRegressor,0.29,0.29,829.82,0.02
ElasticNet,0.28,0.28,839.53,0.01
DecisionTreeRegressor,0.19,0.19,888.16,0.06


In [37]:
predictions_retrain

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNeighborsRegressor,0.5,0.5,698.48,0.1
GradientBoostingRegressor,0.49,0.49,702.04,0.71
RandomForestRegressor,0.46,0.46,723.55,3.72
ExtraTreesRegressor,0.46,0.46,727.45,1.94
Ridge,0.31,0.31,822.44,0.01
LinearRegression,0.31,0.31,822.44,0.01
Lasso,0.31,0.31,822.5,0.01
SGDRegressor,0.29,0.29,829.82,0.02
ElasticNet,0.28,0.28,839.53,0.01
DecisionTreeRegressor,0.19,0.19,888.16,0.06


In [38]:
df.to_csv(os.path.join(config['SavePath'], config['files'] + "_preprocessed.csv"), index=False)