In [68]:
from copy import deepcopy
import time
import pandas as pd
import numpy as np
import json
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV

In [69]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
raw_train = pd.read_csv("Data/train.csv")
raw_test = pd.read_csv("Data/test.csv")
targets = ["max_price", "delta_abs", "delta_rel", "min_price"]

In [70]:
def shared_reformat(df):
    df = deepcopy(df)
    
    # create special targets (only on train set)
    if "max_price" in list(df):
        df["delta_abs"] = df["max_price"] - df["min_price"] 
        df["delta_rel"] = np.log(df["max_price"] / df["min_price"] )
    
    # deal with missing data
    #df.pixels_y = df.pixels_y.fillna(0)
    df.detachable_keyboard.fillna(0, inplace = True)  # only four cases. Assume none
    df.pixels_x.fillna(1920, inplace = True)   # only 2 cases, replace by most common value
    df.pixels_y.fillna(1080, inplace = True)   # only 2 cases, replace by most common value
    df.loc[df.screen_surface.isna(), "screen_surface"] = 'Glossy'       # 12 cases, large majority is Glossy,
    df.loc[df.screen_surface.isna() & df.pixels_x == 1920, "screen_surface"] = 'Matte'  # except here
    df.sort_values(by=['name'])
    df.cpu_details.fillna(method='bfill', inplace=True)  # take previous value, they are often similar
    df.gpu.fillna(method='bfill', inplace=True)  # take previous value, they are often similar
    df.weight.fillna(method='bfill', inplace=True)
    df.loc[df.os.isna() & df.brand == "Apple", "os"] = 'macOS'  # 1 case
    df.loc[df.os_details.isna() & df.brand == "Apple", "os_details"] = "macOS Mojave"  # 1 case, most common
    df.loc[df.os.isna() & df.brand == "Dell", "os"] = 'Windows'  # 1 case
    df.loc[df.os_details.isna() & df.brand == "Dell", "os_details"] = "Windows 10 Home"  # 1 case, most common
    

    # create new features
    df["cpu_brand"] = df.cpu.str.split(n=1).str[0]
    df["cpu_type"] = df.cpu.str.split(n=1).str[1]
    df["gpu_brand"] = df.gpu.str.split(n=1).str[0]
    df["gpu_series"] = df.gpu.str.split(n=2).str[1]
    df["os_type"] = df.os_details.str.split(n=1).str[1]
    df["os_nr"] = df.os_details.str.split(n=2).str[2]
    df["resolution"] = df.pixels_y / df.screen_size
    df["gimmick"] = df.detachable_keyboard + 5 * df.discrete_gpu + 3* df.touchscreen
    def get_speed(sentence):
        try:
            sentence = sentence.split()
            index = sentence.index('GHz')
            return pow(float(sentence[index-1]),2)
        except:
            return pow(2.4, 2)
    df["speed"] = df.cpu_details.apply(get_speed)
    def has_word(sentence, word):
        try:
            return word in sentence
        except:
            return False
    df["Dual-Core"] = df.cpu_details.apply(has_word, word="Dual-Core")
    df["Quad-Core"] = df.cpu_details.apply(has_word, word="Quad-Core")
    df["Hexa-Core"] = df.cpu_details.apply(has_word, word="Hexa-Core")
    df["Hyper-Threading"] = df.cpu_details.apply(has_word, word="Hyper-Threading")
    
    
    # correct a few mistakes in new features
    map_gpu_series = {"GeFoce":"GeForce", "RadeonÂ": "Radeon"}
    df.gpu_series = df.gpu_series.replace(map_gpu_series) 
    map_screen_surface = {"glossy":"Glossy", "matte":"Matte"}
    df.screen_surface = df.screen_surface.replace(map_screen_surface)  

    
    # make sure all values in categorical variables are strings
    cat_col = df.select_dtypes(include=['object']).columns
    df[cat_col] = df[cat_col].astype(str)
    
    # cleanup test set
    df.pixels_y = df.pixels_y.astype('int64')
    
    # shuffle the database
    df.sample(frac=1)
    return df


In [71]:
df_train = shared_reformat(raw_train)
df_test = shared_reformat(raw_test)

  res_values = method(rvalues)


In [72]:
drop_cols=["id", "name", "base_name", "screen_size", "weight", "detachable_keyboard", "gpu_series",
              "ssd", "pixels_x", "cpu_details", "os_details", "cpu_brand", "cpu", "os"]
df_rf = df_train.drop(columns=drop_cols)
df_rf.head(2)



Unnamed: 0,brand,pixels_y,screen_surface,touchscreen,discrete_gpu,gpu,ram,storage,min_price,max_price,...,gpu_brand,os_type,os_nr,resolution,gimmick,speed,Dual-Core,Quad-Core,Hexa-Core,Hyper-Threading
0,Lenovo,1080,Glossy,1,0,Intel HD,8,1000,899.0,899.0,...,Intel,10,,69.230769,3.0,5.76,True,False,False,True
1,Razer,1080,Matte,0,1,NVIDIA GeForce RTX 2070 Max-Q,16,512,2099.99,2099.99,...,NVIDIA,10 Home,Home,69.230769,5.0,4.84,False,False,True,True


In [73]:
def seppe_error(Y1_true, Y1_pred, Y2_true, Y2_pred):
    return (mean_absolute_error(Y1_true, Y1_pred) + mean_absolute_error(Y2_true, Y2_pred)) / 2

In [74]:
def runRFR(X,y):
    
    rfr = RandomForestRegressor(X,y)
    
    params = dict(
        n_estimators=[100, 300, 500, 1000],
        max_features = [.1, .2, .5, "auto", "log2"],
        min_samples_leaf = [1, 5, 10]
    )
    
    print("Parameters used for tuning:")
    for key in params:
        print("{:20}: {}".format(key, params[key]))
        print("")
  
 

    start_time = time.time()
    print("Start hyper parameters tuning...")
    algorithm = GridSearchCV(rfr, params, scoring=make_scorer(seppe_error, greater_is_better=False), cv=3, n_jobs=-1)
    #algorithm = RandomizedSearchCV(isolation_forest, param_distributions=params, scoring=scorer, cv=5, n_iter=20, n_jobs=-1)
    algorithm.fit(X,y)
    seconds = time.time() - start_time
    minutes, seconds = divmod(seconds, 60)
    print("Hyper parameters tuning done in {:.2f} minutes and {:.2f} secondes".format(minutes, seconds))

    print("{}:\n{}".format("Best parameters found", json.dumps(algorithm.best_params_, indent=2)))
    
    return algorithm.predict(X)

In [75]:
def create_preprocessor(df_pp, categories):
    #global categories
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
        ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        #('ordinal', OrdinalEncoder(categories=categories)),
        #('scaler', StandardScaler())
        ('onehot', OneHotEncoder(categories=categories, handle_unknown="ignore")),   
        ])

    numeric_features = df_pp.select_dtypes(include=['int64', 'float64', 'boolean']).columns
    categorical_features = df_pp.select_dtypes(include=['object']).columns
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    return preprocessor

In [76]:
def preprocessData(df):
    numerics = df.select_dtypes(include=['int32','int64', 'float32', 'float64', 'boolean']).columns
    categoricals = df.select_dtypes(include=['object']).columns
    
    num_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    num_imputer.fit(df[numerics])
    num_scaler = StandardScaler()
    num_scaler.fit(df[numerics])
    
    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(df[categoricals])
    cat_ohe = OneHotEncoder(handle_unknown="ignore")
    cat_ohe.fit(df[categoricals])
    return df


df_pp = preprocessData(df_rf)

In [77]:

X = df_pp.drop(columns=["min_price", "max_price"])
y = df_pp[["min_price", "max_price"]]
y.head(2)
X.head(2)#
y_pred = runRFR(X,y)


Parameters used for tuning:
n_estimators        : [100, 300, 500, 1000]

max_features        : [0.1, 0.2, 0.5, 'auto', 'log2']

min_samples_leaf    : [1, 5, 10]

Start hyper parameters tuning...


ValueError: could not convert string to float: 'Lenovo'

In [None]:
df_pp.head(2)
