In [68]:
from copy import deepcopy
import time
import pandas as pd
import numpy as np
import json
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV

In [69]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
raw_train = pd.read_csv("Data/train.csv")
raw_test = pd.read_csv("Data/test.csv")
targets = ["max_price", "delta_abs", "delta_rel", "min_price"]

In [70]:
def shared_reformat(df):
    df = deepcopy(df)
    
    # create special targets (only on train set)
    if "max_price" in list(df):
        df["delta_abs"] = df["max_price"] - df["min_price"] 
        df["delta_rel"] = np.log(df["max_price"] / df["min_price"] )
    
    # deal with missing data
    #df.pixels_y = df.pixels_y.fillna(0)
    df.detachable_keyboard.fillna(0, inplace = True)  # only four cases. Assume none
    df.pixels_x.fillna(1920, inplace = True)   # only 2 cases, replace by most common value
    df.pixels_y.fillna(1080, inplace = True)   # only 2 cases, replace by most common value
    df.loc[df.screen_surface.isna(), "screen_surface"] = 'Glossy'       # 12 cases, large majority is Glossy,
    df.loc[df.screen_surface.isna() & df.pixels_x == 1920, "screen_surface"] = 'Matte'  # except here
    df.sort_values(by=['name'])
    df.cpu_details.fillna(method='bfill', inplace=True)  # take previous value, they are often similar
    df.gpu.fillna(method='bfill', inplace=True)  # take previous value, they are often similar
    df.weight.fillna(method='bfill', inplace=True)
    df.loc[df.os.isna() & df.brand == "Apple", "os"] = 'macOS'  # 1 case
    df.loc[df.os_details.isna() & df.brand == "Apple", "os_details"] = "macOS Mojave"  # 1 case, most common
    df.loc[df.os.isna() & df.brand == "Dell", "os"] = 'Windows'  # 1 case
    df.loc[df.os_details.isna() & df.brand == "Dell", "os_details"] = "Windows 10 Home"  # 1 case, most common
    

    # create new features
    df["cpu_brand"] = df.cpu.str.split(n=1).str[0]
    df["cpu_type"] = df.cpu.str.split(n=1).str[1]
    df["gpu_brand"] = df.gpu.str.split(n=1).str[0]
    df["gpu_series"] = df.gpu.str.split(n=2).str[1]
    df["os_type"] = df.os_details.str.split(n=1).str[1]
    df["os_nr"] = df.os_details.str.split(n=2).str[2]
    df["resolution"] = df.pixels_y / df.screen_size
    df["gimmick"] = df.detachable_keyboard + 5 * df.discrete_gpu + 3* df.touchscreen
    def get_speed(sentence):
        try:
            sentence = sentence.split()
            index = sentence.index('GHz')
            return pow(float(sentence[index-1]),2)
        except:
            return pow(2.4, 2)
    df["speed"] = df.cpu_details.apply(get_speed)
    def has_word(sentence, word):
        try:
            return word in sentence
        except:
            return False
    df["Dual-Core"] = df.cpu_details.apply(has_word, word="Dual-Core")
    df["Quad-Core"] = df.cpu_details.apply(has_word, word="Quad-Core")
    df["Hexa-Core"] = df.cpu_details.apply(has_word, word="Hexa-Core")
    df["Hyper-Threading"] = df.cpu_details.apply(has_word, word="Hyper-Threading")
    
    
    # correct a few mistakes in new features
    map_gpu_series = {"GeFoce":"GeForce", "RadeonÂ": "Radeon"}
    df.gpu_series = df.gpu_series.replace(map_gpu_series) 
    map_screen_surface = {"glossy":"Glossy", "matte":"Matte"}
    df.screen_surface = df.screen_surface.replace(map_screen_surface)  

    
    # make sure all values in categorical variables are strings
    cat_col = df.select_dtypes(include=['object']).columns
    df[cat_col] = df[cat_col].astype(str)
    
    # cleanup test set
    df.pixels_y = df.pixels_y.astype('int64')
    
    # shuffle the database
    df.sample(frac=1)
    return df


In [71]:
df_train = shared_reformat(raw_train)
df_test = shared_reformat(raw_test)

  res_values = method(rvalues)


In [72]:
drop_cols=["id", "name", "base_name", "screen_size", "weight", "detachable_keyboard", "gpu_series",
              "ssd", "pixels_x", "cpu_details", "os_details", "cpu_brand", "cpu", "os"]
df_rf = df_train.drop(columns=drop_cols)
df_rf.head(2)



Unnamed: 0,brand,pixels_y,screen_surface,touchscreen,discrete_gpu,gpu,ram,storage,min_price,max_price,...,gpu_brand,os_type,os_nr,resolution,gimmick,speed,Dual-Core,Quad-Core,Hexa-Core,Hyper-Threading
0,Lenovo,1080,Glossy,1,0,Intel HD,8,1000,899.0,899.0,...,Intel,10,,69.230769,3.0,5.76,True,False,False,True
1,Razer,1080,Matte,0,1,NVIDIA GeForce RTX 2070 Max-Q,16,512,2099.99,2099.99,...,NVIDIA,10 Home,Home,69.230769,5.0,4.84,False,False,True,True


In [73]:
def seppe_error(Y1_true, Y1_pred, Y2_true, Y2_pred):
    return (mean_absolute_error(Y1_true, Y1_pred) + mean_absolute_error(Y2_true, Y2_pred)) / 2

In [74]:
def runRFR(X,y):
    
    rfr = RandomForestRegressor(X,y)
    
    params = dict(
        n_estimators=[100, 300, 500, 1000],
        max_features = [.1, .2, .5, "auto", "log2"],
        min_samples_leaf = [1, 5, 10]
    )
    
    print("Parameters used for tuning:")
    for key in params:
        print("{:20}: {}".format(key, params[key]))
        print("")
  
 

    start_time = time.time()
    print("Start hyper parameters tuning...")
    algorithm = GridSearchCV(rfr, params, scoring=make_scorer(seppe_error, greater_is_better=False), cv=3, n_jobs=-1)
    #algorithm = RandomizedSearchCV(isolation_forest, param_distributions=params, scoring=scorer, cv=5, n_iter=20, n_jobs=-1)
    algorithm.fit(X,y)
    seconds = time.time() - start_time
    minutes, seconds = divmod(seconds, 60)
    print("Hyper parameters tuning done in {:.2f} minutes and {:.2f} secondes".format(minutes, seconds))

    print("{}:\n{}".format("Best parameters found", json.dumps(algorithm.best_params_, indent=2)))
    
    return algorithm.predict(X)

In [75]:
def create_preprocessor(df_pp, categories):
    #global categories
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
        ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        #('ordinal', OrdinalEncoder(categories=categories)),
        #('scaler', StandardScaler())
        ('onehot', OneHotEncoder(categories=categories, handle_unknown="ignore")),   
        ])

    numeric_features = df_pp.select_dtypes(include=['int64', 'float64', 'boolean']).columns
    categorical_features = df_pp.select_dtypes(include=['object']).columns
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    return preprocessor

In [102]:
def preprocessData(df):
    numerics = df.select_dtypes(include=['int32','int64', 'float32', 'float64', 'boolean']).columns
    categoricals = df.select_dtypes(include=['object']).columns
    print(numerics)
    num_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    num_imputer.fit_transform(df[numerics])
    num_scaler = StandardScaler()
    num_scaler.fit_transform(df[numerics])
    
    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit_transform(df[categoricals])
    cat_ohe = OneHotEncoder(categories = 'auto', handle_unknown="ignore")
    cat_ohe.fit_transform(df[categoricals])
    return df


#df_pp = preprocessData(df_rf)

Index(['pixels_y', 'touchscreen', 'discrete_gpu', 'ram', 'storage',
       'min_price', 'max_price', 'delta_abs', 'delta_rel', 'resolution',
       'gimmick', 'speed', 'Dual-Core', 'Quad-Core', 'Hexa-Core',
       'Hyper-Threading'],
      dtype='object')


In [109]:

X = df_pp.drop(columns=["min_price", "max_price"])
y = df_pp[["min_price", "max_price"]]
y.head(2)
X.head(2)#
#y_pred = runRFR(X,y)

preprocessData(X)

Index(['pixels_y', 'touchscreen', 'discrete_gpu', 'ram', 'storage',
       'delta_abs', 'delta_rel', 'resolution', 'gimmick', 'speed', 'Dual-Core',
       'Quad-Core', 'Hexa-Core', 'Hyper-Threading'],
      dtype='object')


Unnamed: 0,brand,pixels_y,screen_surface,touchscreen,discrete_gpu,gpu,ram,storage,delta_abs,delta_rel,...,gpu_brand,os_type,os_nr,resolution,gimmick,speed,Dual-Core,Quad-Core,Hexa-Core,Hyper-Threading
0,Lenovo,1080,Glossy,1,0,Intel HD,8,1000,0.00,0.000000,...,Intel,10,,69.230769,3.0,5.76,True,False,False,True
1,Razer,1080,Matte,0,1,NVIDIA GeForce RTX 2070 Max-Q,16,512,0.00,0.000000,...,NVIDIA,10 Home,Home,69.230769,5.0,4.84,False,False,True,True
2,HP,768,Glossy,0,0,AMD Radeon R4,8,500,10.00,0.022523,...,AMD,10,,49.230769,0.0,6.25,True,False,False,False
3,Acer,1080,Matte,0,0,Intel UHD 620,6,1000,74.00,0.180097,...,Intel,10 Home,Home,69.230769,0.0,4.84,True,False,False,True
4,HP,900,Glossy,0,0,Intel HD 620,8,1000,0.00,0.000000,...,Intel,10,,52.023121,0.0,6.25,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,Dell,768,Glossy,1,0,Intel HD 4400,4,500,0.00,0.000000,...,Intel,8.1,,49.230769,3.0,3.61,True,False,False,True
506,Asus,1080,Matte,0,1,NVIDIA GeForce GTX 1070,12,1128,0.00,0.000000,...,NVIDIA,10 Home,Home,62.427746,5.0,7.84,False,True,False,True
507,Asus,1080,Matte,0,0,Intel UHD 620,8,1128,169.98,0.292645,...,Intel,10 Home,Home,69.230769,0.0,2.56,False,True,False,True
508,HP,900,Glossy,0,0,Intel HD 620,8,1000,0.00,0.000000,...,Intel,10,,52.023121,0.0,6.25,True,False,False,True


In [110]:
pd.get_dummies(X)

Unnamed: 0,pixels_y,touchscreen,discrete_gpu,ram,storage,delta_abs,delta_rel,resolution,gimmick,speed,...,os_nr_Home Signature Edition,os_nr_IoT,os_nr_Lion,os_nr_Mavericks,os_nr_Pro,os_nr_S,os_nr_Sierra,os_nr_X Lion,os_nr_Yosemite,os_nr_nan
0,1080,1,0,8,1000,0.00,0.000000,69.230769,3.0,5.76,...,0,0,0,0,0,0,0,0,0,1
1,1080,0,1,16,512,0.00,0.000000,69.230769,5.0,4.84,...,0,0,0,0,0,0,0,0,0,0
2,768,0,0,8,500,10.00,0.022523,49.230769,0.0,6.25,...,0,0,0,0,0,0,0,0,0,1
3,1080,0,0,6,1000,74.00,0.180097,69.230769,0.0,4.84,...,0,0,0,0,0,0,0,0,0,0
4,900,0,0,8,1000,0.00,0.000000,52.023121,0.0,6.25,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,768,1,0,4,500,0.00,0.000000,49.230769,3.0,3.61,...,0,0,0,0,0,0,0,0,0,1
506,1080,0,1,12,1128,0.00,0.000000,62.427746,5.0,7.84,...,0,0,0,0,0,0,0,0,0,0
507,1080,0,0,8,1128,169.98,0.292645,69.230769,0.0,2.56,...,0,0,0,0,0,0,0,0,0,0
508,900,0,0,8,1000,0.00,0.000000,52.023121,0.0,6.25,...,0,0,0,0,0,0,0,0,0,1


array(['x0_Acer', 'x0_Alienware', 'x0_Apple', 'x0_Asus', 'x0_Dell',
       'x0_Google', 'x0_HP', 'x0_Huawei', 'x0_Jumper', 'x0_LG',
       'x0_Lenovo', 'x0_MSI', 'x0_Microsoft', 'x0_Other', 'x0_RCA',
       'x0_Razer', 'x0_Samsung', 'x0_Toshiba', 'x1_Glossy', 'x1_Matte',
       'x2_AMD Radeon Pro 5500M', 'x2_AMD Radeon Pro 555X',
       'x2_AMD Radeon R2', 'x2_AMD Radeon R4', 'x2_AMD Radeon R5',
       'x2_AMD Radeon R6', 'x2_AMD Radeon R7', 'x2_AMD Radeon RX 560X',
       'x2_AMD Radeon Vega 10', 'x2_AMD Radeon Vega 3',
       'x2_AMD Radeon Vega 6', 'x2_AMD Radeon Vega 8',
       'x2_AMD RadeonÂ\xa0R4', 'x2_Imagination PowerVR GX6250',
       'x2_Intel HD', 'x2_Intel HD (UMA)', 'x2_Intel HD 3000',
       'x2_Intel HD 400', 'x2_Intel HD 4000', 'x2_Intel HD 405',
       'x2_Intel HD 4200', 'x2_Intel HD 4400', 'x2_Intel HD 500',
       'x2_Intel HD 5000', 'x2_Intel HD 505', 'x2_Intel HD 510',
       'x2_Intel HD 515', 'x2_Intel HD 520', 'x2_Intel HD 5500',
       'x2_Intel HD 6000', 'x2

In [122]:
def create_preprocessor(df_pp):
    #global categories
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
        ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        #('ordinal', OrdinalEncoder(categories=categories)),
        #('scaler', StandardScaler())
        ('onehot', OneHotEncoder(categories='auto', handle_unknown="ignore")),   
        ])

    numeric_features = df_pp.select_dtypes(include=['int64', 'float64', 'boolean']).columns
    categorical_features = df_pp.select_dtypes(include=['object']).columns
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    return preprocessor

In [131]:
preprocessor = create_preprocessor(df_train)
encoded = preprocessor.fit_transform(df_train)
df_pp = pd.DataFrame(encoded.toarray())
encoded.get_features

AttributeError: get_features not found

(510, 1266)