In [13]:
from copy import deepcopy
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import train_test_split

In [14]:
raw_train = pd.read_csv("Data/train.csv")
raw_test = pd.read_csv("Data/test.csv")
targets = ["max_price", "delta_abs", "delta_rel", "min_price"]


In [15]:
def shared_reformat(df):
    df = deepcopy(df)
    
    # create special targets (only on train set)
    if "max_price" in list(df):
        df["delta_abs"] = df["max_price"] - df["min_price"] 
        df["delta_rel"] = np.log(df["max_price"] / df["min_price"] )
    
    # deal with missing data
    #df.pixels_y = df.pixels_y.fillna(0)
    df.detachable_keyboard.fillna(0, inplace = True)  # only four cases. Assume none
    df.pixels_x.fillna(1920, inplace = True)   # only 2 cases, replace by most common value
    df.pixels_y.fillna(1080, inplace = True)   # only 2 cases, replace by most common value
    df.loc[df.screen_surface.isna(), "screen_surface"] = 'Glossy'       # 12 cases, large majority is Glossy,
    df.loc[df.screen_surface.isna() & df.pixels_x == 1920, "screen_surface"] = 'Matte'  # except here
    df.sort_values(by=['name'])
    df.cpu_details.fillna(method='bfill', inplace=True)  # take previous value, they are often similar
    df.gpu.fillna(method='bfill', inplace=True)  # take previous value, they are often similar
    df.weight.fillna(method='bfill', inplace=True)
    df.loc[df.os.isna() & df.brand == "Apple", "os"] = 'macOS'  # 1 case
    df.loc[df.os_details.isna() & df.brand == "Apple", "os_details"] = "macOS Mojave"  # 1 case, most common
    df.loc[df.os.isna() & df.brand == "Dell", "os"] = 'Windows'  # 1 case
    df.loc[df.os_details.isna() & df.brand == "Dell", "os_details"] = "Windows 10 Home"  # 1 case, most common
    

    # create new features
    df["cpu_brand"] = df.cpu.str.split(n=1).str[0]
    df["cpu_type"] = df.cpu.str.split(n=1).str[1]
    df["gpu_brand"] = df.gpu.str.split(n=1).str[0]
    df["gpu_series"] = df.gpu.str.split(n=2).str[1]
    df["os_type"] = df.os_details.str.split(n=1).str[1]
    df["os_nr"] = df.os_details.str.split(n=2).str[2]
    df["resolution"] = df.pixels_y / df.screen_size
    df["gimmick"] = df.detachable_keyboard + 5 * df.discrete_gpu + 3* df.touchscreen
    def get_speed(sentence):
        try:
            sentence = sentence.split()
            index = sentence.index('GHz')
            return pow(float(sentence[index-1]),2)
        except:
            return pow(2.4, 2)
    df["speed"] = df.cpu_details.apply(get_speed)
    def has_word(sentence, word):
        try:
            return word in sentence
        except:
            return False
    df["Dual-Core"] = df.cpu_details.apply(has_word, word="Dual-Core")
    df["Quad-Core"] = df.cpu_details.apply(has_word, word="Quad-Core")
    df["Hexa-Core"] = df.cpu_details.apply(has_word, word="Hexa-Core")
    df["Hyper-Threading"] = df.cpu_details.apply(has_word, word="Hyper-Threading")
    
    
    # correct a few mistakes in new features
    map_gpu_series = {"GeFoce":"GeForce", "RadeonÂ": "Radeon"}
    df.gpu_series = df.gpu_series.replace(map_gpu_series) 
    map_screen_surface = {"glossy":"Glossy", "matte":"Matte"}
    df.screen_surface = df.screen_surface.replace(map_screen_surface)  

    
    # make sure all values in categorical variables are strings
    cat_col = df.select_dtypes(include=['object']).columns
    df[cat_col] = df[cat_col].astype(str)
    
    # cleanup test set
    df.pixels_y = df.pixels_y.astype('int64')
    
    # shuffle the database
    df.sample(frac=1)
    return df


In [16]:
df_train = shared_reformat(raw_train)
df_test = shared_reformat(raw_test)

drop_cols=["id", "name", "base_name", "screen_size", "weight", "detachable_keyboard", "gpu",
              "ssd", "pixels_x", "cpu_details", "os_details", "cpu", "min_price", "max_price", "delta_abs", "delta_rel"]

drop_cols_test=["id", "name", "base_name", "screen_size", "weight", "detachable_keyboard", "gpu",
              "ssd", "pixels_x", "cpu_details", "os_details", "cpu"]
X = df_train.drop(columns=drop_cols, axis=1)
y = df_train[["min_price", "max_price"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(df_train.columns)
print(X.columns)
print(X.head(2))
X.dtypes

Index(['id', 'name', 'brand', 'base_name', 'screen_size', 'pixels_x',
       'pixels_y', 'screen_surface', 'touchscreen', 'cpu', 'cpu_details',
       'detachable_keyboard', 'discrete_gpu', 'gpu', 'os', 'os_details', 'ram',
       'ssd', 'storage', 'weight', 'min_price', 'max_price', 'delta_abs',
       'delta_rel', 'cpu_brand', 'cpu_type', 'gpu_brand', 'gpu_series',
       'os_type', 'os_nr', 'resolution', 'gimmick', 'speed', 'Dual-Core',
       'Quad-Core', 'Hexa-Core', 'Hyper-Threading'],
      dtype='object')
Index(['brand', 'pixels_y', 'screen_surface', 'touchscreen', 'discrete_gpu',
       'os', 'ram', 'storage', 'cpu_brand', 'cpu_type', 'gpu_brand',
       'gpu_series', 'os_type', 'os_nr', 'resolution', 'gimmick', 'speed',
       'Dual-Core', 'Quad-Core', 'Hexa-Core', 'Hyper-Threading'],
      dtype='object')
    brand  pixels_y screen_surface  touchscreen  discrete_gpu       os  ram  \
0  Lenovo      1080         Glossy            1             0  Windows    8   
1   Razer     

  res_values = method(rvalues)


brand               object
pixels_y             int64
screen_surface      object
touchscreen          int64
discrete_gpu         int64
os                  object
ram                  int64
storage              int64
cpu_brand           object
cpu_type            object
gpu_brand           object
gpu_series          object
os_type             object
os_nr               object
resolution         float64
gimmick            float64
speed              float64
Dual-Core             bool
Quad-Core             bool
Hexa-Core             bool
Hyper-Threading       bool
dtype: object

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

numeric_features = X.select_dtypes(['int64', 'float64']).columns

categorical_features = X.select_dtypes(include=['object']).columns

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer( 
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [18]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, IsolationForest

rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestRegressor())])

rf.fit(X, y)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [19]:
y_pred = rf.predict(X)
print(y_pred)

[[ 893.65153333  894.8186    ]
 [2170.43811905 2273.7352381 ]
 [ 437.59293333  450.53963333]
 ...
 [ 614.77053333  720.07193   ]
 [ 569.768565    570.982465  ]
 [1496.60416667 1544.44571429]]


In [20]:
#def seppe_error(Y_true, Y_pred):
#    return (mean_absolute_error(Y_true.min_price, Y_pred[:,0]) + mean_absolute_error(Y_true.max_price, Y_pred[:,1])) / 2

#seppe_error(y_test, y_pred)

In [21]:
def seppe_error(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred, multioutput=[0.5, 0.5])

seppe_error(y, y_pred)

64.79213283772958

In [22]:
X_eval = df_test.drop(columns=drop_cols_test, axis=1)
print(X_eval.columns)
#
y_test_pred = rf.predict(X_eval)
X_train.columns

Index(['brand', 'pixels_y', 'screen_surface', 'touchscreen', 'discrete_gpu',
       'os', 'ram', 'storage', 'cpu_brand', 'cpu_type', 'gpu_brand',
       'gpu_series', 'os_type', 'os_nr', 'resolution', 'gimmick', 'speed',
       'Dual-Core', 'Quad-Core', 'Hexa-Core', 'Hyper-Threading'],
      dtype='object')


Index(['brand', 'pixels_y', 'screen_surface', 'touchscreen', 'discrete_gpu',
       'os', 'ram', 'storage', 'cpu_brand', 'cpu_type', 'gpu_brand',
       'gpu_series', 'os_type', 'os_nr', 'resolution', 'gimmick', 'speed',
       'Dual-Core', 'Quad-Core', 'Hexa-Core', 'Hyper-Threading'],
      dtype='object')

In [23]:
result = pd.DataFrame(y_test_pred)
result["id"] = df_test.id.values

result.head()

Unnamed: 0,0,1,id
0,1101.3872,1190.5444,28807
1,372.1072,386.9061,22559
2,785.3011,802.8406,28647
3,537.964,551.9143,22141
4,1544.585843,1610.699887,26116


In [24]:
result.columns = ["min_price", "max_price", "ID"]
result.set_index(keys = 'ID')
result = result[[ "ID", "min_price", "max_price"]]
result.head()
result.to_csv('Data/jeroen_result.csv', index=False)

In [None]:
params = {
        'classifier__n_estimators':[200, 500],
        'classifier__max_features' : [.2, .5, None],
        'classifier__max_samples' : [.2, .5, None],
        'classifier__ccp_alpha' : [0, .01, .1],
        'classifier__oob_score' : [True, False],
        'classifier__max_depth' : [10, 20, 50]
    }
scoring = {'Seppe error': make_scorer(seppe_error, greater_is_better=False)}
CV = GridSearchCV(rf, params, scoring='neg_mean_absolute_error', cv=5,n_jobs= -1)
                  
CV.fit(X_train, y_train)
print(CV.best_params_)    
print(CV.best_score_)

In [None]:
result = pd.DataFrame(CV.predict(X_eval))
result['ID'] = df_test.id.values
result.columns = ["min_price", "max_price", "ID"]
result.set_index(keys = 'ID')
result = result[[ "ID", "min_price", "max_price"]]
result.head()
result.to_csv('Data/jeroen_result.csv', index=False)

In [None]:
result.head()

In [None]:
params = {
        'if__n_estimators' : [500, 300],
        'if__max_samples' : [0.2, 0.5, 0.7, 1],
        'if__contamination' : ['auto'],#[0, 0.1, 0.2, 0.3, 0.4, 0.5, 'auto'],
        'if__max_features' : [0.2, 0.5, 1.0]
      }
drop_cols_if=["id", "name", "base_name", "screen_size",  
              "pixels_x", "cpu_details", "os_details", "delta_abs", "delta_rel"]
df_if = df_train.drop(columns=drop_cols, axis=1)

isolf = Pipeline(steps=[('preprocessor', preprocessor), 
                        ('if', IsolationForest(n_estimators=200, max_samples=.5))])
forest = isolf.fit(df_train)
bin_pred = forest.predict(df_train)
y = pd.DataFrame(forest.decision_function(df_train))

In [None]:
y['ID'] = df_train.id.values
print(y.sort_values(by=0, ascending=True).head(50))

tst = pd.DataFrame(bin_pred)
tst['ID'] = df_train.id.values
print(tst.sort_values(by=0, ascending=True).head(50))

In [None]:
#df_train.loc[[23417, 29844, 28976],'id']
outlier_df = df_train.loc[df_train.id.isin([23417, 29844, 28976, 31421, 12766, 19313, 29202])]

outlier_df

In [None]:
df_train.pixels_x.values()
