In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("./sample.csv")

In [3]:
df.head()

Unnamed: 0,loc1,loc2,para1,dow,para2,para3,para4,price
0,0,1,1,Mon,662,3000.0,3.8,73.49
1,9,99,1,Thu,340,2760.0,9.2,300.0
2,0,4,0,Mon,16,2700.0,3.0,130.0
3,4,40,1,Mon,17,12320.0,6.4,365.0
4,5,50,1,Thu,610,2117.0,10.8,357.5


In [5]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m[31m4.0 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5
Note: you may need to restart the kernel to use updated packages.


In [15]:
from sklearn.model_selection import train_test_split    # Splits arrays or matrices into random train and test subsets
from sklearn.model_selection import KFold               # Cross-validator
from sklearn.model_selection import cross_validate      # Evaluate metrics by cross-validation
from sklearn.model_selection import GridSearchCV        # Search over specified parameter values for an estimator
from sklearn.compose import ColumnTransformer           # Applies transformers to columns of DataFrames
from sklearn.pipeline import Pipeline                   # Helps building a chain of transforms and estimators
from sklearn.impute import SimpleImputer                # Imputation transformer for completing missing values
from sklearn.preprocessing import OneHotEncoder, StandardScaler         # Encode categorical features
from sklearn.metrics import mean_absolute_error, mean_squared_error         # One of many statistical measures of error
from xgboost import XGBRegressor

In [7]:
X = df.copy()
y = X.price
X.drop(['price'], axis = 1, inplace = True)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, 
                                                                test_size=0.2, 
                                                                random_state=42)

In [9]:
X_train_proper, X_valid, y_train_proper, y_valid = train_test_split(X_train, y_train, train_size=0.8, 
                                                                test_size=0.2, 
                                                                random_state=42)

In [10]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 9254 to 7270
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   loc1    8000 non-null   object 
 1   loc2    8000 non-null   object 
 2   para1   8000 non-null   int64  
 3   dow     8000 non-null   object 
 4   para2   8000 non-null   int64  
 5   para3   8000 non-null   float64
 6   para4   8000 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 500.0+ KB


In [12]:
print(X_train.isnull().sum())

loc1     0
loc2     0
para1    0
dow      0
para2    0
para3    0
para4    0
dtype: int64


In [13]:
# Select categorical column
categorical_cols = [col for col in X_train_proper.columns if 
                   X_train_proper[col].dtype == 'object']

# Select numeric values
numeric_cols = [col for col in X_train_proper.columns if
                X_train_proper[col].dtype in ['int64', 'float64']]

In [16]:
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',  StandardScaler())
])

categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers = [
    ('num', numerical_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

In [22]:
scores = {}

from sklearn.linear_model import LinearRegression
model = LinearRegression()

my_pipeline_LR = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

my_pipeline_LR.fit(X_train_proper, y_train_proper)
preds = my_pipeline_LR.predict(X_valid)
score = my_pipeline_LR.score(X_valid, y_valid)
scores['Linear Regression'] = score
print("LR Score:", score)

LR Score: 0.5792392647905873


In [23]:
model = XGBRegressor(verbosity = 0, random_state = 0)

my_pipeline_xgb = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

my_pipeline_xgb.fit(X_train_proper, y_train_proper)
preds = my_pipeline_xgb.predict(X_valid)
score = my_pipeline_xgb.score(X_valid, y_valid)
scores['XGB'] = score
print("XGB Score:", score)

XGB Score: 0.6800399527719243


In [24]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state = 0)

my_pipeline_RFR = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

my_pipeline_RFR.fit(X_train_proper, y_train_proper)
preds = my_pipeline_RFR.predict(X_valid)
score = my_pipeline_RFR.score(X_valid, y_valid)
scores['RFR'] = score
print("RFR Score:", score)

RFR Score: 0.703125787675623


In [26]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__n_estimators': [50, 100, 150],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}
grid_search = GridSearchCV(estimator=my_pipeline_RFR, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)


In [27]:
print(grid_search.best_params_)

{'model__max_depth': 20, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 150}


In [29]:
print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)

Best parameter (CV score=0.682):


In [30]:
final_model = RandomForestRegressor(random_state = 42, max_depth = 20, min_samples_leaf=1, min_samples_split=5, n_estimators = 150)
final_pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', final_model)
])

final_pipeline.fit(X_train, y_train)

In [31]:
final_pipeline.score(X_test, y_test)

0.6614676345105492