In [1]:
# Google Drive 
# import sys
# sys.path.append('/content/drive/MyDrive/30 Days of ML')

### Environment Set Up

In [2]:
import pandas as pd
import numpy as np
from eda_function import *

In [3]:
test = pd.read_csv("./data/test.csv")
train = pd.read_csv("./data/train.csv")
sample = pd.read_csv("./data/sample_submission.csv")

In [4]:
train.dtypes

id          int64
cat0       object
cat1       object
cat2       object
cat3       object
cat4       object
cat5       object
cat6       object
cat7       object
cat8       object
cat9       object
cont0     float64
cont1     float64
cont2     float64
cont3     float64
cont4     float64
cont5     float64
cont6     float64
cont7     float64
cont8     float64
cont9     float64
cont10    float64
cont11    float64
cont12    float64
cont13    float64
target    float64
dtype: object

### Data Processing

In [5]:
X = train.drop(columns=["id","target"])
y = train["target"]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=0)

In [7]:
s = (X_train.dtypes =="object")
object_cols = list(s[s].index)
print(f"Categorical variables:\n {object_cols}")

Categorical variables:
 ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


In [8]:
r = (X_train.dtypes =="float64")
numercial_cols = list(r[r].index)
print(f"Numercial variables:\n {numercial_cols}")

Numercial variables:
 ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13']


In [9]:
df_summary(train)

Unnamed: 0,dtype,num_missing,num_uniques
id,int64,0,300000
cat0,object,0,2
cat1,object,0,2
cat2,object,0,2
cat3,object,0,4
cat4,object,0,4
cat5,object,0,4
cat6,object,0,8
cat7,object,0,8
cat8,object,0,7


In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [11]:
OH_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [12]:
print(OH_X_train.shape)
print(y_train.shape)

(210000, 70)
(210000,)


In [13]:
print(OH_X_valid.shape)
print(y_valid.shape)

(90000, 70)
(90000,)


In [26]:
test_new = test.drop(columns=["id"])

In [28]:
OH_cols_test = pd.DataFrame(OH_encoder.transform(test_new[object_cols]))
# OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

OH_cols_test.index = test_new.index
# OH_cols_train.index = X_train.index
# OH_cols_valid.index = X_valid.index

num_new_test = test_new.drop(object_cols, axis=1)
# num_X_train = X_train.drop(object_cols, axis=1)
# num_X_valid = X_valid.drop(object_cols, axis=1)

OH_test = pd.concat([num_new_test, OH_cols_test],axis=1)
# OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
# OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

### Basic Models

In [14]:
from sklearn.ensemble import GradientBoostingRegressor

In [15]:
xgb = GradientBoostingRegressor()

In [16]:
xgb.fit(OH_X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [18]:
pred = xgb.predict(OH_X_valid)

In [20]:
from sklearn.metrics import mean_squared_error

mse_score = round(mean_squared_error(y_valid,pred),2)
print(mse_score)

0.54


In [21]:
basic_pred = sample.copy()

In [24]:
basic_pred.head()

Unnamed: 0,id,target
0,0,0.5
1,5,0.5
2,15,0.5
3,16,0.5
4,17,0.5


In [29]:
basic_pred["target"] = xgb.predict(OH_test)

In [30]:
basic_pred.head()

Unnamed: 0,id,target
0,0,8.029547
1,5,8.287323
2,15,8.371844
3,16,8.308859
4,17,8.217338


In [31]:
# basic_pred.to_csv("basic_xgb.csv",index=False)

In [33]:
# from google.colab import files
# files.download("basic_xgb.csv")