# XGBOOST NOTEBOOK

In [5]:
import pandas as pd
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train_lab = train.iloc[:,-1]
train = train.iloc[:,:-1]

train_len = train.shape[0]


In [6]:
print(test["Id"])

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64


In [7]:
all = pd.concat([train, test])


In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
def data_processor(df, train_len):
    
    types = df.dtypes
    cat_dict = {}
    cat_labels = []
    new_arr = []
    first = True
    non_cat = []

    for ke, type in zip(df.columns, types):
        if type == "object":
            le = OneHotEncoder()
            if df[ke].isnull().values.any():
                ifnan = True
            else:
                ifnan = False

            if ifnan:
                unique = np.expand_dims(np.asarray(["nan"] + list(df[ke].dropna().unique())), axis = 1)
            else:
                unique = np.expand_dims(np.asarray(list(df[ke].dropna().unique())), axis = 1)


            df[ke] = df[ke].fillna("nan")

            cat = le.fit_transform(np.expand_dims(df[ke], axis = 1)).toarray().T
            if first:
                first = False
                categorical= cat
            else:
                categorical = np.concatenate((categorical, cat), axis = 0)

            keys = le.categories_
            ka = [k for k in keys]
            cat_dict[ke] = ka
            cat_labels.extend(ka)
        else:
            non_cat.append(ke)
    categorical = categorical.T

    

    df_non_cat = df[[t for t in df.columns if t in non_cat]]
    print(df_non_cat.shape)

    df_arr = np.concatenate((df_non_cat, categorical), axis = 1)
    print(df_arr.shape)


    train = df_arr[:train_len,1:]
    test = df_arr[train_len:,1:]
    test_idx = df_arr[train_len:,0]


    return train, test, test_idx

    



In [9]:
train_arr, test_arr, test_idx = data_processor(all, train_len)




(2919, 37)
(2919, 312)


In [10]:
print(test_idx)

[1461. 1462. 1463. ... 2917. 2918. 2919.]


## Impute Missing Data

In [11]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors = 5)

imputer.fit(train_arr)
train_arr = imputer.transform(train_arr)
test_arr = imputer.transform(test_arr)

## Regression Trees

In [12]:
import xgboost as xgb
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')
xg_reg.fit(train_arr, train_lab)

y_pred = xg_reg.predict(train_arr)

from sklearn.metrics import r2_score
print(r2_score(train_lab,y_pred))

0.999521505381623


In [13]:
### hyperparameter tuning
from sklearn.model_selection import cross_val_score
alphas_xg = [s/2 for s in range(1,20)]

cv_score_xg = []
for alp in alphas_xg:
    alp = alp/2
    regr = xgb.XGBRegressor(objective ='reg:squarederror', reg_alpha = alp, reg_lambda = alp)
    scores = cross_val_score(regr, train_arr, train_lab, cv = 5, scoring = 'r2')
    cv_score_xg.append(scores.mean())

In [14]:
max_val = max(cv_score_xg)
indices_max = [i for i,v in enumerate(cv_score_xg) if v == max_val]
print(max_val)
max_alphas_rf = [v for i,v in enumerate(alphas_xg) if i in indices_max]
print(max_alphas_rf)

0.8735052176418175
[4.0]


### Importing test data

In [15]:
xg_reg_final = xgb.XGBRegressor(objective ='reg:squarederror', reg_alpha = alp, reg_lambda = alp)
xg_reg_final.fit(train_arr, train_lab)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=40, num_parallel_tree=1, random_state=0,
             reg_alpha=4.75, reg_lambda=4.75, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [16]:
test_preds = xg_reg_final.predict(test_arr)

In [25]:
test_pred_df = pd.DataFrame({"Id":[int(x) for x in test_idx], "SalePrice":test_preds})
test_pred_df = test_pred_df.set_index("Id")

In [28]:
test_pred_df.to_csv("submission.csv")

In [29]:
print(test_pred_df)

          SalePrice
Id                 
1461  128812.609375
1462  164200.546875
1463  180234.906250
1464  193567.625000
1465  183337.109375
...             ...
2915   75945.093750
2916   87261.507812
2917  165403.312500
2918  118890.593750
2919  235937.734375

[1459 rows x 1 columns]
