In [26]:
import pandas as pd
import numpy as np
import xgboost
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, mean_squared_error, auc, roc_curve
from sklearn.model_selection import KFold , train_test_split, cross_val_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [39]:
train_data = pd.read_csv('train_clean.csv')
test_data = pd.read_csv('test_clean.csv')
train_data.drop(columns=['Unnamed: 0'],inplace=True)
test_data.drop(columns=['Unnamed: 0'],inplace=True)
samplesub = pd.read_csv('sample_submission.csv')
samplesub.drop(columns='SalePrice',inplace=True)

In [28]:
X_train = train_data.copy()
y_train = X_train.SalePrice
X_train = X_train.drop(columns=['SalePrice'])
X_test = test_data.copy()

# Naive fitting

In [34]:
naivemodel = xgboost.XGBRegressor()
naivemodel.fit(X_train,y_train)
y_pred = naivemodel.predict(X_test)

In [58]:
naivemodel

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [52]:
kfold = KFold(n_splits=5, random_state=7,shuffle=True)
accuracy = cross_val_score(estimator = naivemodel, X = X_train, y = y_train, cv = kfold,scoring='neg_mean_squared_error').mean()

In [47]:
naiveresults = samplesub.copy()
naiveresults['SalePrice'] = y_pred
naiveresults = naiveresults.set_index('Id')

In [48]:
naiveresults.to_csv('naiveresults.csv')
# MAE: 16234.42
# Ranking: 6634/49372 Top 13.4%

# XGBoost with Hyperparameter Tuning with HyperOpt

In [29]:
space={'max_depth': hp.randint('max_depth', 20),
       'gamma': hp.uniform ('gamma', 0,2),
       'colsample_bytree' : hp.uniform('colsample_bytree', 0.001,1),
       'n_estimators': hp.randint("n_estimators", 500),
       'learning_rate': hp.uniform('learning_rate',0, 1),
       'subsample': hp.uniform('subsample',0, 1),
       'min_child_weight': hp.uniform('min_child_weight',0, 3),
       #'reg_alpha' : hp.uniform('reg_alpha', 0,2),
       'reg_lambda' : hp.uniform('reg_lambda', 0,2)
    }

In [59]:
def hyperparameter_tuning(space):
    model = xgboost.XGBRegressor(random_state = 42,n_estimators =int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'],
                                colsample_bytree=space['colsample_bytree'], learning_rate = space['learning_rate'],subsample = space['subsample'],
                                min_child_weight = space['min_child_weight'],reg_lambda = space['reg_lambda'])#,reg_alpha = space['reg_alpha'])
    evaluation = [(X_train,y_train)]
    kfold = KFold(n_splits=5, random_state=7,shuffle=True)
    accuracy = cross_val_score(estimator = model, X = X_train, y = y_train, cv = kfold,scoring='neg_mean_squared_error').mean()
    print ("SCORE: "+ str(accuracy))
    #change the metric if you like
    return {'loss': -accuracy, 'status': STATUS_OK, 'model': model}

In [60]:
rstate = np.random.RandomState(42)
trials = Trials()
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=400,
            trials=trials,
            rstate=rstate)

SCORE: -1439837650.7881343                             
SCORE: -2347027439.189975                                                        
SCORE: -1923723009.8453536                                                       
SCORE: -911852106.9673121                                                        
SCORE: -1062788138.1060183                                                       
SCORE: -2112834487.0875957                                                      
SCORE: -1764203998.4652984                                                      
SCORE: -1361009835.82039                                                        
SCORE: -1113258174.0526948                                                      
SCORE: -1854279404.9689274                                                      
SCORE: -46916785521.82189                                                        
SCORE: -1052512737.1123583                                                       
SCORE: -1492922298.949259                      

In [62]:
hoptmodel1 = xgboost.XGBRegressor(**best)
hoptmodel1.fit(X_train,y_train)
y_pred_hoptmodel1 = hoptmodel1.predict(X_test)

In [65]:
hoptmodel1results = samplesub.copy()
hoptmodel1results['SalePrice'] = y_pred_hoptmodel1
hoptmodel1results= hoptmodel1results.set_index('Id')

In [66]:
hoptmodel1results.to_csv('hoptmodel1results.csv')
#MAE : 13877.53187
#Ranking: 1253/49372, Top 2.53%