In [None]:
from warnings import simplefilter

# Silence FutureWarnings before the heavy libraries are imported.
simplefilter(action='ignore', category=FutureWarning)

import os
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Read the competition's training and test CSVs, then preview the training rows.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
)
df_test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
)
df.head(n=20)

In [None]:
# List every column label of the training frame (rendered as the cell output).
df.columns #Overview of columns

In [None]:
# Count nulls per column and rank the columns from most to least affected.
null_counts = df.isnull().sum()
full_list = null_counts.sort_values(ascending=False)

full_list.head(n=20)

In [None]:
# Missing data: per-column null count ('Total') and null fraction ('Percent'),
# each sorted so the most incomplete columns come first.
null_mask = df.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(n=20)


In [None]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

In [None]:
# Preview the first five rows of the training frame.
df.head(n=5)

In [None]:
# Correlation heatmap to assess which features correlate the most with SalePrice.
# Only numeric columns are correlated: since pandas 2.0 `DataFrame.corr()` no
# longer drops non-numeric columns by default and raises on the string-valued
# features, so they are filtered out explicitly (works on older pandas too).
df_corr = df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(40, 15))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(df_corr, vmax=.8, square=True, ax=ax)
ax.set_title('Pairwise correlation of numeric features');

In [None]:
# Annotated heatmap restricted to the k columns whose correlation with
# SalePrice is largest (SalePrice itself included).
k = 10  # number of variables for heatmap
top_corr = df_corr.nlargest(n=k, columns='SalePrice')
cols = top_corr['SalePrice'].index
# Rows of the transposed matrix are the variables np.corrcoef correlates.
cm = np.corrcoef(df[cols].to_numpy().T)
sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
)
plt.show()

In [None]:
# Scatter each of the nine selected features against SalePrice on a 3x3 grid.
corr_features = df[['OverallQual','GrLivArea','GarageCars','GarageArea','TotalBsmtSF','1stFlrSF','FullBath','TotRmsAbvGrd','YearBuilt']]
plt.figure(figsize=(30,30))

# Iterate over the frame selected above; the previous loop referenced an
# undefined `numerical_features` name and failed with NameError on a fresh kernel.
for i, feature_name in enumerate(corr_features.columns, start=1):
    plt.subplot(3, 3, i)  # subplot positions are 1-based

    sns.scatterplot(x=corr_features[feature_name], y=df['SalePrice'])

    plt.title(str(feature_name))


In [None]:
# Define the modelling inputs: features without the target or the row Id,
# the SalePrice target, and the test features without the row Id.
target = df["SalePrice"]
predictors = df.drop(columns=['SalePrice', 'Id'])
test_data = df_test.drop(columns=['Id'])

# Hold out a validation subset; the fixed seed keeps the split repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(
    predictors,
    target,
    random_state=0,
)

In [None]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [
    column
    for column in x_train.columns
    if x_train[column].dtype == "object" and x_train[column].nunique() < 10
]

# Numerical columns: those stored as int64 or float64
numerical_cols = [
    column
    for column in x_train.columns
    if x_train[column].dtype in ['int64', 'float64']
]

In [None]:
# Preprocessing for numerical data: fill missing values with each column's
# most frequent value.
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Preprocessing for categorical data: impute the most frequent category, then
# one-hot encode. handle_unknown='ignore' keeps transform from failing on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data. Only the columns
# listed in numerical_cols / categorical_cols are routed to a transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Candidate models. Seeds are fixed so that the comparison below gives the
# same numbers on every Restart & Run All.
xgb = XGBRegressor(random_state=0)
rf = RandomForestRegressor(random_state=0)
dtree = DecisionTreeRegressor(random_state=0)

models = [xgb, rf, dtree]

# Fit each candidate behind the shared preprocessor and report its fit on the
# training split versus the held-out validation split.
for model in models:
    # Bundle preprocessing and modeling code in a pipeline
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)])

    my_pipeline.fit(x_train, y_train)
    train_preds = my_pipeline.predict(x_train)
    test_preds = my_pipeline.predict(x_valid)
    print('Model Report')
    print('\n', str(model))
    # r2_score is the coefficient of determination, not a classification
    # accuracy, so the output is labelled accordingly.
    print('\nTraining R^2:', r2_score(y_train, train_preds))
    print('Validation R^2:', r2_score(y_valid, test_preds))
  

    

In [None]:
def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Called with no (or a falsy) ``start_time`` it returns the current
    ``datetime``. Called with a start time it prints the elapsed hours,
    minutes and seconds since then and returns ``None``.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        thour, remainder = divmod(elapsed_seconds, 3600)
        tmin, tsec = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        return None
    return datetime.now()


In [None]:
# Candidate hyperparameter values for the XGBoost search
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    colsample_bynode=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [None]:
# Randomized search CV over the XGBoost hyperparameters
folds = 3
param_comb = 5

# SalePrice is a continuous regression target, so a plain shuffled K-fold is
# used; StratifiedKFold is designed for classification labels. The splitter
# object is passed directly instead of a one-shot `split(...)` generator.
cv_splitter = KFold(n_splits=folds, shuffle=True, random_state=1001)

random_search = RandomizedSearchCV(
    xgb,
    param_distributions=params,
    n_iter=param_comb,
    n_jobs=4,
    cv=cv_splitter,
    verbose=3,
    random_state=1001,
)
# The search is the final pipeline step, so it receives the preprocessed features.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', random_search)])

start_time = timer(None) # timing starts from this point for "start_time" variable
my_pipeline.fit(x_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

results = pd.DataFrame(random_search.cv_results_)

# Show a compact ranking rather than dumping the raw cv_results_ dict; the
# complete table is still written to CSV below.
print('\n All results:')
summary_cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
print(results[summary_cols].sort_values('rank_test_score').to_string(index=False))
print('\n Best estimator:')
print(random_search.best_estimator_)
# No `scoring` is passed, so the search ranks candidates with the estimator's
# own score method (R^2 for scikit-learn style regressors). The former
# "normalized gini" label and `* 2 - 1` rescaling belong to an AUC-based
# classification workflow and misreported this value.
print('\n Best mean cross-validated R^2 for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

In [None]:
# Evaluate the best configuration found by the randomized search.
# NOTE: this rebinds `xgb` to the tuned estimator, so re-running the search cell
# afterwards would start from the tuned settings rather than the defaults.
xgb = random_search.best_estimator_

my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgb)])

my_pipeline.fit(x_train, y_train)
train_preds = my_pipeline.predict(x_train)
test_preds = my_pipeline.predict(x_valid)
print('Model Report')
# Report the estimator actually fitted here; the leftover loop variable `model`
# from the comparison cell still points at the last model of that loop.
print('\n', str(xgb))
# r2_score is the coefficient of determination, not a classification accuracy.
print('\nTraining R^2:', r2_score(y_train, train_preds))
print('Validation R^2:', r2_score(y_valid, test_preds))



In [None]:
# Get test predictions from the fitted pipeline
test_predictions = my_pipeline.predict(X=test_data)

In [None]:
# Pair each test Id with its predicted SalePrice and write the submission CSV
output = pd.DataFrame(
    data={
        'Id': df_test['Id'],
        'SalePrice': test_predictions,
    }
)
output.to_csv(path_or_buf='submission.csv', index=False)