In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## In this notebook, I will take a different approach, using concepts such as:
1. **Feature Engineering with - LabelEncoder, SimpleImputer, Box-Cox Transformation, IsolationForest, RobustScaler**
2. **Feature Selection with - SelectKBest**
3. **Normal Equation**
4. **RMSLE evaluation**
5. **Feature Importance with - SHAP Values**

## For a more traditional approach (EDA + Excellent Data Viz + XGBoost HyperParameter Tuning + Feature Engineering Tutorial)
you may visit: [https://www.kaggle.com/code/pythonkumar/xgboost-hyperparameters-excellent-plots-acc-91?kernelSessionId=94478268](https://www.kaggle.com/code/pythonkumar/xgboost-hyperparameters-excellent-plots-acc-91?kernelSessionId=94478268)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the competition's training and test splits from the Kaggle input mount.
train=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
# Keep the test-set Id column aside; it is needed to build the submission file at the end.
Id=test['Id']
# Last expression of the cell: rendered as the cell output.
train
# test

# Drop Non-Predictor Columns (the Target `SalePrice` and the Row `Id`)

In [None]:
# Remove the target (SalePrice) and the row identifier so that only predictor
# columns remain; the target is read back from the CSV when X and y are built.
columns_to_remove = ['SalePrice', 'Id']
train = train.drop(columns=columns_to_remove)
train

In [None]:
# Only the row identifier is dropped here; it was saved separately as `Id`.
test = test.drop(columns='Id')
test

# Changing Infinite to Nan

In [None]:
# The 'mode.use_inf_as_na' option is deprecated since pandas 2.1 and scheduled for
# removal, and it only changed how pandas *reports* missing values: the infinities
# stayed in the data, so scikit-learn's imputer would not have filled them.
# Converting them to NaN explicitly makes the later null detection and imputation
# see exactly the same cells.
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

# Columns having Null Values

In [None]:
# Names of the training columns that contain at least one missing value.
has_missing_train = train.isna().any(axis=0)
null_train = train.columns[has_missing_train]
null_train

In [None]:
# Names of the test columns that contain at least one missing value.
has_missing_test = test.isna().any(axis=0)
null_test = test.columns[has_missing_test]
null_test

# Separate Numerical & Categorical columns

In [None]:
# Split the training frame by dtype: numeric predictors vs. text-like ones.
numeric_kinds = ['int64', 'float64', 'UInt32']
text_kinds = ['object', 'string']
num_train = train.select_dtypes(include=numeric_kinds)
cat_train = train.select_dtypes(include=text_kinds)
num_train
# cat_train

In [None]:
# Split the test frame by dtype: numeric predictors vs. text-like ones.
numeric_kinds = ['int64', 'float64', 'UInt32']
text_kinds = ['object', 'string']
num_test = test.select_dtypes(include=numeric_kinds)
cat_test = test.select_dtypes(include=text_kinds)
# num_test
cat_test

# Handling Categorical Features

Most ML Models cannot work with non-numeric values, so we need to apply some form of transformation of these categorical values into numeric labels and then applying some encoding scheme on these values.

* Nominal attributes consist of discrete categorical values with no notion or sense of order amongst them.
* Ordinal attributes are categorical attributes with a sense of order amongst the values.

In [None]:
# LabelEncoder handles a single column per call, so the categorical columns are
# encoded one after another with the same (re-fitted) encoder object.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# cat_train only supplies the column names; the integer codes overwrite the
# matching columns of `train` itself.
for column_name in cat_train.columns:
    train[column_name] = le.fit_transform(train[column_name])

train

In [None]:
# Encode the test categoricals with the label -> integer mapping learned from the
# TRAINING data. Re-fitting the encoder on the test split numbers the categories by
# whatever happens to occur in test, so the same label could receive a different
# integer than it did in `train` and the model would misread the feature.
for cat in cat_test:
    # cat_train was taken from `train` before encoding, so it still holds the
    # original labels; fitting on it reproduces the codes written into `train`.
    train_classes = le.fit(cat_train[cat]).classes_
    code_of = {label: code for code, label in enumerate(train_classes)}
    # Labels never seen in training (including missing values when the training
    # column had none) share one dedicated code placed after the known ones.
    unseen_code = len(train_classes)
    test[cat] = cat_test[cat].map(code_of).fillna(unseen_code).astype('int64')
test

# Handling Missing Values

Missing values are one of the most common problems you can encounter when you try to prepare your data for machine learning. The reason for the missing values might be human errors,interruptions in the data flow, privacy concerns, and so on. Whatever is the reason, missing values affect the performance of the machine learning models.

In [None]:
from sklearn.impute import SimpleImputer

# Fill every gap with the most common value of its own column.
imp = SimpleImputer(strategy='most_frequent')

filled_train = imp.fit_transform(train[null_train])
train[null_train] = filled_train
# Sanity check: total number of missing cells left in the training frame.
train.isnull().sum().sum()

In [None]:
# Re-fit the same imputer on the test columns that had gaps and fill them.
filled_test = imp.fit_transform(test[null_test])
test[null_test] = filled_test
# Sanity check: total number of missing cells left in the test frame.
test.isnull().sum().sum()

# Handling Outliers
In statistics, an outlier is a data point that differs significantly from other observations. An outlier may be due to variability in the measurement or it may indicate experimental error; the latter are sometimes excluded from the data set. An outlier can cause serious problems in statistical analyses.

In [None]:
# Using Isolation Forest
from sklearn.ensemble import IsolationForest

# contamination=0.3 asks the forest to flag roughly 30% of the rows as outliers;
# random_state pins the otherwise random tree construction so reruns agree.
iso = IsolationForest(contamination=0.3, random_state=42)

# fit_predict labels every row: -1 for outliers, 1 for inliers.
out = iso.fit_predict(train)

# select all rows that are not outliers
# The filtered frame used to be computed and immediately discarded. It is now kept
# under its own name and shown as the cell output. `train` itself is left untouched
# because the target column is loaded separately further down and must keep the
# same number of rows as the feature frame.
train_inliers = train[out != -1]
train_inliers

In [None]:
# Re-fit the forest on the test frame; -1 marks outliers, 1 marks inliers.
out = iso.fit_predict(test)

# select all rows that are not outliers
# The filtered frame used to be computed and immediately discarded. It is now kept
# under its own name and shown as the cell output. `test` stays complete because
# the submission pairs one prediction with every saved Id.
test_inliers = test[out != -1]
test_inliers

# Box Cox Transformation - Both Discrete & Continuous Values

A Box Cox transformation is a transformation of non-normal dependent variables into a normal shape. Normality is an important assumption for many statistical techniques; if your data isn’t normal, applying a Box-Cox means that you are able to run a broader number of tests.

In [None]:
# from scipy.stats import skew
# from scipy.special import boxcox1p
# from scipy.stats import boxcox_normmax

# # Fixing Skewness
# for feat in num_train:
#         train[feat] = boxcox1p(train[feat], boxcox_normmax(train[feat] + 1))
# train

In [None]:
# from scipy.stats import skew
# from scipy.special import boxcox1p
# from scipy.stats import boxcox_normmax

# # Fixing Skewness
# for feat in num_test:
#         test[feat] = boxcox1p(test[feat], boxcox_normmax(test[feat] + 1))
# test

# Splitting X & y

In [None]:
# Features: the preprocessed training frame.
X = train
# Target: SalePrice was dropped from `train` earlier, so it is read back from the raw CSV.
train1 = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
y = train1.SalePrice
# y
X

# Train Test Split

It splits the train data into 4 parts, X_train, X_test, y_train, y_test.

X_train, y_train first used to train the algorithm.
X_test is used in that trained algorithms to predict outcomes.
Once we get the outcomes, we compare it with y_test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. random_state fixes the shuffle so the
# split, and every score computed from it, is identical on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
# X_train
# X_test
# y_train
y_test

# **XGBoost Regressor - Model & Training** 

Support Vector Regression is a supervised learning algorithm that is used to predict continuous values. Support Vector Regression uses the same principle as the SVMs. The basic idea behind SVR is to find the best fit line. In SVR, the best fit line is the hyperplane that has the maximum number of points. Note that the model trained in this section is an XGBoost regressor (an ensemble of gradient-boosted decision trees), not SVR.

In [None]:
from xgboost import XGBRegressor

# Hyper-parameters gathered in one mapping so they can be read and tuned at a glance.
xgb_settings = {
    'colsample_bytree': 0.4,
    'gamma': 0,
    'learning_rate': 0.07,
    'max_depth': 3,
    'min_child_weight': 1.5,
    'n_estimators': 10000,
    'reg_alpha': 0.75,
    'reg_lambda': 0.45,
    'subsample': 0.6,
    'seed': 42,
}
model = XGBRegressor(**xgb_settings)

# Train on the training portion of the split only.
model.fit(X_train, y_train)

# Cell output: the full parameter set the estimator ended up with.
model.get_params()

# Predicting from XGB Model

In [None]:
# Predict the target for the held-out rows produced by the train/test split.
pred=model.predict(X_test)
pred

# Scoring the Model

In [None]:
# R^2 of the model on the held-out split. The second argument must be the true
# targets: passing `pred` compared the predictions with themselves, which always
# yields a perfect score of 1.0 regardless of model quality.
model.score(X_test, y_test)

# 

# Feature Importance

In [None]:
import plotly.express as px

# get importance
importance = model.feature_importances_

# summarize feature importance
# Label every score with the name of its column (taken from the frame the model
# was fitted on) instead of a bare positional index, so the list is readable
# without looking the position up by hand.
for feature_name, score in zip(X_train.columns, importance):
    print('Feature: %s, Score: %.5f' % (feature_name, score))

## In this notebook, I am using a different approach, with concepts such as:
1. **Feature Engineering with - LabelEncoder, SimpleImputer, Box-Cox Transformation, IsolationForest, RobustScaler**
2. **Feature Selection with - SelectKBest**
3. **Normal Equation**
4. **RMSLE evaluation**
5. **Feature Importance with - SHAP Values**

## For a more traditional approach (EDA + Excellent Data Viz + XGBoost HyperParameter Tuning + Feature Engineering Tutorial)
you may visit: [https://www.kaggle.com/code/pythonkumar/xgboost-hyperparameters-excellent-plots-acc-91?kernelSessionId=94478268](https://www.kaggle.com/code/pythonkumar/xgboost-hyperparameters-excellent-plots-acc-91?kernelSessionId=94478268)

# Feature Selection - for Normal Eqn

Irrelevant or partially relevant features can negatively impact model performance.Feature Selection is the process where you automatically or manually select those features which contribute most to your prediction variable or output in which you are interested in.
* **Reduces Overfitting:** Less redundant data means less opportunity to make decisions based on noise.
* **Improves Accuracy:** Less misleading data means modeling accuracy improves.
* **Reduces Training Time:** Fewer features reduce algorithm complexity, so algorithms train faster.

In [None]:
# # Select features according to the k highest scores.
# from sklearn.feature_selection import SelectKBest, chi2

# select = SelectKBest(chi2, k=20)

# ntrain=select.fit_transform(X,y)
# print(ntrain.shape)
# ntrain=pd.DataFrame(ntrain)
# ntrain

In [None]:
# imp=select.get_feature_names_out(input_features=X.columns)
# imp

In [None]:
# # Select same features as the TRAIN set features according to the k highest scores.
# test=pd.DataFrame(test)
# ntest=test[imp]
# print(ntest.shape)
# ntest

# Splitting X & y again - for Normal Eqn

In [None]:
# ny=train1['SalePrice'].iloc[29:49]
# nX=ntrain.iloc[29:49]
# # nX=np.append(np.ones((20,1)),ntrain[:20],axis=1)
# # ny
# nX.shape

# **Normal Equation**

Normal Equation is an analytical approach to Linear Regression with a Least Square Cost Function. We can directly find out the value of θ without using Gradient Descent.

In [None]:
# def normal_equation(X, Y):
# #     lam=0.1*np.eye(20)
#     theta = np.dot(np.linalg.inv(np.dot(X.T,X)),np.dot(X.T,Y))
#     return theta

# normal_equation(nX,ny)

# Predicting from Normal Eqn

In [None]:
# beta=normal_equation(nX,ny)
# predict=np.dot(X_test,beta.T)

# RMSE - Root Mean Squared Error

In [None]:
from sklearn.metrics import mean_squared_error

# The `squared=False` shortcut is deprecated since scikit-learn 1.4 and scheduled
# for removal; taking the square root of the plain MSE gives the same RMSE on
# every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, pred))
rmse

## In this notebook, I am using a different approach, with concepts such as:
1. **Feature Engineering with - LabelEncoder, SimpleImputer, Box-Cox Transformation, IsolationForest, RobustScaler**
2. **Feature Selection with - SelectKBest**
3. **Normal Equation**
4. **RMSLE evaluation**
5. **Feature Importance with - SHAP Values**

## For a more traditional approach (EDA + Excellent Data Viz + XGBoost HyperParameter Tuning + Feature Engineering Tutorial)
you may visit: [https://www.kaggle.com/code/pythonkumar/xgboost-hyperparameters-excellent-plots-acc-91?kernelSessionId=94478268](https://www.kaggle.com/code/pythonkumar/xgboost-hyperparameters-excellent-plots-acc-91?kernelSessionId=94478268)

# **Feature Importance - Shap Values**

The goal of SHAP is to explain the prediction of an instance x by computing the contribution of each feature to the prediction. 

SHAP values interpret the impact of having a certain value for a given feature in comparison to the prediction we'd make if that feature took some baseline value.

In [None]:
# import shap
# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(X)

# # Waterfall Plot
# shap.plots.waterfall(shap_values,max_display=20)

In [None]:
import shap
# Build a SHAP explainer around the fitted model.
explainer = shap.TreeExplainer(model)
# SHAP values are computed for the full feature frame X (training and held-out rows alike).
shap_values = explainer.shap_values(X)

# Summary Plot
# The same X supplies both the feature values and the column names for the plot.
shap.summary_plot(shap_values, features=X, feature_names=X.columns)

In [None]:
import shap
# Build a SHAP explainer around the fitted model.
explainer = shap.TreeExplainer(model)

# Get expected value and shap values array
expected_value = explainer.expected_value
shap_array = explainer.shap_values(X)

# Bar Summary Plot
# Plot the array computed in this cell. The call used to reference `shap_values`,
# a variable that only exists if a different cell has been run beforehand, so this
# cell could fail or show stale values when executed on its own.
shap.summary_plot(shap_array, features=X, feature_names=X.columns, plot_type='bar')

In [None]:
import shap
# Build a SHAP explainer around the fitted model.
explainer = shap.TreeExplainer(model)

# Get expected value and shap values array
expected_value = explainer.expected_value
shap_array = explainer.shap_values(X)

# Decision plot
# Only the first 10 rows of SHAP values are passed to the plot.
shap.decision_plot(expected_value, shap_array[0:10],feature_names=list(X.columns))

In [None]:
# import matplotlib.pyplot as plt
# import shap
# explainer = shap.TreeExplainer(model)

# # Get expected value and shap values array
# # expected_value = explainer.expected_valuAe
# shap_array = explainer.shap_values(X)

# # Beeswarm Plot
# shap.plots.beeswarm(shap_values, color=plt.get_cmap("cool"))

In [None]:
# import shap
# explainer = shap.TreeExplainer(model)

# # Get expected value and shap values array
# expected_value = explainer.expected_value
# shap_array = explainer.shap_values(X)

# # Force Plot
# shap.force_plot(expected_value, matplotlib=True)

In [None]:
# import shap
# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(X)

# # Heatmap Plot
# shap.plots.heatmap(shap_values, max_display=12)

# Submission

In [None]:
# Predict on the preprocessed test frame and pair every price with the Id that
# was saved when the test CSV was loaded.
test_predictions = model.predict(test)
submission = pd.DataFrame({'Id': Id, 'SalePrice': test_predictions})
# submission
# Written without the index so the file has exactly the two columns above.
submission.to_csv('submission.csv', index=False)