In [None]:
import numpy as np 
import pandas as pd

# Load Data

In [None]:
# Single location of the Kaggle competition files, so the directory is spelled
# out once instead of being repeated in every read_csv call.
DATA_DIR = "/kaggle/input/house-prices-advanced-regression-techniques"

df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")
# Variable name (including its spelling) is kept unchanged so that any cell
# referring to it keeps working.
submision_data = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Group the training-frame column labels by storage dtype:
# one Index each for int64, float64 and object columns.
integer_columns, float_columns, object_columns = (
    df_train.select_dtypes(include=[dtype_name]).columns
    for dtype_name in ("int64", "float64", "object")
)

In [None]:
# Print each dtype group under its own header line.
for dtype_header, dtype_columns in (
    ('\nint64 columns:\n', integer_columns),
    ('\nfloat64 columns:\n', float_columns),
    ('\nobject columns:\n', object_columns),
):
    print(dtype_header, dtype_columns)

In [None]:
# Summary statistics of the training frame.
df_train.describe()

In [None]:
# One histogram per int64 column, drawn on a single 20x20 figure.
df_train[integer_columns].hist(figsize=(20,20))

In [None]:
# One histogram per float64 column, drawn on a single 20x20 figure.
df_train[float_columns].hist(figsize=(20,20))

In [None]:
# Histogram of the natural log of LotArea, to look at its distribution on a log scale.
np.log(df_train["LotArea"]).hist()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.pipeline import Pipeline , make_pipeline
from sklearn.compose import ColumnTransformer

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler , OneHotEncoder , OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error


In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# Find low-correlation columns in the dataset

In [None]:
# Collect int64 columns whose absolute correlation falls below a threshold,
# either against the target or against any other int64 column.
target_column = 'SalePrice'
low_corr_threshold = 0.2

correlation_matrix = df_train[integer_columns].corr()
abs_correlation = correlation_matrix.abs()

# (1) Columns only weakly correlated with the target.
low_corr_with_target = abs_correlation[target_column] < low_corr_threshold
low_corr_columns_target = correlation_matrix.columns[low_corr_with_target].tolist()

# (2) Columns that have at least one weak pairwise correlation with another
#     column; entries whose absolute correlation equals 1 are excluded.
# NOTE(review): this flags a column as soon as a single pair is weak, which
# presumably marks most columns (possibly the target too) -- confirm this is
# the intended selection rule.
low_corr_between_columns = (abs_correlation < low_corr_threshold) & (abs_correlation != 1)
row_has_weak_pair = low_corr_between_columns.any(axis=1)
low_corr_columns_between = low_corr_between_columns.index[row_has_weak_pair].tolist()

# Union of both criteria, de-duplicated (order is not meaningful).
low_corr_columns = list(set(low_corr_columns_target) | set(low_corr_columns_between))

# Train test split

In [None]:
# Target vector.
y_train = df_train["SalePrice"]

# Feature frame: every training column except the target.
X_train = df_train.drop(columns=["SalePrice"])
# Alternative feature frame with the low-correlation columns removed.
X_train_corr = df_train.drop(columns=low_corr_columns)

# The test frame is used as-is (same object, no copy).
X_test = df_test

# Heatmap

In [None]:
# Annotated correlation heatmap of the int64 columns.
# fmt=".2f" prints every cell as a fixed-point number with two decimals; the
# previous spec "0.0" has no presentation type, so values were rendered with a
# single significant digit and the diagonal showed up as "1e+00".
plt.figure(figsize=(20, 20))
sns.heatmap(df_train[integer_columns].corr(), annot=True, fmt=".2f")
plt.title("Correlation between int64 columns")
plt.show()

In [None]:
# Column groups taken from the feature frame by dtype.
num_attribs = X_train.select_dtypes(include=["int64", "float"]).columns
cat_attribs = X_train.select_dtypes(include=["object"]).columns

# Numeric branch: fill gaps with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
# Step names are the ones make_pipeline would generate.
cat_pipeline = Pipeline(steps=[
    ("simpleimputer", SimpleImputer(strategy="most_frequent")),
    ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),
])

In [None]:
# Route numeric columns through num_pipeline and object columns through
# cat_pipeline; columns belonging to neither group are dropped.
preprocessing = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ],
    remainder="drop",
)

In [None]:
def metric(model, y_train, X_train):
    """Score a fitted model against known targets.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        Fitted estimator or pipeline.
    y_train : array-like
        True target values.
    X_train : array-like or DataFrame
        Features passed to ``model.predict``.

    Returns
    -------
    tuple
        Flat tuple ``("mae", mae, "mse", mse, "R2", r2)``.
    """
    # Predict once and reuse the result for all three scores.
    predictions = model.predict(X_train)
    return (
        "mae", mean_absolute_error(y_train, predictions),
        "mse", mean_squared_error(y_train, predictions),
        "R2", r2_score(y_train, predictions),
    )


# Linear Regression

In [None]:
# Baseline model: the shared preprocessing step followed by a linear
# regression, fitted on the full training frame.
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(X_train, y_train)

In [None]:
# Load the sample submission, overwrite its SalePrice column with the
# linear-regression predictions for the test frame, and save the result.
sample_submission_df = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/sample_submission.csv'
).assign(SalePrice=lin_reg.predict(X_test))
sample_submission_df.to_csv('/kaggle/working/submission.csv', index=False)
sample_submission_df.head()

# XGBoost

In [None]:
# Gradient-boosted trees on top of the shared preprocessing step.
# L1 regularisation is set through `reg_alpha`, the named constructor
# parameter of the scikit-learn wrapper; `alpha` is only its booster-level
# alias and reached the model through the **kwargs passthrough.
xgb = Pipeline([
    ("pre", preprocessing),
    ('xgboost', XGBRegressor(reg_alpha=1)),
])

In [None]:
# Fit the XGBoost pipeline on the full training frame.
xgb.fit(X_train , y_train)

In [None]:
# Load the sample submission, overwrite its SalePrice column with the
# XGBoost pipeline's predictions for the test frame, and save the result.
sample_submission_df = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/sample_submission.csv'
).assign(SalePrice=xgb.predict(X_test))
sample_submission_df.to_csv('/kaggle/working/submissionxgb13.csv', index=False)
sample_submission_df.head()

# Random Forest

In [None]:
# Random forest on top of the shared preprocessing step.
# The target is a sale price, so a regressor is required: a classifier would
# treat every distinct price as its own class label.
# random_state pins the forest so repeated runs give the same predictions.
rfc = Pipeline([
    ("pre", preprocessing),
    # Step name kept as 'xgboost' so existing references to it keep working.
    ('xgboost', RandomForestRegressor(random_state=42)),
])

In [None]:
# Fit the random-forest pipeline on the full training frame.
rfc.fit(X_train , y_train)

In [None]:
# Load the sample submission, overwrite its SalePrice column with the
# random-forest pipeline's predictions for the test frame, and save the result.
sample_submission_df = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/sample_submission.csv'
).assign(SalePrice=rfc.predict(X_test))
sample_submission_df.to_csv('/kaggle/working/submissionrandomforest.csv', index=False)
sample_submission_df.head()

# Evaluate a pipeline with ordinal encoding and one-hot encoding

In [None]:
# Columns chosen for ordinal encoding; this list is handed to the
# OrdinalEncoder + StandardScaler branch of the ColumnTransformer.
ordinal_encoding_vars = ['MSSubClass', 'OverallQual', 'OverallCond']

# Columns chosen for one-hot encoding; this list is handed to the
# OneHotEncoder branch of the ColumnTransformer.
one_hot_encoding_vars = [
    'MSZoning', 'Street', 'Alley', 'LandContour', 'Utilities', 'LotConfig', 
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 
    'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical', 
    'GarageType', 'GarageFinish', 'PavedDrive', 'PoolQC', 'Fence', 
    'MiscFeature', 'SaleType', 'SaleCondition'
]


In [None]:
# Preprocessing variant with dedicated ordinal and one-hot branches.
#
# Each column is routed to exactly one branch. Previously the ordinal and
# one-hot columns were also part of num_attribs / cat_attribs, so they were
# encoded twice and the model saw duplicated features.

# Ordinal branch: values not seen during fit are encoded as -1 instead of
# raising at predict time.
ordinal_branch = Pipeline([
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ('standard_scaler', StandardScaler()),
])

# One-hot branch: missing values become their own explicit "Missing" category,
# and categories not seen during fit are ignored instead of raising.
# (drop='first' combined with handle_unknown='ignore' needs scikit-learn >= 1.0.)
onehot_branch = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Columns left for the generic numeric / categorical branches.
remaining_num_attribs = [c for c in num_attribs if c not in ordinal_encoding_vars]
remaining_cat_attribs = [c for c in cat_attribs if c not in one_hot_encoding_vars]

preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('ordinal_and_scaler', ordinal_branch, ordinal_encoding_vars),
        ('num', num_pipeline, remaining_num_attribs),
        ('cat', cat_pipeline, remaining_cat_attribs),
        ('onehot', onehot_branch, one_hot_encoding_vars),
    ],
    remainder='passthrough'  # Handle any remaining columns as-is
)

In [None]:
# XGBoost on top of the ordinal / one-hot preprocessing variant.
# This rebinds the name `xgb` to a new, unfitted pipeline.
# L1 regularisation is set through `reg_alpha`, the named constructor
# parameter of the scikit-learn wrapper; `alpha` is only its booster-level
# alias and reached the model through the **kwargs passthrough.
xgb = Pipeline([
    ("pre", preprocessing_pipeline),
    ('xgboost', XGBRegressor(reg_alpha=1)),
])

In [None]:
# Fit the current `xgb` pipeline on the full training frame.
xgb.fit(X_train , y_train)