In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

In [None]:
# Load the training features, reading only the columns the model uses.
X_train = pd.read_csv(
    "house-prices/train.csv",
    usecols=[
        "MSZoning", "LotFrontage", "LotArea", "BldgType",
        "OverallQual", "OverallCond", "YearBuilt", "Foundation",
        "BsmtCond", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
        "GrLivArea", "KitchenQual", "TotRmsAbvGrd", "GarageType",
        "GarageCars", "GarageArea", "GarageQual", "GarageCond",
        "PoolArea", "MiscVal", "SaleType", "SaleCondition",
        "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
        "ExterQual", "ExterCond", "BsmtQual", "CentralAir",
        "YrSold",
    ],
)
X_train

In [None]:
# Load the regression target as a one-column frame from the training file.
y_train = pd.read_csv(
    "house-prices/train.csv",
    usecols=["SalePrice"],
)

In [None]:
# Load the test features with exactly the same column selection as the
# training features so both frames can go through one fitted preprocessor.
X_test = pd.read_csv(
    "house-prices/test.csv",
    usecols=[
        "MSZoning", "LotFrontage", "LotArea", "BldgType",
        "OverallQual", "OverallCond", "YearBuilt", "Foundation",
        "BsmtCond", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
        "GrLivArea", "KitchenQual", "TotRmsAbvGrd", "GarageType",
        "GarageCars", "GarageArea", "GarageQual", "GarageCond",
        "PoolArea", "MiscVal", "SaleType", "SaleCondition",
        "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
        "ExterQual", "ExterCond", "BsmtQual", "CentralAir",
        "YrSold",
    ],
)
# Display the frame that was just loaded (the cell previously showed X_train).
X_test

In [None]:
# Read the SalePrice column of the sample submission to use as y_test.
# NOTE(review): a sample submission file presumably holds placeholder/baseline
# predictions rather than true sale prices -- if so, scores computed against
# y_test do not measure real accuracy. Confirm, and consider a hold-out split
# of the training data instead.
y_test = pd.read_csv(
    "house-prices/sample_submission.csv",
    usecols=["SalePrice"],
)

In [None]:
def _as_year(column):
    """Return ``column`` as calendar-year numbers.

    Numeric input is assumed to already hold years and is returned unchanged.
    It must NOT be passed to ``pd.to_datetime``: integers are interpreted as
    nanoseconds since the Unix epoch there, so a value such as 2003 would
    become 1970-01-01 and every row would end up with the year 1970.
    Non-numeric input (e.g. date strings) is parsed and reduced to its year.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    return pd.to_datetime(column).dt.year


# Normalise the two year features in both frames. Safe to re-run: once a
# column is numeric it passes through untouched.
X_train['YearBuilt'] = _as_year(X_train['YearBuilt'])
X_test['YearBuilt'] = _as_year(X_test['YearBuilt'])
X_train['YrSold'] = _as_year(X_train['YrSold'])
X_test['YrSold'] = _as_year(X_test['YrSold'])

In [None]:
# removable_col = [x for x in X_train.columns if (X_train[x].isnull().sum() > 0)]
# removable_col

In [None]:
# df = df[removable_col].dropna()

In [None]:
# X = df.iloc[:,:-1]
# y = df.iloc[:,-1]

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=46)

In [None]:
# X_train

In [None]:
# Every non-numeric feature, in one list.
categoricalColumns = [
    "MSZoning", "BldgType", "Foundation", "BsmtCond", "KitchenQual",
    "GarageType", "GarageQual", "GarageCond", "SaleType", "SaleCondition",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "ExterQual",
    "ExterCond", "BsmtQual", "CentralAir",
]

# Graded quality/condition features: these have a natural order and are
# ordinal-encoded.
ordinalColumns = [
    "BsmtCond", "KitchenQual", "GarageQual", "GarageCond",
    "ExterQual", "ExterCond", "BsmtQual",
]

# Remaining categorical features: no inherent order, so they are one-hot
# encoded.
oneHotColumns = [
    "MSZoning", "BldgType", "Foundation", "GarageType", "SaleType",
    "SaleCondition", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
    "CentralAir",
]

In [None]:
# Numeric features: these are mean-imputed and standardised downstream.
numaricalColumns = [
    "LotFrontage", "OverallQual", "OverallCond", "LotArea", "YearBuilt",
    "YrSold", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "GrLivArea",
    "TotRmsAbvGrd", "GarageCars", "GarageArea", "PoolArea", "MiscVal",
]

# num_std= ['OverallQual', 'OverallCond', 'LotArea','YearBuilt','YrSold', 'TotalBsmtSF', '1stFlrSF' ,'2ndFlrSF' ,'GrLivArea' ,'TotRmsAbvGrd' , 'GarageCars' , 'GarageArea' ,'PoolArea' , 'MiscVal']

# num_im_st = ['LotFrontage']

In [None]:
# X_train.columns

In [None]:
# numaricalColumns


# Imputation and Scaling

In [None]:
# Numeric branch: fill missing values with the column mean, then standardise.
numerical_transfromer = Pipeline([
    ("num_imputer", SimpleImputer(strategy="mean")),
    ("numScaler", StandardScaler()),
])


# Ordinal branch: fill missing grades with the most frequent one, then map
# each grade to its rank. One category list per column, in the order the
# columns are handed to this pipeline (the order of `ordinalColumns`).
# NOTE(review): with read_csv's default NA handling, literal "NA" cells are
# presumably already NaN when they get here, so the imputer replaces them with
# the modal grade and the "NA" level below is never encountered -- confirm
# whether that is intended.
categorical_ordi_transformer = Pipeline([
    ("cat_imputer1", SimpleImputer(strategy="most_frequent")),
    (
        "ord_encod",
        OrdinalEncoder(
            categories=[
                ["NA", "Po", "Fa", "TA", "Gd", "Ex"],  # BsmtCond
                ["Po", "Fa", "TA", "Gd", "Ex"],        # KitchenQual
                ["NA", "Po", "Fa", "TA", "Gd", "Ex"],  # GarageQual
                ["NA", "Po", "Fa", "TA", "Gd", "Ex"],  # GarageCond
                ["Po", "Fa", "TA", "Gd", "Ex"],        # ExterQual
                ["Po", "Fa", "TA", "Gd", "Ex"],        # ExterCond
                ["NA", "Po", "Fa", "TA", "Gd", "Ex"],  # BsmtQual
            ]
        ),
    ),
])

# Nominal branch: fill missing values with the most frequent category, then
# one-hot encode. Categories unseen during fit are ignored at transform time
# (handle_unknown="ignore") instead of raising.
categorical_oneHot_transformer = Pipeline([
    ("cat_imputer1", SimpleImputer(strategy="most_frequent")),
    ("ohe_encod", OneHotEncoder(handle_unknown="ignore")),
])


# Applying the transformers

In [None]:
# Route each group of columns through its own pipeline and stack the results.
#   imp_stand  - numeric columns: mean-impute + standardise
#   cat_ordi   - graded columns: impute + ordinal-encode
#   cat_oneHot - nominal columns: impute + one-hot encode
# Columns not listed in any group are passed through unchanged.
#
# sparse_threshold=0 forces a dense array. The one-hot branch emits a sparse
# matrix, and with the default threshold (0.3) the stacked output silently
# switches between sparse and dense depending on the data's density; the PCA
# step that follows either rejects sparse input or changes solver on it,
# depending on the scikit-learn version.
preprocessor = ColumnTransformer(
    transformers=[
        ('imp_stand', numerical_transfromer, numaricalColumns),
        ('cat_ordi', categorical_ordi_transformer, ordinalColumns),
        ('cat_oneHot', categorical_oneHot_transformer, oneHotColumns),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

In [None]:
# clf = Pipeline(
#     steps=[
#         ('preprocessor',preprocessor),
#         ('classifier',LinearRegression())
#     ]
# )

In [None]:
# clf.fit(X_train,X_test)
# Learn the preprocessing from the training features only, then apply the
# fitted transformers to both sets (the tuple is evaluated left to right, so
# the fit happens before the test set is transformed).
X_train_trf, X_test_trf = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)


In [None]:
from sklearn.decomposition import PCA

# Reduce the preprocessed feature matrix to its first 10 principal components.
# random_state pins the randomised/ARPACK solvers (when scikit-learn selects
# one of them) so repeated runs give identical components.
pca = PCA(n_components=10, random_state=46)

# Fit on the training matrix only and reuse the fitted projection for test.
# NOTE: this overwrites X_train_trf / X_test_trf, so re-running this cell on
# its own would apply PCA to already-reduced data; re-run the preprocessing
# cell first.
X_train_trf = pca.fit_transform(X_train_trf)
X_test_trf = pca.transform(X_test_trf)

In [None]:
# Ordinary least-squares regression on the reduced training features.
lr = LinearRegression()
lr.fit(X=X_train_trf, y=y_train)

In [None]:

# Predict sale prices for the reduced test features.
y_pred = lr.predict(X=X_test_trf)

In [None]:

# Coefficient of determination of the predictions against y_test.
# NOTE(review): this is only a meaningful accuracy figure if y_test holds
# true sale prices -- confirm where y_test comes from.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# import plotly.express as px
# 
# y_train_trf = y_train.astype(str)
# fig = px.scatter_3d(df, x=X_train_trf[:, 0], y=X_train_trf[:, 1], z=X_train_trf[:, 2],
#                     color=y_train_trf)
# fig.update_layout(
#     margin=dict(l=20, r=20, t=20, b=20),
#     paper_bgcolor="LightSteelBlue",
# )
# fig.show()

In [None]:
# Fraction of missing values in each test-set column.
X_test.isna().mean()

In [None]:
# Show the NumPy version in use (useful when recording the environment).
np.__version__