In [589]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import tensorflow
from keras.models import Sequential
from keras.layers import Dense,BatchNormalization,LayerNormalization,SimpleRNN



In [590]:
def wrangle(filepath):
    df=pd.read_csv(filepath)
    # Subset data: Remove outliers for " # Subset data: Remove outliers for "LotArea"
    low, high = df["LotArea"].quantile([0.1, 0.9])
    mask_area = df["LotArea"].between(low, high)
    df = df[mask_area]
    #removing missing columns with 50% or more missing values
    null_percentage = df.isnull().mean() * 100
    columns_to_drop = null_percentage[null_percentage > 50].index
    df = df.drop(columns_to_drop, axis=1)
    #solving multicolinearity
    low_cardinality_threshold = 10
    high_cardinality_threshold = len(df) * 0.6
    unique_counts = df.select_dtypes(include='object').nunique()
    low_cardinality_columns = unique_counts[unique_counts < low_cardinality_threshold].index
    high_cardinality_columns = unique_counts[unique_counts > high_cardinality_threshold].index
    df = df.drop(low_cardinality_columns, axis=1)
    df = df.drop(high_cardinality_columns, axis=1)
    #droping missing values in rows
    df=  df.dropna(axis=0)
    df=df.drop(columns='Id')
    return df
                      

In [591]:
df=wrangle('C:/Users/knigh/Downloads/house-prices-advanced-regression-techniques/train.csv')

In [592]:
def wrangle2(filepath):
    df=pd.read_csv(filepath)
    # Subset data: Remove outliers for " # Subset data: Remove outliers for "LotArea"
    #removing missing columns with 50% or more missing values
   
    #solving multicolinearity
    low_cardinality_threshold = 10
    high_cardinality_threshold = len(df) * 0.6
    unique_counts = df.select_dtypes(include='object').nunique()
    low_cardinality_columns = unique_counts[unique_counts < low_cardinality_threshold].index
    high_cardinality_columns = unique_counts[unique_counts > high_cardinality_threshold].index
    df = df.drop(low_cardinality_columns, axis=1)
    df = df.drop(high_cardinality_columns, axis=1)
    null_percentage = df.isnull().mean() * 100
    columns_to_drop = null_percentage[null_percentage > 50].index
    df = df.drop(columns_to_drop, axis=1)
    #droping missing values in rows
    df=df.fillna(method='ffill')
    df=df.drop(columns='Id')
    return df

In [593]:
df2=wrangle2('C:/Users/knigh/Downloads/house-prices-advanced-regression-techniques/test.csv')

In [594]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 921 entries, 0 to 1459
Data columns (total 40 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     921 non-null    int64  
 1   LotFrontage    921 non-null    float64
 2   LotArea        921 non-null    int64  
 3   Neighborhood   921 non-null    object 
 4   OverallQual    921 non-null    int64  
 5   OverallCond    921 non-null    int64  
 6   YearBuilt      921 non-null    int64  
 7   YearRemodAdd   921 non-null    int64  
 8   Exterior1st    921 non-null    object 
 9   Exterior2nd    921 non-null    object 
 10  MasVnrArea     921 non-null    float64
 11  BsmtFinSF1     921 non-null    int64  
 12  BsmtFinSF2     921 non-null    int64  
 13  BsmtUnfSF      921 non-null    int64  
 14  TotalBsmtSF    921 non-null    int64  
 15  1stFlrSF       921 non-null    int64  
 16  2ndFlrSF       921 non-null    int64  
 17  LowQualFinSF   921 non-null    int64  
 18  GrLivArea     

In [595]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
def preprocess_data(df):
    # Separate the target variable (SalePrice) from the features
    X = df.drop(columns='SalePrice')
    y=df['SalePrice']   

    # Define the columns as per your dataset where categorical and numerical columns are located
    categorical_features = []
    numerical_columns = []
    
    for column in df.columns:
        if df[column].dtype == 'object': 
            categorical_features.append(column)
        elif df[column].dtype in ['int', 'float']:
            numerical_columns.append(column)
    
    # Create transformers for encoding categorical columns and standardizing numerical columns
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    # Create a column transformer to apply different preprocessing to different columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features),
            ('num', numerical_transformer, numerical_columns)
        ]
    )
    x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
    # Apply the preprocessing to fit the data
    train_X_preprocessed = preprocessor.fit_transform(x_train),
    test_x_preprocessed=preprocessor.transform(x_test)

    return train_X_preprocessed,test_x_preprocessed,y_train.values.reshape(-1,1),y_test.values.reshape(-1,1),preprocessor
     

# Assuming you have separate training and test CSV files named 'train.csv' and 'test.csv'

# Preprocess the training data

# Use the preprocessed data for further modeling or analysis
# X_train_preprocessed contains the preprocessed training features
# y_train contains the target variable of the training data
# X_test_preprocessed contains the preprocessed test features



In [596]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 39 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   LotFrontage    1459 non-null   float64
 2   LotArea        1459 non-null   int64  
 3   Neighborhood   1459 non-null   object 
 4   OverallQual    1459 non-null   int64  
 5   OverallCond    1459 non-null   int64  
 6   YearBuilt      1459 non-null   int64  
 7   YearRemodAdd   1459 non-null   int64  
 8   Exterior1st    1459 non-null   object 
 9   Exterior2nd    1459 non-null   object 
 10  MasVnrArea     1459 non-null   float64
 11  BsmtFinSF1     1459 non-null   float64
 12  BsmtFinSF2     1459 non-null   float64
 13  BsmtUnfSF      1459 non-null   float64
 14  TotalBsmtSF    1459 non-null   float64
 15  1stFlrSF       1459 non-null   int64  
 16  2ndFlrSF       1459 non-null   int64  
 17  LowQualFinSF   1459 non-null   int64  
 18  GrLivAre

In [597]:
x_train,x_test,y_train,y_test,preprosessor=preprocess_data(df)




In [598]:
x=preprosessor.transform(df2)

In [599]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

In [600]:
from tensorflow.keras import regularizers
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(patience=5, restore_best_weights=True)
model=Sequential([
Dense(units=53,activation='relu',input_shape=[53],kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)),
BatchNormalization(),
LayerNormalization(),
Dense(units=53,activation='relu',kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)),
Dense(units=1)
])

In [601]:
model.compile(optimizer='adam',loss='mean_squared_error',metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [602]:
model.fit(x_train,y_train,epochs=500,batch_size=32,validation_data=(x_test,y_test),callbacks=[early_stopping])

Epoch 1/500


Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 7

<keras.callbacks.History at 0x20c4860a850>

In [603]:
model.summary()

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_78 (Dense)            (None, 53)                2862      
                                                                 
 batch_normalization_21 (Bat  (None, 53)               212       
 chNormalization)                                                
                                                                 
 layer_normalization_16 (Lay  (None, 53)               106       
 erNormalization)                                                
                                                                 
 dense_79 (Dense)            (None, 53)                2862      
                                                                 
 dense_80 (Dense)            (None, 1)                 54        
                                                                 
Total params: 6,096
Trainable params: 5,990
Non-train

In [604]:
y_pred = model.predict(x_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[214299.09 215000.  ]
 [240016.97 217500.  ]
 [144363.   174900.  ]
 [316194.84 250000.  ]
 [143759.36 167000.  ]
 [158901.41 118964.  ]
 [242993.86 221500.  ]
 [205078.41 185850.  ]
 [140162.25 127500.  ]
 [126441.6  109000.  ]
 [168232.48 158500.  ]
 [148964.92 174000.  ]
 [175467.8  145000.  ]
 [163954.34 197500.  ]
 [ 95431.86 129500.  ]
 [139219.84 150000.  ]
 [117495.05 161000.  ]
 [133721.73 118500.  ]
 [131210.23 130000.  ]
 [181232.17 173000.  ]
 [138195.59 130000.  ]
 [245546.94 263435.  ]
 [138876.53 132000.  ]
 [148870.61 125000.  ]
 [179117.28 169000.  ]
 [202563.27 239500.  ]
 [240280.98 253000.  ]
 [137575.16 128000.  ]
 [190201.02 156932.  ]
 [296045.75 312500.  ]
 [121958.93 108000.  ]
 [302250.28 325624.  ]
 [169857.83 163500.  ]
 [186556.52 153337.  ]
 [262597.   254000.  ]
 [141144.72 148000.  ]
 [134147.41  94750.  ]
 [181310.75 189000.  ]
 [116148.35 107000.  ]
 [265166.59 236000.  ]
 [183769.27 215000.  ]
 [312105.22 315750.  ]
 [141012.97 155000.  ]
 [209241.22

In [605]:
prediction= model.predict(x)

 1/46 [..............................] - ETA: 0s



In [606]:
prediction

array([[133318.78],
       [142211.55],
       [185431.08],
       ...,
       [167617.44],
       [137860.22],
       [167843.9 ]], dtype=float32)

In [607]:
df2['SalePrice']=prediction

In [608]:
df3=pd.read_csv('C:/Users/knigh/Downloads/house-prices-advanced-regression-techniques/test.csv')

In [609]:
df2['Id']=df3['Id']

In [610]:
df2

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Neighborhood,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,Exterior2nd,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,Id
0,20,80.0,11622,NAmes,5,6,1961,1961,VinylSd,VinylSd,...,0,0,0,120,0,0,6,2010,133318.781250,1461
1,20,81.0,14267,NAmes,6,6,1958,1958,Wd Sdng,Wd Sdng,...,36,0,0,0,0,12500,6,2010,142211.546875,1462
2,60,74.0,13830,Gilbert,5,5,1997,1998,VinylSd,VinylSd,...,34,0,0,0,0,0,3,2010,185431.078125,1463
3,60,78.0,9978,Gilbert,6,6,1998,1998,VinylSd,VinylSd,...,36,0,0,0,0,0,6,2010,188255.406250,1464
4,120,43.0,5005,StoneBr,8,5,1992,1992,HdBoard,HdBoard,...,82,0,0,144,0,0,1,2010,202364.093750,1465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,21.0,1936,MeadowV,4,7,1970,1970,CemntBd,CmentBd,...,0,0,0,0,0,0,6,2006,160548.421875,2915
1455,160,21.0,1894,MeadowV,4,5,1970,1970,CemntBd,CmentBd,...,24,0,0,0,0,0,4,2006,160548.421875,2916
1456,20,160.0,20000,Mitchel,5,7,1960,1996,VinylSd,VinylSd,...,0,0,0,0,0,0,9,2006,167617.437500,2917
1457,85,62.0,10441,Mitchel,5,5,1992,1992,HdBoard,Wd Shng,...,32,0,0,0,0,700,7,2006,137860.218750,2918


In [611]:
df3

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [612]:
df2=pd.DataFrame({'Id': df2['Id'], 'SalePrice': df2['SalePrice']})

In [613]:
df2

Unnamed: 0,Id,SalePrice
0,1461,133318.781250
1,1462,142211.546875
2,1463,185431.078125
3,1464,188255.406250
4,1465,202364.093750
...,...,...
1454,2915,160548.421875
1455,2916,160548.421875
1456,2917,167617.437500
1457,2918,137860.218750


In [615]:
df2.to_csv('submission.csv', index=False)