In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import warnings

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer, make_column_transformer

from src.features import preprocessing
from src.models import train_model


# Silence every warning category for the whole session.
# NOTE(review): this blanket filter also hides deprecation notices — consider narrowing it.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# releases removed the old names. Prefer the new name when this Matplotlib ships it
# and fall back to the legacy name otherwise.
_paper_style = (
    'seaborn-v0_8-paper'
    if 'seaborn-v0_8-paper' in plt.style.available
    else 'seaborn-paper'
)
plt.style.use([_paper_style])
plt.rcParams['font.family'] = 'Arial'

# The bare os.getcwd() that used to sit here was removed: it was not the last
# expression of the cell, so its value was discarded and never displayed.

def main():
    """Fit and evaluate a linear-regression pipeline on the housing data.

    Reads ``train.csv`` from the current working directory, keeps a fixed
    subset of feature columns, builds a preprocessing + ``LinearRegression``
    pipeline, then hands it to the project's training and evaluation helpers.

    Returns:
        None. Whatever ``train_model.evaluate_model`` reports is its own
        side effect.
    """
    df = pd.read_csv("train.csv")

    # NOTE(review): presumably splits the frame into model inputs and the
    # 'SalePrice' target — confirm against src.features.preprocessing.
    in_features, out_features = preprocessing.make_dataset(df, 'SalePrice')

    # Hand-picked subset of columns used as model inputs.
    feat_names = [
        "LotArea",
        'YearBuilt',
        '1stFlrSF',
        '2ndFlrSF',
        'FullBath',
        'BedroomAbvGr',
        'TotRmsAbvGrd',
        'HouseStyle',
    ]
    features = in_features[feat_names]

    # NOTE(review): the preprocessing step is built from the selected features;
    # what it does per column is defined in src.features.preprocessing.
    preprocess_pipeline = preprocessing.preprocess_pipeline(features)

    model = make_pipeline(preprocess_pipeline, LinearRegression())

    model, prediction, actual = train_model.train_model(features, out_features, model)

    # Pass the name bound on the line above; the original passed the undefined
    # name `predictions`, which raised NameError.
    train_model.evaluate_model(prediction, actual, model)

#if __name__ == "__main__":
#   main()

    


def display_df_info(df_name, df, v=False):
    """Print a quick overview of a DataFrame.

    Args:
        df_name: Label for the frame. Currently unused; kept so existing
            callers continue to work.
        df: Object exposing ``shape``, ``head()`` and ``info()``
            (e.g. a pandas DataFrame).
        v: When true, additionally emit the ``df.info()`` summary.

    Returns:
        None. All output goes to stdout.
    """
    print("Shape (rows, cols)={}".format(df.shape))
    print(df.head())

    if v:
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() appended a stray "None" line to the output.
        df.info()
        
        
        
        
        

In [8]:
# `os` is already imported in the first cell; this repeat import is redundant
# there but lets the cell run on its own.
import os
# Show the working directory that relative paths such as 'train.csv' resolve against.
os.getcwd()


'C:\\Users\\user\\AI_SUSS'

In [9]:
# Load the raw training table from the current working directory.
house_data = pd.read_csv('train.csv')

In [10]:
# (rows, columns) of the raw table.
house_data.shape

(1460, 81)

In [11]:
# Tally columns per dtype, then split them into object-typed columns (n_cols)
# and all remaining columns (c_cols).
col_dtypes = house_data.dtypes
print(col_dtypes.value_counts())

is_object = col_dtypes == 'object'
n_cols = col_dtypes[is_object]
c_cols = col_dtypes[~is_object]

print(' No. of n col is {}'.format(len(n_cols)))
print(' No. of c col is {}'.format(len(c_cols)))


object     43
int64      35
float64     3
dtype: int64
 No. of n col is 43
 No. of c col is 38


In [12]:
# List every column label in the raw table.
house_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [13]:
# include='all' summarises non-numeric columns too (the unique/top/freq rows in the output).
print(house_data.describe(include ='all'))

                 Id   MSSubClass MSZoning  LotFrontage        LotArea Street  \
count   1460.000000  1460.000000     1460  1201.000000    1460.000000   1460   
unique          NaN          NaN        5          NaN            NaN      2   
top             NaN          NaN       RL          NaN            NaN   Pave   
freq            NaN          NaN     1151          NaN            NaN   1454   
mean     730.500000    56.897260      NaN    70.049958   10516.828082    NaN   
std      421.610009    42.300571      NaN    24.284752    9981.264932    NaN   
min        1.000000    20.000000      NaN    21.000000    1300.000000    NaN   
25%      365.750000    20.000000      NaN    59.000000    7553.500000    NaN   
50%      730.500000    50.000000      NaN    69.000000    9478.500000    NaN   
75%     1095.250000    70.000000      NaN    80.000000   11601.500000    NaN   
max     1460.000000   190.000000      NaN   313.000000  215245.000000    NaN   

       Alley LotShape LandContour Utili

In [17]:
# Summary statistics of the SalePrice column.
house_data['SalePrice'].describe()


count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [19]:
# Columns that contain at least one null value.
null_counts = house_data.isnull().sum()
miss_col = house_data.columns[null_counts > 0]

print(miss_col)

# Share of columns (not cells) that have any missing value.
miss_share = len(miss_col) / house_data.shape[1] * 100
print(f' \n Total % of missing columns is  {miss_share:.2f}%')

Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')
 
 Total % of missing columns is  23.46%


In [20]:
# Percentage of null entries in every column (columns without nulls show 0).
print(house_data.isnull().sum() /len(house_data)* 100)


Id                0.000000
MSSubClass        0.000000
MSZoning          0.000000
LotFrontage      17.739726
LotArea           0.000000
                   ...    
MoSold            0.000000
YrSold            0.000000
SaleType          0.000000
SaleCondition     0.000000
SalePrice         0.000000
Length: 81, dtype: float64


In [22]:
# Percentage of null entries per column; show only affected columns, worst first.
miss_percent = house_data.isnull().sum() / len(house_data) * 100
has_missing = miss_percent > 0
print(miss_percent[has_missing].sort_values(ascending=False))

PoolQC          99.520548
MiscFeature     96.301370
Alley           93.767123
Fence           80.753425
FireplaceQu     47.260274
LotFrontage     17.739726
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
BsmtExposure     2.602740
BsmtFinType2     2.602740
BsmtFinType1     2.534247
BsmtCond         2.534247
BsmtQual         2.534247
MasVnrArea       0.547945
MasVnrType       0.547945
Electrical       0.068493
dtype: float64


In [24]:
# The five columns with the highest missing-value share in the table printed
# earlier (about 47% to 99.5% null).
drop_col = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']

# Build a new frame and leave house_data untouched. `columns=` already selects
# the axis, so no separate `axis` argument is needed.
house_data_interim = house_data.drop(columns=drop_col)

In [25]:
# Should show five fewer columns than house_data after the drop.
house_data_interim.shape

(1460, 76)