In [1]:
# Import usual library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw training set into `df`; the path is relative to the notebook's working directory
df = pd.read_csv('./datasets/train.csv')

In [3]:
# Preview the first two rows as a quick sanity check of the loaded file
df.head(2)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,534,531363010,20,RL,80.0,9605,Pave,,Reg,Lvl,...,0,,,,0,4,2009,WD,Normal,159000
1,803,906203120,20,RL,90.0,14684,Pave,,IR1,Lvl,...,0,,,,0,6,2009,WD,Normal,271900


In [4]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2197 entries, 0 to 2196
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2197 non-null   int64  
 1   PID              2197 non-null   int64  
 2   MS SubClass      2197 non-null   int64  
 3   MS Zoning        2197 non-null   object 
 4   Lot Frontage     1835 non-null   float64
 5   Lot Area         2197 non-null   int64  
 6   Street           2197 non-null   object 
 7   Alley            143 non-null    object 
 8   Lot Shape        2197 non-null   object 
 9   Land Contour     2197 non-null   object 
 10  Utilities        2197 non-null   object 
 11  Lot Config       2197 non-null   object 
 12  Land Slope       2197 non-null   object 
 13  Neighborhood     2197 non-null   object 
 14  Condition 1      2197 non-null   object 
 15  Condition 2      2197 non-null   object 
 16  Bldg Type        2197 non-null   object 
 17  House Style   

In [5]:
# List the raw column names; these are the keys of the rename mapping built in the next cell
df.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [6]:
# Mapping from the raw column headers to snake_case names, one entry per column
# of the training file (82 in total).
#
# Two entries differ from the earlier version of this mapping:
#   * 'Open Porch SF' -> 'open_porch_sf' (was the typo 'open_porch_df')
#   * 'Sale Condition' -> 'sale_condition' (was missing, so the column kept its raw name)
# NOTE(review): anything downstream that still refers to 'open_porch_df' or
# 'Sale Condition' must be updated to the corrected names.
col_name_dict = {
    # identifiers and zoning
    'Order': 'id', 'PID': 'pid', 'MS SubClass': 'ms_subclass', 'MS Zoning': 'ms_zoning',
    # lot and surroundings
    'Lot Frontage': 'lot_frontage', 'Lot Area': 'lot_area', 'Street': 'street', 'Alley': 'alley',
    'Lot Shape': 'lot_shape', 'Land Contour': 'land_contour', 'Utilities': 'utilities',
    'Lot Config': 'lot_config', 'Land Slope': 'land_slope', 'Neighborhood': 'neighborhood',
    'Condition 1': 'condition_1', 'Condition 2': 'condition_2',
    # building
    'Bldg Type': 'building_type', 'House Style': 'house_style',
    'Overall Qual': 'overall_quality', 'Overall Cond': 'overall_condition',
    'Year Built': 'year_built', 'Year Remod/Add': 'year_remodel_add',
    'Roof Style': 'roof_style', 'Roof Matl': 'roof_material',
    'Exterior 1st': 'exterior_1', 'Exterior 2nd': 'exterior_2',
    'Mas Vnr Type': 'masonry_type', 'Mas Vnr Area': 'masonry_area',
    'Exter Qual': 'external_quality', 'Exter Cond': 'external_condition',
    'Foundation': 'foundation',
    # basement
    'Bsmt Qual': 'basement_quality', 'Bsmt Cond': 'basement_condition',
    'Bsmt Exposure': 'basement_exposure',
    'BsmtFin Type 1': 'basement_fin_type_1', 'BsmtFin SF 1': 'basement_fin_sf_1',
    'BsmtFin Type 2': 'basement_fin_type_2', 'BsmtFin SF 2': 'basement_fin_sf_2',
    'Bsmt Unf SF': 'basement_unf_sf', 'Total Bsmt SF': 'total_basement_sf',
    # utilities
    'Heating': 'heating', 'Heating QC': 'heating_qc', 'Central Air': 'central_air',
    'Electrical': 'electrical',
    # living area and rooms
    '1st Flr SF': '1st_floor_sf', '2nd Flr SF': '2nd_floor_sf',
    'Low Qual Fin SF': 'low_quality_fin_sf', 'Gr Liv Area': 'ground_living_area',
    'Bsmt Full Bath': 'basement_full_bath', 'Bsmt Half Bath': 'basement_half_bath',
    'Full Bath': 'full_bath', 'Half Bath': 'half_bath',
    'Bedroom AbvGr': 'bed_above_ground', 'Kitchen AbvGr': 'kitchen_above_ground',
    'Kitchen Qual': 'kitchen_quality', 'TotRms AbvGrd': 'total_rooms_above_ground',
    'Functional': 'functional',
    'Fireplaces': 'fireplace', 'Fireplace Qu': 'fireplace_quality',
    # garage and driveway
    'Garage Type': 'garage_type', 'Garage Yr Blt': 'garage_year_built',
    'Garage Finish': 'garage_fin', 'Garage Cars': 'garage_cars', 'Garage Area': 'garage_area',
    'Garage Qual': 'garage_quality', 'Garage Cond': 'garage_condition',
    'Paved Drive': 'paved_drive',
    # outdoor areas and extras
    'Wood Deck SF': 'wood_deck_sf', 'Open Porch SF': 'open_porch_sf',
    'Enclosed Porch': 'enclosed_porch', '3Ssn Porch': '3ssn_porch',
    'Screen Porch': 'screen_porch', 'Pool Area': 'pool_area', 'Pool QC': 'pool_quality',
    'Fence': 'fence', 'Misc Feature': 'misc_feature', 'Misc Val': 'misc_value',
    # sale
    'Mo Sold': 'month_sold', 'Yr Sold': 'year_sold', 'Sale Type': 'sale_type',
    'Sale Condition': 'sale_condition', 'SalePrice': 'sale_price',
}

In [7]:
# Apply the raw-name -> snake_case mapping to the column labels
df = df.rename(col_name_dict, axis='columns')

In [8]:
# Display (rows, columns) after the rename
df.shape

(2197, 82)

In [9]:
# Preview the first three rows to inspect the renamed column labels
df.head(3)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,pool_area,pool_quality,fence,misc_feature,misc_value,month_sold,year_sold,sale_type,Sale Condition,sale_price
0,534,531363010,20,RL,80.0,9605,Pave,,Reg,Lvl,...,0,,,,0,4,2009,WD,Normal,159000
1,803,906203120,20,RL,90.0,14684,Pave,,IR1,Lvl,...,0,,,,0,6,2009,WD,Normal,271900
2,956,916176030,20,RL,,14375,Pave,,IR1,Lvl,...,0,,,,0,1,2009,COD,Abnorml,137500


In [10]:
# Fraction of missing values per column, keeping only columns that have any, largest first
null_share = df.isnull().mean()
null_share[null_share > 0].sort_values(ascending=False)

pool_quality           0.994538
misc_feature           0.963587
alley                  0.934911
fence                  0.809285
fireplace_quality      0.485207
lot_frontage           0.164770
garage_condition       0.055530
garage_quality         0.055530
garage_fin             0.055530
garage_year_built      0.055530
garage_type            0.054620
basement_exposure      0.031406
basement_fin_type_2    0.030951
basement_fin_type_1    0.030496
basement_quality       0.030496
basement_condition     0.030496
masonry_area           0.010014
masonry_type           0.010014
basement_half_bath     0.000455
basement_full_bath     0.000455
total_basement_sf      0.000455
basement_unf_sf        0.000455
garage_cars            0.000455
garage_area            0.000455
basement_fin_sf_2      0.000455
basement_fin_sf_1      0.000455
electrical             0.000455
dtype: float64

In [11]:
# Remove the four features that are missing in more than 80% of the rows
df = df.drop(columns=['pool_quality', 'alley', 'misc_feature', 'fence'])

In [12]:
# Alternative imputation (disabled): fill each column's missing values with its most frequent value (mode)

#df['masonry_area'].fillna(df['masonry_area'].mode()[0],inplace=True)
#df['masonry_type'].fillna(df['masonry_type'].mode()[0],inplace=True)
#df['lot_frontage'].fillna(df['lot_frontage'].mode()[0],inplace=True)
#df['fireplace_quality'].fillna(df['fireplace_quality'].mode()[0],inplace=True)
#df['garage_type'].fillna(df['garage_type'].mode()[0],inplace=True)
#df['garage_year_built'].fillna(df['garage_year_built'].mode()[0],inplace=True)
#df['garage_fin'].fillna(df['garage_fin'].mode()[0],inplace=True)
#df['garage_quality'].fillna(df['garage_quality'].mode()[0],inplace=True)
#df['garage_condition'].fillna(df['garage_condition'].mode()[0],inplace=True)
#df['basement_quality'].fillna(df['basement_quality'].mode()[0],inplace=True)
#df['basement_condition'].fillna(df['basement_condition'].mode()[0],inplace=True)
#df['basement_exposure'].fillna(df['basement_exposure'].mode()[0],inplace=True)
#df['basement_fin_type_1'].fillna(df['basement_fin_type_1'].mode()[0],inplace=True)
#df['basement_fin_type_2'].fillna(df['basement_fin_type_2'].mode()[0],inplace=True)

In [13]:
# Impute missing values instead of dropping the affected rows:
#   * masonry_area and lot_frontage are filled with 0
#   * every other column listed below is filled with the literal string 'NA'
#
# Each filled column is assigned back to the frame rather than calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on an
# intermediate Series, triggers a FutureWarning on recent pandas 2.x releases and
# leaves `df` unchanged once Copy-on-Write is enabled.
#
# NOTE(review): garage_year_built is presumably a numeric year column; filling it
# with the string 'NA' makes it hold both numbers and text. Confirm this is intended
# before treating the column as numeric.
fill_values = {
    'masonry_area': 0,
    'masonry_type': 'NA',
    'lot_frontage': 0,
    'fireplace_quality': 'NA',
    'garage_type': 'NA',
    'garage_year_built': 'NA',
    'garage_fin': 'NA',
    'garage_quality': 'NA',
    'garage_condition': 'NA',
    'basement_quality': 'NA',
    'basement_condition': 'NA',
    'basement_exposure': 'NA',
    'basement_fin_type_1': 'NA',
    'basement_fin_type_2': 'NA',
}

for column, fill_value in fill_values.items():
    df[column] = df[column].fillna(fill_value)

In [14]:
# Discard rows that have no value for basement_fin_sf_1
df = df.dropna(subset=['basement_fin_sf_1'])

In [15]:
# Discard rows that have no value for garage_cars
df = df.dropna(subset=['garage_cars'])

In [16]:
# Discard rows that have no value for electrical
df = df.dropna(subset=['electrical'])

In [17]:
# df.to_csv('./datasets/train_cleaned_mode.csv', index=False)

In [18]:
# Write the cleaned frame (the 'NA'/0-filled variant) to disk without the index column.
# NOTE(review): pandas.read_csv treats the text 'NA' as a missing value by default, so a
# downstream reader probably needs keep_default_na=False (or explicit na_values) to get
# the 'NA' placeholders back as strings — confirm against the consuming notebook.
df.to_csv('./datasets/train_cleaned_NA.csv', index=False)

----