## Exploratory data analysis

Read in and analyse the training data, then save important information to a JSON file for later use in feature engineering and model training.

In [52]:
# Imports

import pandas as pd


In [53]:
# Load the raw training set from disk; the trailing expression displays (rows, columns)
raw_train_csv = '../data/raw/train.csv'
data = pd.read_csv(raw_train_csv)
data.shape

(1460, 81)

In [54]:
# Print the per-column summary of the frame: non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [55]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [56]:
# Collect dataset metadata in a single dictionary, starting with the column
# labels and the per-column dtypes.
# NOTE(review): both values are pandas objects (Index / Series), not plain
# Python types - if `info` is later dumped to JSON they presumably need
# converting first; confirm where the file is written.
info = {
    'columns': data.columns,
    'data_types': data.dtypes,
}

In [57]:
# Count the missing entries in each column
null_mask = data.isna()
missing_values = null_mask.sum()
print(missing_values)

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64


In [58]:
# Record the per-column missing counts, then keep every row that has at
# least one missing entry in any column.
info['missing_values'] = missing_values
row_has_missing = data.isna().any(axis='columns')
rows_with_missing_values = data.loc[row_has_missing]
info['rows_with_missing_values'] = rows_with_missing_values
rows_with_missing_values.shape

(1460, 81)

In [59]:
# How many columns contain at least one missing value?
# `missing_values` is rebound here to only the columns with a non-zero count.
missing_values = data.isna().sum().loc[lambda counts: counts > 0]
num_cols_missing_vals = len(missing_values)
info['num_cols_missing_vals'] = num_cols_missing_vals
num_cols_missing_vals

19

### Data Editing

In [60]:
# One-hot encode every non-numerical column.
# Select the columns whose dtype is neither int64 nor float64
non_numerical_cols = data.select_dtypes(exclude=['int64', 'float64']).columns

# Expand each selected column into indicator columns; `data` itself is left unchanged
one_hot_data = pd.get_dummies(data, columns=non_numerical_cols)
# Dtypes of the original frame, for reference
print(data.dtypes)
# Save the encoded frame to csv (previously the unencoded `data` was written here by mistake)
one_hot_data.to_csv('../data/processed/train_one-hot-encoded.csv', index=False)

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object


In [61]:
# Inspect the Python type of the 'Alley' entry at row label 0
type(data.loc[0, 'Alley'])

float

In [62]:
# Build a variant of the dataset without any column that contains a missing value
data_copy = data.dropna(axis='columns')
# Persist that variant as csv, then display its (rows, columns)
no_missing_cols_csv = '../data/processed/train_no_missing_vals_cols_removed.csv'
data_copy.to_csv(no_missing_cols_csv, index=False)
data_copy.shape

(1460, 62)

In [63]:
# Create copy of the dataset which replaces missing values with the column mean.
# The means are computed over the numeric columns only: taking the mean of the
# string (object) columns is what raised the TypeError here before.
# fillna matches the means to columns by label, so non-numeric columns are
# left as they are, including any missing values they contain.
data_copy = data.fillna(data.mean(numeric_only=True))
# save the data to csv
data_copy.to_csv('../data/processed/train_missing_vals_replaced_with_mean.csv', index=False)