# Data Wrangling 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Parse the data description txt file into a list of [name, description, notes] rows.
# Non-indented lines are "Name: Description" headers; indented lines that follow are
# the value/level notes belonging to the most recent header.
# (The name `data_desc_dict` is kept for the rest of the notebook; it is a list of lists.)
data_desc_file = '../cs2-housing/documents/data_description.txt'
data_desc_dict = []

with open(data_desc_file) as file:
    for line in file:
        if line.isspace():
            continue
        if not line.startswith(" "):
            # Split on the FIRST colon only, so a description that itself contains a
            # colon is kept whole, and a header without a colon yields an empty
            # description instead of raising IndexError.
            name, _, description = line.partition(":")
            data_desc_dict.append([name.strip(), description.strip(), ''])
        else:
            # Continuation line: append to the notes of the latest header, turning the
            # tab between a level code and its meaning into a colon.
            data_desc_dict[-1][2] += line.replace('\n', '').replace('\t', ':')

# Creating pd dataframe for Data Dictionary, replacing blank space with NaN values
column_names = ['Name', 'Description', 'Notes']
data_dict_df = pd.DataFrame(data_desc_dict, columns=column_names)
data_dict_df = data_dict_df.replace(r'^\s*$', np.nan, regex=True)
data_dict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         79 non-null     object
 1   Description  79 non-null     object
 2   Notes        46 non-null     object
dtypes: object(3)
memory usage: 2.0+ KB


In [3]:
# Load the training CSV into the raw housing DataFrame
housing_raw = pd.read_csv('../cs2-housing/rawdata/train.csv')

In [4]:
# Summarize column dtypes and non-null counts of the raw data
housing_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Preview the first rows of the raw data
housing_raw.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# Creating list of columns that should have categorical dtype: 'Id' plus every
# data-dictionary entry that has notes (i.e. an enumerated set of levels).
# Columns are selected by label rather than by position in each row.
has_notes = data_dict_df['Notes'].notna()
dtype_object = ['Id'] + data_dict_df.loc[has_notes, 'Name'].tolist()

# Casting correct dtypes for categorical data columns and creating new df for cleaned housing data.
# The cast columns are placed first, followed by the remaining columns in their original
# order. Both pieces come from housing_raw, so they share the same index and can be
# joined column-wise directly instead of through a multi-key merge.
other_columns = [col for col in housing_raw.columns if col not in dtype_object]
housing_dtypes_clean = housing_raw[dtype_object].astype('object')
housing_clean = (
    pd.concat([housing_dtypes_clean, housing_raw[other_columns]], axis=1)
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)

In [7]:
# Build one datetime column, YrMoSold, from the sale year and sale month,
# then discard the two source columns (MoSold and YrSold).
sale_year = housing_clean['YrSold'].astype('str')
sale_month = housing_clean['MoSold'].astype('str')
housing_clean = housing_clean.assign(
    YrMoSold=pd.to_datetime(sale_year.str.cat(sale_month, sep='-'), format='%Y-%m')
).drop(columns=['MoSold', 'YrSold'])

In [8]:
# Count and percentage of missing values per column, largest first;
# only columns with at least one missing value are shown.
null_mask = housing_clean.isnull()
missing = pd.DataFrame({
    'count': null_mask.sum(),
    '%': 100 * null_mask.mean().round(4),
})
missing_values = missing.sort_values(by=['count', '%'], ascending=False)
cols_missing = missing_values[missing_values['count'].ne(0)]
cols_missing

Unnamed: 0,count,%
PoolQC,1453,99.52
MiscFeature,1406,96.3
Alley,1369,93.77
Fence,1179,80.75
FireplaceQu,690,47.26
LotFrontage,259,17.74
GarageType,81,5.55
GarageFinish,81,5.55
GarageQual,81,5.55
GarageCond,81,5.55


In [9]:
# Lowest sales prices (below 50,000).
# The mask is built from housing_clean itself so that it always lines up with the rows
# being selected, rather than relying on housing_raw sharing the same index.
lowest_final_price = housing_clean.loc[housing_clean['SalePrice'] < 50000]
lowest_final_price

Unnamed: 0,Id,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,SalePrice,YrMoSold
30,31,70,C (all),Pave,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,250,0,54,172,0,0,0,0,40000,2008-07-01
495,496,30,C (all),Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,523,115,0,0,0,0,34900,2009-11-01
533,534,20,RL,Pave,,Reg,Low,AllPub,Inside,Mod,...,0,0,0,0,0,0,0,0,39300,2007-01-01
916,917,20,C (all),Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,308,0,0,0,0,0,0,0,35311,2006-10-01
968,969,50,RM,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,0,0,37900,2009-05-01


In [10]:
# Rows that record a miscellaneous feature, shown with its monetary value
misc_columns = ['MiscFeature', 'MiscVal']
housing_clean.loc[:, misc_columns].dropna()

Unnamed: 0,MiscFeature,MiscVal
5,Shed,700
7,Shed,350
16,Shed,700
17,Shed,500
51,Shed,400
84,Shed,700
95,Shed,480
98,Shed,400
99,Shed,400
106,Shed,450


In [11]:
# Saving cleaned DFs to new csv files
# NOTE(review): to_csv is called without index=False, so the row index is written as an
# unnamed first column in both files; readers presumably need index_col=0 -- confirm.
# NOTE(review): the cleaned housing data is written into the rawdata directory, next to
# train.csv -- confirm this location is intended.
# NOTE(review): CSV does not store dtypes, so the object/datetime casts made in this
# notebook will likely need to be re-applied when the file is loaded -- verify downstream.
data_dict_df.to_csv('../cs2-housing/documents/data_dictionary.csv')
housing_clean.to_csv('../cs2-housing/rawdata/cleaned_data.csv')

Notes:
- Raw data: 1460 rows, 81 columns (Id + 80 features, including SalePrice)
- Clean data: 1460 rows, 80 columns (Id + 79 features, including SalePrice)
- dtypes: datetime64(1), float64(3), int64(29), object(47)
- All final sales prices available