# Data Wrangling 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw training data; the path is relative to the notebook's working directory.
raw_csv_path = '../cs2-housing/rawdata/train.csv'
housing_raw = pd.read_csv(raw_csv_path)

In [3]:
# Column-level overview of the raw frame: non-null count and dtype per column.
housing_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
# Preview the first rows of the raw data.
housing_raw.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Summarise missing data per column: absolute count and percentage of rows.
# The null mask is built once and reused for both statistics.
null_mask = housing_raw.isnull()
missing = pd.concat(
    # Round AFTER scaling to percent so the result keeps two clean decimals;
    # rounding the fraction first and then multiplying by 100 can reintroduce
    # floating-point noise (e.g. 17.740000000000002).
    [null_mask.sum(), (100 * null_mask.mean()).round(2)],
    axis=1,
)
missing.columns = ['count', '%']
missing_values = missing.sort_values(by=['count', '%'], ascending=False)
# Keep only the columns that actually have missing entries.
cols_missing = missing_values.loc[missing_values['count'] > 0]
cols_missing

Unnamed: 0,count,%
PoolQC,1453,99.52
MiscFeature,1406,96.3
Alley,1369,93.77
Fence,1179,80.75
FireplaceQu,690,47.26
LotFrontage,259,17.74
GarageType,81,5.55
GarageYrBlt,81,5.55
GarageFinish,81,5.55
GarageQual,81,5.55


In [6]:
# Rows whose SalePrice is strictly below 50000 (the lowest-priced sales).
is_low_price = housing_raw['SalePrice'] < 50000
lowest_final_price = housing_raw[is_low_price]
lowest_final_price

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
30,31,70,C (all),50.0,8500,Pave,Pave,Reg,Lvl,AllPub,...,0,,MnPrv,,0,7,2008,WD,Normal,40000
495,496,30,C (all),60.0,7879,Pave,,Reg,Lvl,AllPub,...,0,,GdWo,,0,11,2009,WD,Abnorml,34900
533,534,20,RL,50.0,5000,Pave,,Reg,Low,AllPub,...,0,,,,0,1,2007,WD,Normal,39300
916,917,20,C (all),50.0,9000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,10,2006,WD,Abnorml,35311
968,969,50,RM,50.0,5925,Pave,,Reg,Lvl,AllPub,...,0,,GdWo,,0,5,2009,WD,Abnorml,37900


In [7]:
# Miscellaneous features alongside their recorded values.
# dropna() removes every row where either of the two columns is missing.
misc_cols = ['MiscFeature', 'MiscVal']
housing_raw.loc[:, misc_cols].dropna()

Unnamed: 0,MiscFeature,MiscVal
5,Shed,700
7,Shed,350
16,Shed,700
17,Shed,500
51,Shed,400
84,Shed,700
95,Shed,480
98,Shed,400
99,Shed,400
106,Shed,450


In [8]:
# Build a data dictionary from the plain-text description file.
# Layout the parser relies on:
#   - a line starting at column 0 is a header: "<column name>: <description>"
#   - a line starting with a space is a value note for the most recent header
#     (apparently "<code><TAB><meaning>", judging by the rendered Notes column)
#   - whitespace-only lines are ignored
data_desc_file = '../cs2-housing/documents/data_description.txt'
# Rows of [name, description, notes]; this is a list despite the "_dict" name.
data_desc_dict = []

with open(data_desc_file) as desc_fh:
    for line in desc_fh:
        if line.isspace():
            continue
        if not line.startswith(" "):
            # Split on the FIRST colon only, so a description that itself
            # contains ':' is kept whole, and a header without any ':' yields
            # an empty description instead of raising IndexError.
            name, _sep, description = line.partition(":")
            data_desc_dict.append([name.strip(), description.strip(), ''])
        elif not data_desc_dict:
            # A value note with no preceding header means the file is malformed.
            raise ValueError(
                f"Indented line found before any column header in {data_desc_file!r}: {line!r}"
            )
        else:
            # Strip the newline and any trailing tabs/spaces BEFORE mapping tabs
            # to ':'; otherwise a trailing tab becomes a stray ':' in the notes
            # (e.g. "Grvl:Gravel:").
            data_desc_dict[-1][2] += line.rstrip().replace('\t', ':')

# Creating dataframe for data dictionary
column_names = ['Name of column', 'Description of column', 'Notes']
data_dict_df = pd.DataFrame(data_desc_dict, columns=column_names)
data_dict_df

Unnamed: 0,Name of column,Description of column,Notes
0,MSSubClass,Identifies the type of dwelling involved in th...,20:1-STORY 1946 & NEWER ALL STYLES ...
1,MSZoning,Identifies the general zoning classification o...,A:Agriculture C:Commercial ...
2,LotFrontage,Linear feet of street connected to property,
3,LotArea,Lot size in square feet,
4,Street,Type of road access to property,Grvl:Gravel: Pave:Paved
...,...,...,...
74,MiscVal,$Value of miscellaneous feature,
75,MoSold,Month Sold (MM),
76,YrSold,Year Sold (YYYY),
77,SaleType,Type of sale,WD :Warranty Deed - Conventional ...


In [9]:
# Persist the data dictionary as a CSV file.
# NOTE: with pandas' default (index=True) the row index is written as an
# unnamed first column; pass index=False if downstream readers don't need it.
data_dict_csv_path = '../cs2-housing/documents/data_description.csv'
data_dict_df.to_csv(data_dict_csv_path)

Notes:
- 1460 properties, 79 features (81 columns in total, including `Id` and the target `SalePrice`)
- dtypes: float64(3), int64(35), object(43)
- All final sales prices available