In [344]:
##############################################################################
#####################################################################
#####################################################################
#
# DATA CLEANING: 
#                 null-values and object dtypes
#
#####################################################################
##############################################################################

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
from numpy import arange
import math

import seaborn as sns
sns.set_style("white")

import scipy as sp
from scipy import linalg, optimize

import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import collections
from collections import Counter

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.path as mpath
from matplotlib.dates import DateFormatter

import calendar
import datetime as dt
fromtimestamp = dt.datetime.fromtimestamp

 # importing raw input data
raw = pd.read_csv(filepath_or_buffer="../data/train.csv")
# confirm the load by showing only the first row
raw.iloc[:1]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500


In [346]:
# Safety first: all further work happens on a copy, `raw` stays untouched.
wdf = raw.copy(deep=True)

# Every value in column 'Id' has to be unique (expected output: True).
wdf['Id'].nunique(dropna=False) == len(wdf)

True

In [347]:
# add date col:

# Build the sale date from the month and year of sale. The explicit format
# makes the parsing deterministic instead of relying on pandas' format
# inference for "M-YYYY" strings; the day defaults to the first of the month.
wdf = wdf.assign(
    date=pd.to_datetime(
        wdf['MoSold'].astype(str) + "-" + wdf['YrSold'].astype(str),
        format="%m-%Y",
    )
)
# rename important cols with short and easy names + sort by date, price
wdf = wdf.rename(
    columns={'Id': 'id', 'SalePrice': 'price', 'YrSold': 'year', 'MoSold': 'month'}
).sort_values(['date', 'price'])

In [348]:
# Prepend a constant column ('const') to the working frame.
wdf = sm.add_constant(wdf, prepend=True)

# NA Values

In [349]:
# Null values: count missing values per column and flag mostly-empty columns.

# half the number of rows of the raw frame (the 50% threshold)
halfdf = len(raw) / 2

# one row per column: its name and its number of missing values, ordered by name
nasumdf = (
    wdf.isna().sum()
    .rename('sum_of_na')
    .rename_axis('col')
    .reset_index()
    .sort_values('col')
)
# keep only columns with missing values; reset_index() keeps the previous
# row label in a column called 'index'
nasumdf = nasumdf[nasumdf['sum_of_na'] > 0].reset_index()

# columns where more than half of the values are missing
naover50 = nasumdf[nasumdf['sum_of_na'] > halfdf]
# review:
naover50

Unnamed: 0,index,col,sum_of_na
0,7,Alley,1369
7,74,Fence,1179
17,75,MiscFeature,1406
18,73,PoolQC,1453


In [350]:
    # new df to store col with > 25% na vals
naover25 = nasumdf[nasumdf['sum_of_na'].gt(halfdf / 2)]
# review the columns with more than a quarter of their values missing
naover25

Unnamed: 0,index,col,sum_of_na
0,7,Alley,1369
7,74,Fence,1179
8,58,FireplaceQu,690
17,75,MiscFeature,1406
18,73,PoolQC,1453


In [351]:
##############################################################################
###  NA HANDLING:
#
# ⟹ observation: 4 cols contain over 50% na vals, 1 more (FireplaceQu) over 25%
#    ⟹ conclusion: info value not sufficient for further analysis
#    ⟹ decision: drop all cols in naover25: Alley, Fence, FireplaceQu, MiscFeature, PoolQC
# ⟹ observation: 14 more cols need further investigation
#    ⟹ next step: inspect the na counts of the remaining cols
#
##############################################################################

    # saving cols to drop in list
colstodrop = list(naover25['col'])

# remove those columns and keep the result as the new working frame
wdf = wdf.drop(columns=colstodrop)

In [352]:
# na count per remaining column, largest first, columns without na removed
nasumdf2 = (
    wdf.isna().sum()
    .rename('nas')
    .rename_axis('col')
    .reset_index()
    .sort_values('nas', ascending=False)
)
nasumdf2 = nasumdf2[nasumdf2['nas'] > 0]
# review
nasumdf2
# observation: 14 cols still have na vals
# decision: drop rows containing na, if these rows make <= 5.5% of len(raw)

Unnamed: 0,col,nas
4,LotFrontage,259
58,GarageYrBlt,81
63,GarageCond,81
57,GarageType,81
59,GarageFinish,81
62,GarageQual,81
35,BsmtFinType2,38
32,BsmtExposure,38
33,BsmtFinType1,37
31,BsmtCond,37


In [353]:
    # 5.5% of the number of rows in raw, stored in fivepc
fivepc = len(raw) / 100 * 5.5

In [354]:
    # columns whose na count is below the fivepc threshold
fivepcdf = nasumdf2.loc[nasumdf2['nas']<fivepc]
    # names of those columns as a list
colfivepc = fivepcdf['col'].to_list()
    # NOTE(review): the literal below replaces the list computed above, so the
    # threshold filter has no effect on which rows are dropped later.
    # nasumdf2 shows 81 na vals for each Garage* col; presumably these do not
    # pass `nas < fivepc` and were added by hand — TODO confirm against len(raw).
colfivepc = ['BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual', 'MasVnrArea', 'MasVnrType', 'Electrical', 'GarageYrBlt','GarageFinish', 'GarageQual', 'GarageCond', 'GarageType']


In [355]:
# Drop every row that has a missing value in any of the colfivepc columns.
# A single dropna(subset=...) replaces the per-column loop and its
# scratch list of index labels.
wdf = wdf.dropna(subset=colfivepc)

In [356]:
# review remaining na sums
nasumdf3 = (
    wdf.isna().sum()
    .rename('nas')
    .rename_axis('col')
    .reset_index()
    .sort_values('nas', ascending=False)
)
nasumdf3 = nasumdf3[nasumdf3['nas'] > 0]
list(nasumdf3['col'])

# observation: only one col remains: LotFrontage

['LotFrontage']

In [357]:
    # how often each LotFrontage value occurs
wdf.loc[:, 'LotFrontage'].value_counts()

60.0     120
80.0      67
70.0      63
75.0      50
50.0      49
        ... 
150.0      1
144.0      1
160.0      1
46.0       1
141.0      1
Name: LotFrontage, Length: 107, dtype: int64

In [358]:
# Key statistics of LotFrontage in one table instead of five separate cells
# (missing values are skipped by each statistic).
wdf['LotFrontage'].agg(['min', 'max', 'mean', 'median', 'std'])

In [363]:
    # rows in which LotFrontage is missing
wdf[wdf['LotFrontage'].isna()]

Unnamed: 0,const,id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,month,year,SaleType,SaleCondition,price,date
996,1.0,997,20,RL,,10659,Pave,IR1,Lvl,AllPub,...,0,0,0,0,1,2006,COD,Normal,136500,2006-01-01
370,1.0,371,60,RL,,8121,Pave,IR1,Lvl,AllPub,...,0,0,0,0,1,2006,WD,Normal,172400,2006-01-01
169,1.0,170,20,RL,,16669,Pave,IR1,Lvl,AllPub,...,0,0,0,0,1,2006,WD,Normal,228000,2006-01-01
545,1.0,546,50,RL,,13837,Pave,IR1,Lvl,AllPub,...,0,0,0,0,2,2006,WD,Normal,229000,2006-02-01
1033,1.0,1034,20,RL,,8125,Pave,Reg,Lvl,AllPub,...,0,0,0,0,2,2006,WD,Normal,230000,2006-02-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1358,1.0,1359,160,FV,,2117,Pave,Reg,Lvl,AllPub,...,0,0,0,0,6,2010,WD,Normal,177500,2010-06-01
470,1.0,471,120,RL,,6820,Pave,IR1,Lvl,AllPub,...,0,140,0,0,6,2010,WD,Normal,212000,2010-06-01
412,1.0,413,20,FV,,4403,Pave,IR2,Lvl,AllPub,...,0,0,0,0,6,2010,New,Partial,222000,2010-06-01
939,1.0,940,70,RL,,24090,Pave,Reg,Lvl,AllPub,...,0,0,0,0,6,2010,COD,Normal,244400,2010-06-01


In [364]:
###################################################################################################################
### NOTES:
#
# observation: col 'LotArea' has values where 'LotFrontage' is na
    # decision: try to figure out if there is some correlation between these two
###################################################################################################################

    # get these cols and save them to lotdf:
    
# Only the two lot columns, without rows that miss either value, ordered by area.
lotdf = wdf[['LotFrontage', 'LotArea']].dropna().sort_values('LotArea')

In [365]:
###################################################################################################################
### NOTES:
# 
# consideration:
#        if all lots were squares, the length of each frontage would be the square root of its area
# 
# conclusion:
#        assign a col with the frontage length each lot would have as a square, to review the deviations
###################################################################################################################

In [366]:
    # ifsqare: frontage length the lot would have if its area were a square
    # diff:    that hypothetical length minus the real frontage; rows sorted by diff
lotdf = (
    lotdf
    .assign(ifsqare=lambda frame: np.sqrt(frame['LotArea']))
    .assign(diff=lambda frame: frame['ifsqare'] - frame['LotFrontage'])
    .sort_values('diff')
)

# review key values of difference:

In [367]:
# Key statistics of the difference column in one table instead of five
# separate cells.
lotdf['diff'].agg(['std', 'max', 'min', 'mean', 'median'])

In [372]:
# the ten rows with the highest diff (lotdf is sorted by diff, ascending)
lotdf.iloc[-10:]

Unnamed: 0,LotFrontage,LotArea,ifsqare,diff
1409,46.0,20544,143.331783,97.331783
692,42.0,26178,161.796168,119.796168
271,73.0,39104,197.747314,124.747314
1184,50.0,35133,187.43799,137.43799
848,75.0,45600,213.541565,138.541565
53,68.0,50271,224.211953,156.211953
661,52.0,46589,215.844852,163.844852
769,47.0,53504,231.309317,184.309317
451,62.0,70761,266.009398,204.009398
313,150.0,215245,463.94504,313.94504


In [373]:
# the ten rows with the lowest diff (lotdf is sorted by diff, ascending)
lotdf.iloc[:10]

Unnamed: 0,LotFrontage,LotArea,ifsqare,diff
934,313.0,27650,166.282891,-146.717109
1127,182.0,14572,120.714539,-61.285461
1298,313.0,63887,252.758778,-60.241222
231,174.0,15138,123.03658,-50.96342
1211,152.0,12134,110.154437,-41.845563
909,149.0,12589,112.200713,-36.799287
1182,160.0,15623,124.992,-35.008
276,129.0,9196,95.895777,-33.104223
966,130.0,9600,97.97959,-32.02041
429,130.0,11457,107.037377,-22.962623


In [374]:
###################################################################################################################
### NOTES:
# 
# observation:
#       - variety of differences too big to draw a conclusion from the area shape
# decision:
#       - confirm via ols before filling na in wdf with a central value (the fill further down uses the mean)
#
###################################################################################################################

In [375]:
# OLS: regress LotFrontage on LotArea plus an intercept

# the constant column serves as the intercept term
lotdf = sm.add_constant(lotdf)

# fit the model
results = sm.OLS(
    endog=lotdf["LotFrontage"],
    exog=lotdf[["LotArea", "const"]],
).fit()

# show the regression summary
results.summary()

0,1,2,3
Dep. Variable:,LotFrontage,R-squared:,0.176
Model:,OLS,Adj. R-squared:,0.175
Method:,Least Squares,F-statistic:,233.5
Date:,"Sun, 13 Sep 2020",Prob (F-statistic):,6.43e-48
Time:,16:03:49,Log-Likelihood:,-4945.6
No. Observations:,1094,AIC:,9895.0
Df Residuals:,1092,BIC:,9905.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
LotArea,0.0013,8.2e-05,15.281,0.000,0.001,0.001
const,58.0678,1.069,54.323,0.000,55.970,60.165

0,1,2,3
Omnibus:,436.477,Durbin-Watson:,0.642
Prob(Omnibus):,0.0,Jarque-Bera (JB):,12811.7
Skew:,1.214,Prob(JB):,0.0
Kurtosis:,19.588,Cond. No.,20700.0


In [376]:
###################################################################################################################
### SUMMARY Frontage/Area:
#
# LotFrontage vals:
#                  - std: 24.508858570320566
#                  - mean: 70.75959780621572
#                  - median: 70.0
#                  - min: 21.0
#                  - max: 313.0
#
# OLS Frontage/Area vals:
#                  - coef: 0.0013
#                  - Adj. R-squared: 0.175
#                  - t: 15.281
#
###################################################################################################################

In [377]:
###################################################################################################################
### NOTES:
#
# conclusion: OLS confirms that LotArea alone is not sufficient to estimate the missing LotFrontage vals (Adj. R-squared: 0.175)
#
# decision: missing val will be filled with mean
#
###################################################################################################################

# mean of the known LotFrontage values
fmean = wdf['LotFrontage'].mean()

# replace the missing LotFrontage values with that mean
wdf = wdf.assign(LotFrontage=wdf['LotFrontage'].fillna(fmean))

###################################################################################################################
###################################################################################################################

In [379]:
###################################################################################################################
# confirm: no remaining na vals in wdf (expected output: True)

# na count per column; the count sits in column 0
confirm = pd.DataFrame(wdf.isna().sum())
# a column counts as unclean as soon as it has a single na val, hence "> 0"
len(confirm.loc[confirm[0] > 0]) == 0

###################################################################################################################

True

# Dtypes

In [394]:
    # temporary copy of wdf, to be safe
temp = wdf.copy(deep=True)

# dtype of every column as a one-column frame (column 0), object columns only
temptypes = temp.dtypes.to_frame()
temptypes = temptypes[temptypes[0] == object]
# review:
temptypes
    

Unnamed: 0,0
MSZoning,object
Street,object
LotShape,object
LandContour,object
Utilities,object
LotConfig,object
LandSlope,object
Neighborhood,object
Condition1,object
Condition2,object


In [395]:
# names of the object-typed columns
listobjcols = list(temptypes.index)

In [399]:
# number of distinct (non-missing) values in each object column
valuecounts = [temp[name].nunique() for name in listobjcols]

In [403]:
# v(alue)c(ount) df: one row per object column with its number of distinct values
data = dict(col=listobjcols, valcount=valuecounts)
vcdf = pd.DataFrame.from_dict(data)

In [407]:
# object columns with exactly two distinct values
binar = vcdf[vcdf['valcount'].eq(2)]
# review
binar

Unnamed: 0,col,valcount
1,Street,2
4,Utilities,2
27,CentralAir,2


In [409]:
# object columns whose number of distinct values is not two
nobinar = vcdf[vcdf['valcount'].ne(2)]
# review
nobinar

Unnamed: 0,col,valcount
0,MSZoning,5
2,LotShape,4
3,LandContour,4
5,LotConfig,5
6,LandSlope,3
7,Neighborhood,25
8,Condition1,9
9,Condition2,8
10,BldgType,5
11,HouseStyle,8


In [454]:
####################################################################################################################
### NOTES:
# 
# observation: some object cols have only two distinct values and can be translated into 0/1 vals
#
# decision: review the binary object cols and replace their values with numeric vals
#
####################################################################################################################


In [455]:
####################################################################################################################
# binary object cols:

# names of the binary object columns
binarlist = list(binar['col'])
# review
binarlist

['Street', 'Utilities', 'CentralAir']

In [456]:
# c(leaned)wdf: a fresh copy of wdf that receives the encoded values
cwdf = wdf.copy(deep=True)

In [457]:
# Assign the result back: an inplace replace on a column selection is not
# guaranteed to change cwdf itself under pandas copy-on-write.
cwdf["CentralAir"] = cwdf["CentralAir"].replace({"Y": 1, "N": 0})

In [458]:
# Assign the result back: an inplace replace on a column selection is not
# guaranteed to change cwdf itself under pandas copy-on-write.
cwdf["Utilities"] = cwdf["Utilities"].replace({"AllPub": 1, "NoSeWa": 0})

In [459]:
# Assign the result back: an inplace replace on a column selection is not
# guaranteed to change cwdf itself under pandas copy-on-write.
cwdf['Street'] = cwdf['Street'].replace({"Pave": 1, "Grvl": 0})

In [545]:
####################################################################################################################
# non-binary object cols: refresh temp from the frame with the binary cols encoded
temp = cwdf.copy(deep=True)

In [546]:
# names of the non-binary object columns
nobincols = list(nobinar['col'])

In [547]:
# review: one row per non-binary object column, one column per record
temp[nobincols].T

Unnamed: 0,1404,996,411,1040,370,810,302,169,141,664,...,869,324,939,378,769,1144,1421,419,1160,66
MSZoning,RL,RL,RL,RL,RL,RL,RL,RL,RL,RL,...,RL,RL,RL,RL,RL,RL,RL,RL,RL,RL
LotShape,Reg,IR1,Reg,Reg,IR1,Reg,IR1,IR1,Reg,IR2,...,Reg,Reg,Reg,Reg,IR2,Reg,Reg,Reg,Reg,Reg
LandContour,Lvl,Lvl,Bnk,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,...,Lvl,Lvl,Lvl,Lvl,HLS,Lvl,Lvl,Lvl,Lvl,Lvl
LotConfig,Corner,Inside,Inside,Corner,Inside,Inside,Corner,Corner,Inside,CulDSac,...,Inside,Corner,Inside,Corner,CulDSac,Inside,Inside,Inside,Inside,Inside
LandSlope,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,...,Gtl,Gtl,Gtl,Gtl,Mod,Gtl,Gtl,Gtl,Gtl,Gtl
Neighborhood,OldTown,NAmes,Gilbert,Sawyer,Gilbert,NWAmes,CollgCr,Timber,CollgCr,Somerst,...,SawyerW,NAmes,ClearCr,StoneBr,StoneBr,Edwards,NPkVill,NAmes,NPkVill,NAmes
Condition1,Artery,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm,RRAn,...,Norm,PosN,Norm,Norm,Norm,Norm,Norm,Norm,Norm,PosA
Condition2,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm,...,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm
BldgType,1,1,5,1,1,1,1,1,1,1,...,1,1,1,1,1,5,2,1,3,1
HouseStyle,1.5Fin,1Story,1Story,1Story,2Story,1Story,1Story,1Story,1Story,1Story,...,2Story,SLvl,2Story,1Story,2Story,1.5Fin,1Story,1Story,2Story,1Story


In [513]:
def replacer(col, df=None):
    """Build a frequency-rank mapping for the values of one column.

    The most frequent value is mapped to 1, the second most frequent to 2,
    and so on, in the order returned by ``Series.value_counts`` (missing
    values are not counted and therefore get no entry).

    Parameters
    ----------
    col : hashable
        Label of the column to rank.
    df : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``temp`` frame, which is what the original one-argument call used.

    Returns
    -------
    dict
        ``{old value: rank}`` with ranks starting at 1.
    """
    source = temp if df is None else df
    # The mapping is built from this column's own values. Taking them from
    # the value_counts index avoids depending on the column names that
    # reset_index() produces, which differ between pandas versions.
    ranked_values = source[col].value_counts().index.to_list()
    return {oldval: rank for rank, oldval in enumerate(ranked_values, start=1)}

In [543]:
# One mapping per non-binary object column, in the order of nobincols.
replacerlist = [replacer(col) for col in nobincols]


In [620]:
# Encode every non-binary object column as integer frequency ranks
# (1 = most frequent value, 2 = second most frequent, ...) and print each
# mapping so it can be checked. One loop replaces the copy-pasted
# per-column cells, and each mapping is built from the column's own values.

# copy cwdf to ccdf:
ccdf = cwdf.copy()

for i, c in enumerate(nobincols, start=1):
    # value_counts() lists the values by descending frequency
    ranked_values = ccdf[c].value_counts().index.to_list()
    d = {oldval: rank for rank, oldval in enumerate(ranked_values, start=1)}
    # map() swaps each value for its rank; every non-missing value of the
    # column has an entry in d because d was built from this very column
    ccdf[c] = ccdf[c].map(d)
    print(i, c, d)


In [654]:
# cleandf: final copy of the fully encoded frame
cleandf = ccdf.copy(deep=True)

####################################################################################################################

In [656]:
####################################################################################################################
####################################################################################################################
### EXPORT:
#
# write cleandf to a csv file in the current working directory

cleandf.to_csv(path_or_buf='cleandata.csv')

####################################################################################################################
####################################################################################################################