In [1]:
import pandas as pd
import numpy as np
import pdpipe as pdp

In [2]:
df = pd.read_csv("USA_Housing.csv")

In [3]:
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [4]:
import pandas_profiling

In [5]:
df.profile_report()



In [6]:
df.columns

Index(['Avg._Area_Income', 'Avg._Area_House_Age', 'Avg._Area_Number_of_Rooms',
       'Avg._Area_Number_of_Bedrooms', 'Area_Population', 'Price', 'Address'],
      dtype='object')

In [7]:
def size(n):
    if n<=4:
        return 'Small'
    elif 4<n<=6:
        return 'Medium'
    else:
        return 'Big'

In [8]:
df["House Size"] = df['Avg._Area_Number_of_Rooms'].apply(size)

In [9]:
df.head()

Unnamed: 0,Avg._Area_Income,Avg._Area_House_Age,Avg._Area_Number_of_Rooms,Avg._Area_Number_of_Bedrooms,Area_Population,Price,Address,House Size
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",Big
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA...",Big
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",Big
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820,Medium
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386,Big


In [10]:
drop_age = pdp.ColDrop('Avg._Area_House_Age')

In [11]:
df2 = drop_age(df)

In [12]:
df2.head()

Unnamed: 0,Avg._Area_Income,Avg._Area_Number_of_Rooms,Avg._Area_Number_of_Bedrooms,Area_Population,Price,Address,House Size
0,79545.458574,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",Big
1,79248.642455,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA...",Big
2,61287.067179,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",Big
3,63345.240046,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820,Medium
4,59982.197226,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386,Big


Chaining stages together

In [13]:
pipeline = pdp.ColDrop('Avg._Area_House_Age')
pipeline+= pdp.OneHotEncode('House Size')

In [14]:
pipeline(df)

Unnamed: 0,Avg._Area_Income,Avg._Area_Number_of_Rooms,Avg._Area_Number_of_Bedrooms,Area_Population,Price,Address,House Size_Medium,House Size_Small
0,79545.458574,7.009188,4.09,23086.800503,1.059034e+06,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",0,0
1,79248.642455,6.730821,3.09,40173.072174,1.505891e+06,"188 Johnson Views Suite 079\nLake Kathleen, CA...",0,0
2,61287.067179,8.512727,5.13,36882.159400,1.058988e+06,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",0,0
3,63345.240046,5.586729,3.26,34310.242831,1.260617e+06,USS Barnett\nFPO AP 44820,1,0
4,59982.197226,7.839388,4.23,26354.109472,6.309435e+05,USNS Raymond\nFPO AE 09386,0,0
5,80175.754159,6.104512,4.04,26748.428425,1.068138e+06,"06039 Jennifer Islands Apt. 443\nTracyport, KS...",0,0
6,64698.463428,8.147760,3.41,60828.249085,1.502056e+06,"4759 Daniel Shoals Suite 442\nNguyenburgh, CO ...",0,0
7,78394.339278,6.620478,2.42,36516.358972,1.573937e+06,"972 Joyce Viaduct\nLake William, TN 17778-6483",0,0
8,59927.660813,6.393121,2.30,29387.396003,7.988695e+05,USS Gilbert\nFPO AA 20957,0,0
9,81885.927184,8.167688,6.10,40149.965749,1.545155e+06,Unit 9446 Box 0958\nDPO AE 97025,0,0


In [15]:
df3 = pipeline(df)

In [16]:
def price_tag(x):
    if x>250000:
        return 'keep'
    else:
        return 'drop'

In [17]:
pipeline+=pdp.ApplyByCols('Price',price_tag,'Price_tag',drop=False)
pipeline+=pdp.ValDrop(['drop'],'Price_tag')
pipeline+= pdp.ColDrop('Price_tag')

In [18]:
df4 = pipeline(df)

In [19]:
df4.head()

Unnamed: 0,Avg._Area_Income,Avg._Area_Number_of_Rooms,Avg._Area_Number_of_Bedrooms,Area_Population,Price,Address,House Size_Medium,House Size_Small
0,79545.458574,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",0,0
1,79248.642455,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA...",0,0
2,61287.067179,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",0,0
3,63345.240046,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820,1,0
4,59982.197226,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386,0,0


In [20]:
pipeline_scale = pdp.Scale('StandardScaler',exclude_columns=['House Size_Medium','House Size_Small'])

In [21]:
df5 = pipeline_scale(df4)

In [22]:
df5.head()

Unnamed: 0,Avg._Area_Income,Avg._Area_Number_of_Rooms,Avg._Area_Number_of_Bedrooms,Area_Population,Price,Address,House Size_Medium,House Size_Small
0,1.028113,0.019595,0.087245,-1.32281,-0.500532,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",0,0
1,1.000175,-0.257485,-0.722671,0.401923,0.775998,"188 Johnson Views Suite 079\nLake Kathleen, CA...",0,0
2,-0.690443,1.516179,0.929559,0.06973,-0.500662,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",0,0
3,-0.496719,-1.396285,-0.584986,-0.189886,0.075327,USS Barnett\nFPO AP 44820,1,0
4,-0.813263,0.845954,0.200634,-0.992999,-1.723449,USNS Raymond\nFPO AE 09386,0,0


Using NLTK to extract state

In [23]:
def extract_state(token):
    return str(token[-2])

In [24]:
pipeline_tokenize = pdp.TokenizeWords('Address')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/harrisonmiller/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
