In [1]:
## 데이터 스케일링과 문자열 처리 파이프라인

## pdpipe에는 scikit-learn의 스케일러 메소드와 NLTK의 문자열 처리 메소드가 내장되어 있다.

In [2]:
import pandas as pd
import pdpipe as pdp

# Load the housing data from a CSV file expected in the notebook's working directory.
df = pd.read_csv('./USA_Housing.csv')

def size(n, small_max=6.3, medium_max=7.7):
    """Label a room count as 'Small', 'Medium' or 'Big'.

    Args:
        n: Numeric room count to classify.
        small_max: Largest value still labelled 'Small' (inclusive).
        medium_max: Largest value still labelled 'Medium' (inclusive).

    Returns:
        'Small' if n <= small_max, 'Medium' if small_max < n <= medium_max,
        otherwise 'Big'.
    """
    if n <= small_max:
        return 'Small'
    # Past the first guard n is already above small_max (or is NaN), so only
    # the upper bound has to be checked here.
    if n <= medium_max:
        return 'Medium'
    # Everything else lands here, including NaN, for which both comparisons
    # above evaluate to False.
    return 'Big'

# Derive a categorical size label from the average room count.
df['House_size'] = df['Avg. Area Number of Rooms'].apply(size)

# Declare all stages up front as one pipeline; they run in list order.
pipeline = pdp.PdPipeline([
    # The raw room count is no longer needed once House_size exists.
    pdp.ColDrop('Avg. Area Number of Rooms'),
    # Replace House_size with dummy columns (House_size_Medium / House_size_Small).
    pdp.OneHotEncode('House_size'),
    # Discard rows whose Price is at or below 250000.
    pdp.RowDrop({'Price': lambda x: x <= 250000}),
])

df2 = pipeline(df)

print('Number of Rows : ', len(df2), sep='')
df2.head()

Number of Rows : 4990


Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
0,79545.458574,5.682861,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",1,0
1,79248.642455,6.0029,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA...",1,0
2,61287.067179,5.86589,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",0,0
3,63345.240046,7.188236,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820,0,1
4,59982.197226,5.040555,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386,0,0


In [3]:
## 스케일링 기능 
## Standardization Scaling : 평균을 0, 분산을 1로 변환
## Min-Max Scaling : 값을 0과 1 사이로 변환

In [4]:
# Build a scaling stage using the scaler named 'StandardScaler', asking it to
# leave the two one-hot dummy columns untouched.
# NOTE(review): the df3.head() / df3.describe() outputs recorded in this
# notebook show House_size_Medium and House_size_Small with standardized values
# (e.g. 0.980157 / -0.576579, mean ~0, std ~1), i.e. the exclusion did not take
# effect in the run that produced them. Confirm how the installed pdpipe
# version interprets `exclude_columns` before relying on this.
pipeline_scaler = pdp.Scale('StandardScaler', exclude_columns=['House_size_Medium', 'House_size_Small'])

# Apply the scaling stage to the encoded frame from the previous step.
df3 = pipeline_scaler(df2)

In [5]:
# Preview the first rows of the scaled frame.
df3.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
0,1.028113,-0.30013,0.087245,-1.32281,-0.500532,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",0.980157,-0.576579
1,1.000175,0.023142,-0.722671,0.401923,0.775998,"188 Johnson Views Suite 079\nLake Kathleen, CA...",0.980157,-0.576579
2,-0.690443,-0.115252,0.929559,0.06973,-0.500662,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",-1.020245,-0.576579
3,-0.496719,1.220456,-0.584986,-0.189886,0.075327,USS Barnett\nFPO AP 44820,-1.020245,1.734368
4,-0.813263,-0.948927,0.200634,-0.992999,-1.723449,USNS Raymond\nFPO AE 09386,-1.020245,-0.576579


In [6]:
# Summary statistics of the scaled frame, to check each numeric column's mean and std.
df3.describe()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Bedrooms,Area Population,Price,House_size_Medium,House_size_Small
count,4990.0,4990.0,4990.0,4990.0,4990.0,4990.0,4990.0
mean,-1.242081e-15,1.23124e-16,-4.5348940000000006e-17,-2.937752e-16,-7.796035000000001e-17,1.090199e-16,-2.067818e-16
std,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001
min,-4.783945,-3.36939,-1.605481,-3.635828,-2.802583,-1.020245,-0.5765789
25%,-0.6699448,-0.6587607,-0.6821755,-0.6830938,-0.6712406,-1.020245,-0.5765789
50%,0.01881183,-0.005282162,0.05484873,0.002692235,-0.002957818,0.9801568,-0.5765789
75%,0.6750385,0.6791061,0.4112121,0.6749146,0.6785985,0.9801568,-0.5765789
max,3.6783,3.57486,2.039145,3.374546,3.527482,0.9801568,1.734368


In [7]:
## 문자열 처리 기능 추가
## Address에서 ZIP 코드나 주(state)를 추출

In [8]:
# Look at one address string from the originally loaded frame to see the layout to be parsed.
df.loc[0,'Address']

'208 Michael Ferry Apt. 674\nLaurabury, NE 37010-5101'

In [9]:
def extract_state(token):
    """Return the second-to-last item of ``token`` as a string.

    Meant for a tokenized address whose last two tokens are the state
    abbreviation and the ZIP code, e.g. [..., 'NE', '37010-5101'] -> 'NE'.

    Raises:
        IndexError: if ``token`` holds fewer than two items.
    """
    penultimate = token[-2]
    return str(penultimate)

# Stage 1: tokenize the 'Address' strings so extract_state can index into the tokens.
pipeline_tokenizer = pdp.TokenizeText('Address') ## tokenize
# Stage 2: run extract_state on 'Address' and emit the result as 'State'.
pipeline_state = pdp.ApplyByCols('Address', extract_state, result_columns='State')
# Combine both stages into one pipeline (tokenize first, then extract).
pipeline_state_extract = pipeline_tokenizer + pipeline_state

# In the recorded df4.head() output 'Address' no longer appears and 'State'
# sits in its place.
# NOTE(review): for the military-style addresses shown in this notebook
# (e.g. 'USS Barnett\nFPO AP 44820') the extracted value is 'AP' / 'AE', which
# is not a US state - presumably an armed-forces postal region; handle these
# separately if State is used as a feature.
df4 = pipeline_state_extract(df3)

In [10]:
# Preview the frame after state extraction.
df4.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Bedrooms,Area Population,Price,State,House_size_Medium,House_size_Small
0,1.028113,-0.30013,0.087245,-1.32281,-0.500532,NE,0.980157,-0.576579
1,1.000175,0.023142,-0.722671,0.401923,0.775998,CA,0.980157,-0.576579
2,-0.690443,-0.115252,0.929559,0.06973,-0.500662,WI,-1.020245,-0.576579
3,-0.496719,1.220456,-0.584986,-0.189886,0.075327,AP,-1.020245,1.734368
4,-0.813263,-0.948927,0.200634,-0.992999,-1.723449,AE,-1.020245,-0.576579
