# 房價預測 (Kaggle)
以下用房價預測資料, 觀察群聚編碼的效果

In [11]:
# Setup before any feature engineering: imports and the data directory
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
# Directory that the CSV files are read from
dir_path = './data/'

In [12]:
# Read the house-price training table from the data directory
df = pd.read_csv(os.path.join(dir_path, 'house_train.csv'))
# Target is log(1 + SalePrice); pop() also removes SalePrice from the features
train_Y = np.log1p(df.pop('SalePrice'))
# The Id column is dropped as well, leaving only feature columns in df
df = df.drop(columns='Id')
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [13]:
# Group-by encoding: statistics of GrLivArea (living area) per SaleCondition
# Aggregates computed per group: mean, mode, median, max
df['SaleCondition'] = df['SaleCondition'].fillna('None')
df1 = (
    df.groupby('SaleCondition')['GrLivArea']
    # mode() can return several values; the first one is kept
    .agg(mean='mean', mode=lambda area: area.mode()[0], median='median', max='max')
    .reset_index()
)
df1

Unnamed: 0,SaleCondition,mean,mode,median,max
0,Abnorml,1436.128713,864,1302.0,4476
1,AdjLand,1112.5,980,1143.0,1184
2,Alloca,1701.75,1535,1439.5,3194
3,Family,1480.95,948,1390.5,2526
4,Normal,1492.96828,864,1456.0,4316
5,Partial,1795.696,1456,1646.0,5642


In [14]:
# Left-join the per-group statistics onto every row, then drop the join key
df2 = df.merge(df1, on='SaleCondition', how='left').drop(columns='SaleCondition')
df2.shape

(1460, 82)

In [15]:
# Keep only the numeric columns (the result stays in df2) and show the dtype mix
df2 = df2.select_dtypes(include='number')
print(df2.dtypes.value_counts())
df2.head()

int64      35
float64     5
dtype: int64


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,mean,mode,median,max
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,0,2,2008,1492.96828,864,1456.0,4316
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,0,5,2007,1492.96828,864,1456.0,4316
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,0,9,2008,1492.96828,864,1456.0,4316
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,0,2,2006,1436.128713,864,1302.0,4476
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,0,12,2008,1492.96828,864,1456.0,4316


In [16]:
# Preview of the min-max scaled original features (the four group-encoding
# columns are excluded). The expression is built inline rather than through
# MMEncoder / df_minus, because those names are only created in the following
# cell and would raise NameError when the notebook is run top to bottom.
# Missing values are filled with -1 first, as the modelling cell does; df2
# itself is not modified here and no new names are bound.
MinMaxScaler().fit_transform(
    df2.fillna(-1).drop(['mean', 'mode', 'median', 'max'], axis=1)
)

array([[0.23529412, 0.21019108, 0.0334198 , ..., 0.        , 0.09090909,
        0.5       ],
       [0.        , 0.25796178, 0.03879502, ..., 0.        , 0.36363636,
        0.25      ],
       [0.23529412, 0.21974522, 0.04650728, ..., 0.        , 0.72727273,
        0.5       ],
       ...,
       [0.29411765, 0.2133758 , 0.03618687, ..., 0.16129032, 0.36363636,
        1.        ],
       [0.        , 0.21974522, 0.03934189, ..., 0.        , 0.27272727,
        1.        ],
       [0.        , 0.24203822, 0.04037019, ..., 0.        , 0.45454545,
        0.5       ]])

In [17]:
# Replace missing values with the sentinel -1 before scaling
df2 = df2.fillna(-1)
MMEncoder = MinMaxScaler()

# df_minus is the same frame without the four group-encoding columns
df_minus = df2.drop(columns=['mean', 'mode', 'median', 'max'])

# Linear regression, mean of the 5-fold cross-validation scores:
#   first printed value  -> original features only
#   second printed value -> original features + group-encoding columns
# NOTE(review): the original annotation described the second result as a
# slight improvement; compare the two printed values before relying on that.
estimator = LinearRegression()
for feature_frame in (df_minus, df2):
    train_X = MMEncoder.fit_transform(feature_frame)
    print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())

0.8499683837163878
0.8491312626992871


In [19]:
# Gradient boosted trees, mean of the 5-fold cross-validation scores:
#   first printed value  -> original features only
#   second printed value -> original features + group-encoding columns
# NOTE(review): no random_state is passed to the estimator, so the scores
# may differ slightly between runs — confirm before comparing small gaps.
estimator = GradientBoostingRegressor()
for feature_frame in (df_minus, df2):
    train_X = MMEncoder.fit_transform(feature_frame)
    print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())

0.8866231860573169
0.8867207694025551


# 作業 : (Kaggle)鐵達尼生存預測
***
https://www.kaggle.com/c/titanic

# 作業1
* 試著使用鐵達尼號的例子，創立兩種以上的群聚編碼特徵( mean、median、mode、max、min、count 均可 )

In [68]:
# Setup before any feature engineering (same as the previous example)
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

data_path = 'data/'
df = pd.read_csv(data_path + 'titanic_train.csv')

# pop() returns the label column and removes it from the feature frame
train_Y = df.pop('Survived')
# PassengerId is dropped as well, leaving only feature columns in df
df = df.drop(columns='PassengerId')
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [69]:
# Pick one categorical column and one numeric column for group-by encoding;
# start by looking at how many rows fall into each Pclass value
df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [70]:
# Fill gaps in the grouping key and in the aggregated column before grouping
df['Pclass'] = df['Pclass'].fillna('None')
df['Fare'] = df['Fare'].fillna(0)
# Per-Pclass statistics of Fare: mean, mode, median, max
df1 = (
    df.groupby('Pclass')['Fare']
    # mode() can return several values; the first one is kept
    .agg(mean='mean', mode=lambda fares: fares.mode()[0], median='median', max='max')
    .reset_index()
)
df1

Unnamed: 0,Pclass,mean,mode,median,max
0,1,84.154687,26.55,60.2875,512.3292
1,2,20.662183,13.0,14.25,73.5
2,3,13.67555,8.05,8.05,69.55


In [71]:
# Left-join the per-class Fare statistics back onto every passenger row
df1 = df.merge(df1, on='Pclass', how='left')
df1.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,mean,mode,median,max
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,13.67555,8.05,8.05,69.55
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,84.154687,26.55,60.2875,512.3292
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,13.67555,8.05,8.05,69.55
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,84.154687,26.55,60.2875,512.3292
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,13.67555,8.05,8.05,69.55


In [72]:
# The join key is no longer needed once the statistics are attached
df1 = df1.drop(columns='Pclass')
# Collect the names of the int64 / float64 columns into num_features
num_features = [
    column
    for column, column_dtype in df1.dtypes.items()
    if column_dtype == 'float64' or column_dtype == 'int64'
]
print(f'{len(num_features)} Numeric Features : {num_features}\n')

# Drop the text columns, keep the numeric ones, and fill missing values with -1
df1 = df1[num_features].fillna(-1)
MMEncoder = MinMaxScaler()
df1.head()

8 Numeric Features : ['Age', 'SibSp', 'Parch', 'Fare', 'mean', 'mode', 'median', 'max']



Unnamed: 0,Age,SibSp,Parch,Fare,mean,mode,median,max
0,22.0,1,0,7.25,13.67555,8.05,8.05,69.55
1,38.0,1,0,71.2833,84.154687,26.55,60.2875,512.3292
2,26.0,0,0,7.925,13.67555,8.05,8.05,69.55
3,35.0,1,0,53.1,84.154687,26.55,60.2875,512.3292
4,35.0,0,0,8.05,13.67555,8.05,8.05,69.55


# 作業2
* 將上述的新特徵，合併原有的欄位做生存率預估，結果是否有改善?

In [73]:
# Baseline: logistic regression on the original numeric columns only
# (the four group-encoding columns are removed); mean of the 5-fold CV scores
estimator = LogisticRegression()
df_raw = df1.drop(columns=['mean', 'mode', 'median', 'max'])
train_X = df_raw
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.6768023967767778

In [74]:
# Original numeric columns plus the group-encoding columns, fed to a fresh
# logistic regression; mean of the 5-fold CV scores
estimator = LogisticRegression()
train_X = df1
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.6847494509337053