# feature engineering
特征工程是利用数据领域的相关知识，创建能够使机器学习算法达到最佳性能的特征的过程。简而言之，特征工程就是把原始数据转变成特征的过程：这些特征能够很好地描述数据，并且利用它们建立的模型在未知数据上的表现可以达到最优（或者接近最优）。

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [17]:
def parseData(df):
    """Normalise the raw listing table in place and return it.

    Steps, in order:

    * ``rentType``: the placeholder ``'--'`` is replaced by ``'未知方式'``.
    * ``region`` / ``plate``: the first five characters are removed.
    * ``rentType``, ``houseFloor``, ``houseToward``, ``houseDecoration``,
      ``communityName``, ``region`` and ``plate`` are cast to ``category``.
    * ``buildYear``: the placeholder ``'暂无信息'`` is replaced by the most
      frequent known year, then the column is cast to int.
    * ``pv`` / ``uv``: missing values are filled with the column mean, then
      the column is cast to int (fractions are truncated).
    * ``city`` is dropped.

    Args:
        df: DataFrame holding the columns named above. It is modified in
            place.

    Returns:
        The same DataFrame object, after the transformations.
    """
    # .loc writes straight into df; the chained form df[col][mask] = ... may
    # write into a temporary copy instead.
    df.loc[df['rentType'] == '--', 'rentType'] = '未知方式'

    # Strip the fixed-width prefix, then convert object columns to category.
    df['region'] = df['region'].str[5:]
    df['plate'] = df['plate'].str[5:]
    categorical_columns = [
        'rentType', 'houseFloor', 'houseToward', 'houseDecoration',
        'communityName', 'region', 'plate',
    ]
    for col in categorical_columns:
        df[col] = df[col].astype('category')

    # buildYear: fill the placeholder with the mode of the known years and
    # convert to int. The mode is written as text so the column stays
    # homogeneous until the final cast.
    year = df['buildYear']
    missing = year == '暂无信息'
    mode_year = year[~missing].astype('int').mode().iloc[0]
    df['buildYear'] = year.where(~missing, str(mode_year)).astype('int')

    # pv / uv: fill missing values with the mean and convert to int.
    # Reassigning (rather than fillna(inplace=True) on a column selection)
    # guarantees the result lands in df.
    for col in ('pv', 'uv'):
        df[col] = df[col].fillna(df[col].mean()).astype('int')

    # Drop features that are not used.
    df.drop('city', axis=1, inplace=True)

    return df

In [28]:
def _encode_labels(series, codes, default):
    """Map each value of *series* through *codes*; anything else becomes *default*."""
    # Going through object dtype makes category inputs (and NaN) behave like
    # plain values, so the result is always a plain integer column.
    return series.astype(object).map(lambda value: codes.get(value, default)).astype('int')


def washData(df_train, df_test):
    """Remove outlier rows from the training set and encode text labels as ints.

    Training rows are kept only when ``area <= 200``, ``tradeMoney <= 10000``,
    ``totalFloor <= 50``, ``saleSecHouseNum <= 5`` and ``remainNewNum <= 1500``
    (ranges obtained from outlier detection). The ``ID`` column is dropped from
    both frames, and the following columns are encoded in both frames:

    * ``rentType``: ``'整租'`` -> 1, ``'合租'`` -> 2, anything else -> 0.
    * ``houseFloor``: ``'低'`` -> 0, ``'中'`` -> 1, anything else -> 2.
    * ``houseToward``: south-facing variants -> 1, east/west variants -> 2,
      ``'西北'``/``'东北'`` -> 3, anything else -> 4.
    * ``houseDecoration``: ``'毛坯'`` -> 1, ``'简装'`` -> 2, anything else -> 3.

    Args:
        df_train: training DataFrame (must contain ``tradeMoney``).
        df_test: test DataFrame.

    Returns:
        ``(df_train, df_test)`` as new DataFrames; the inputs are not modified.
    """
    # One combined mask is equivalent to filtering step by step.
    keep = (
        (df_train['area'] <= 200)
        & (df_train['tradeMoney'] <= 10000)
        & (df_train['totalFloor'] <= 50)
        & (df_train['saleSecHouseNum'] <= 5)
        & (df_train['remainNewNum'] <= 1500)
    )
    # drop() returns fresh frames, so the column assignments below never
    # write into a slice of the caller's data.
    df_train = df_train[keep].drop('ID', axis=1)
    df_test = df_test.drop('ID', axis=1)

    rent_type_codes = {'未知方式': 0, '整租': 1, '合租': 2}
    # Floor level (low / middle / other).
    house_floor_codes = {'低': 0, '中': 1}
    # Orientation of the dwelling.
    house_toward_codes = {
        '南': 1, '南北': 1, '东南': 1, '西南': 1,
        '东': 2, '东西': 2, '西': 2,
        '西北': 3, '东北': 3,
    }
    # Decoration type.
    house_decoration_codes = {'毛坯': 1, '简装': 2}

    # The same encoding has to be applied to both frames so that train and
    # test share one feature representation.
    for df in (df_train, df_test):
        df['rentType'] = _encode_labels(df['rentType'], rent_type_codes, 0)
        df['houseFloor'] = _encode_labels(df['houseFloor'], house_floor_codes, 2)
        df['houseToward'] = _encode_labels(df['houseToward'], house_toward_codes, 4)
        df['houseDecoration'] = _encode_labels(df['houseDecoration'], house_decoration_codes, 3)

    return df_train, df_test

In [29]:
# (output column, grouping keys, aggregate) for the tradeMoney statistics.
_PRICE_STATS = (
    ('plate_mean', ['plate', 'rentType'], 'mean'),
    ('region_mean', ['region', 'rentType'], 'mean'),
    ('plate_median', ['plate', 'rentType'], 'median'),
    ('region_median', ['region', 'rentType'], 'median'),
)


def _attach_group_stat(df, keys, column, how, name, source=None):
    """Left-merge onto *df* the *how* aggregate of *column* per *keys* group.

    Groups are read from *source* (default: *df* itself). ``how='size'``
    counts rows and ignores *column*. Row order of *df* is preserved.
    """
    src = df if source is None else source
    grouped = src.groupby(keys, observed=True)
    # Aggregating a single selected column keeps unrelated text columns out
    # of the computation; a whole-frame mean()/median() would choke on them.
    stat = grouped.size() if how == 'size' else grouped[column].agg(how)
    table = stat.rename(name).reset_index()
    merged = df.merge(table, how='left', on=keys)
    if src is not df:
        # Merging category keys whose category sets differ falls back to
        # object dtype; restore the dtype the keys had in df.
        for key in keys:
            merged[key] = merged[key].astype(df[key].dtype)
    return merged


def feature(df, price_source=None):
    """Derive the model features from a parsed and cleaned listing table.

    Expects the integer encodings used elsewhere in this file: ``rentType``
    0 / 1 / 2 (unknown / whole / shared), ``houseFloor`` 0 / 1 / 2 and
    ``houseDecoration`` where 1 is '毛坯'.

    Added columns, in order: room / hall / bath counts parsed from
    ``houseType``; trade month and season; page-view statistics; tradeMoney
    statistics per plate / region and rent type; group sizes and distinct
    counts; per-plate floor and area statistics; room ratios; quantile bins
    of ``area`` and ``totalFloor``; room statistics per community and per
    (plate, area bin). ``rentType`` and ``area`` are adjusted along the way
    and ``communityName`` is converted to int.

    Args:
        df: input DataFrame. It is not modified; a new frame is returned.
        price_source: optional DataFrame with ``plate``, ``region``,
            ``rentType`` and ``tradeMoney`` columns from which the tradeMoney
            statistics are computed. Defaults to *df* when it has a
            ``tradeMoney`` column. If neither is available the four
            statistic columns are filled with NaN and a warning is issued.

    Returns:
        ``(df, categorical_feats)``: the feature frame (with a fresh
        ``RangeIndex``) and the list of categorical column names.

    Raises:
        ValueError: from ``pd.qcut`` when the ``area`` quantile edges are not
            unique.
    """
    import warnings

    # Work on a copy with a clean positional index (the merges below would
    # reset the index anyway).
    df = df.reset_index(drop=True)

    # Break houseType down into finer information. The digits are read by
    # position: first, third and second-to-last character.
    df['room'] = [int(ht[0]) for ht in df['houseType']]
    df['hall'] = [int(ht[2]) for ht in df['houseType']]
    df['bath'] = [int(ht[-2]) for ht in df['houseType']]
    df['totalRoom'] = df['room'] + df['hall'] + df['bath']
    df['area_per_room'] = df['area'] / df['totalRoom']

    # Fill in unknown rentType values.
    # Unknown type, small area, three or more rooms -> shared (2).
    small_many_rooms = (df['area'] <= 50) & (df['room'] >= 3) & (df['rentType'] == 0)
    df.loc[small_many_rooms, 'rentType'] = 2
    # Shared listings with a large area: divide the area by (room + 1).
    # mask() upcasts to float when needed instead of writing floats into an
    # integer column.
    shared_large = (df['rentType'] == 2) & (df['area'] > 50)
    df['area'] = df['area'].mask(shared_large, df['area'] / (df['room'] + 1))
    # Unknown type with decoration code 1 -> whole rental (1).
    df.loc[(df['houseDecoration'] == 1) & (df['rentType'] == 0), 'rentType'] = 1

    # Drop the two-character prefix and keep the numeric id.
    df['communityName'] = [int(cn[2:]) for cn in df['communityName']]

    # Trade month.
    df['trade_month'] = [int(stamp.split('/')[1]) for stamp in df['tradeTime']]
    # Trade season.
    df['season'] = np.ceil(df['trade_month'] / 3).astype(int)

    # Approximate floor of the dwelling. The float cast lets a category
    # column of 0/1/2 codes take part in the arithmetic.
    floor_level = df['houseFloor'].astype('float')
    df['floor_ratio'] = (df['totalFloor'] * ((floor_level * 2 + 1) / 6)).round()

    views = df[['pv', 'uv']]
    df['per_pv'] = df['pv'] / df['uv']
    df['mean_pv'] = (df['pv'] + df['uv']) / 2
    df['max_pv'] = views.max(axis=1)
    df['min_pv'] = views.min(axis=1)
    # ddof=0 is what np.std(frame, axis=1) evaluates to.
    df['std_pv'] = views.std(axis=1, ddof=0)
    df['worker_ratio'] = df['totalWorkers'] / df['residentPopulation']

    # A shared rental counts as a single room.
    shared = df['rentType'] == 2
    df['room'] = df['room'].mask(shared, 1)
    df['totalRoom'] = df['totalRoom'].mask(shared, 1)

    # tradeMoney statistics per (plate | region, rentType).
    # NOTE(review): computed from df itself these include each row's own
    # target value; confirm that is intended.
    if price_source is None and 'tradeMoney' in df.columns:
        price_source = df
    if price_source is None:
        warnings.warn(
            "feature(): no 'tradeMoney' column and no price_source given; "
            "the tradeMoney statistic columns are filled with NaN",
            stacklevel=2,
        )
    for name, keys, how in _PRICE_STATS:
        if price_source is None:
            df[name] = np.nan
        else:
            df = _attach_group_stat(df, keys, 'tradeMoney', how, name, source=price_source)

    # Group sizes and number of distinct members.
    df = _attach_group_stat(df, ['region'], None, 'size', 'region_num')
    df = _attach_group_stat(df, ['plate'], None, 'size', 'plate_num')
    df = _attach_group_stat(df, ['communityName'], None, 'size', 'community_num')
    df = _attach_group_stat(df, ['region'], 'plate', 'nunique', 'plate_contain')
    df = _attach_group_stat(df, ['plate'], 'communityName', 'nunique', 'community_contain')

    # Per-plate floor and area statistics.
    df = _attach_group_stat(df, ['plate'], 'totalFloor', 'mean', 'plate_mean_floor')
    df = _attach_group_stat(df, ['plate'], 'totalFloor', 'median', 'plate_median_floor')
    df = _attach_group_stat(df, ['plate'], 'area', 'mean', 'plate_mean_area')
    df = _attach_group_stat(df, ['plate'], 'area', 'median', 'plate_median_area')

    df['room_ratio'] = df['room'] / df['totalRoom']
    df['hall_ratio'] = df['hall'] / df['totalRoom']
    df['bath_ratio'] = df['bath'] / df['totalRoom']
    df['room-bath'] = df['room'] - df['bath']

    # Position (0 = room, 1 = hall, 2 = bath) of the largest count.
    df['max_type'] = df[['room', 'hall', 'bath']].to_numpy().argmax(axis=1)

    df['trade_avg'] = df['totalTradeArea'] / (df['tradeSecNum'] + 1)

    df['originWorkers'] = df['totalWorkers'] - df['newWorkers']

    df['area_part'] = pd.qcut(df['area'], q=6, labels=[0, 1, 2, 3, 4, 5])

    # With duplicates='drop' the number of surviving bins depends on the
    # data, so the labels are sized from the bin edges actually kept.
    _, floor_edges = pd.qcut(df['totalFloor'], q=8, duplicates='drop', retbins=True)
    df['tf_part'] = pd.qcut(
        df['totalFloor'], q=8, duplicates='drop',
        labels=list(range(len(floor_edges) - 1)),
    )

    # Room statistics per community and per (plate, area bin).
    df = _attach_group_stat(df, ['communityName'], 'totalRoom', 'mean', 'mean_totalRoom')
    df = _attach_group_stat(df, ['communityName'], 'totalRoom', 'median', 'median_totalRoom')
    df = _attach_group_stat(df, ['communityName'], 'room', 'mean', 'mean_room')
    df = _attach_group_stat(df, ['communityName'], 'room', 'median', 'median_room')
    df = _attach_group_stat(df, ['plate', 'area_part'], 'totalRoom', 'mean', 'plate_mean_totalRoom')
    df = _attach_group_stat(df, ['plate', 'area_part'], 'totalRoom', 'median', 'plate_median_totalRoom')
    df = _attach_group_stat(df, ['plate', 'area_part'], 'room', 'mean', 'plate_mean_room')
    df = _attach_group_stat(df, ['plate', 'area_part'], 'room', 'median', 'plate_median_room')

    categorical_feats = ['region', 'plate']
    return df, categorical_feats

In [30]:
def getData(feature, train_path='train_data.csv', test_path='test_a.csv'):
    """Load the raw CSV files and run them through the preparation pipeline.

    Both frames go through ``parseData``, then ``washData``, then the
    supplied ``feature`` callable; finally ``tradeMoney`` is split off the
    training frame as the target.

    Args:
        feature: callable taking a DataFrame and returning
            ``(frame, categorical_feature_names)``.
        train_path: CSV file with the training data.
        test_path: CSV file with the test data.

    Returns:
        ``(train, test, target, features, categorical_feats)`` where
        ``features`` is the column index of ``train`` after the target has
        been removed.
    """
    train = parseData(pd.read_csv(train_path))
    test = parseData(pd.read_csv(test_path))
    train, test = washData(train, test)

    # NOTE(review): each frame is featurised on its own. If `feature`
    # derives statistics from tradeMoney, the test frame can only provide
    # them when it carries that column — confirm against the test file.
    train, _ = feature(train)
    test, categorical_feats = feature(test)

    target = train.pop('tradeMoney')
    features = train.columns

    return train, test, target, features, categorical_feats

In [None]:
# Run the full preparation pipeline defined above and unpack its outputs:
# feature frames for train and test, the target series, the feature column
# index and the names of the categorical features.
train, test, target, features, categorical_feats = getData(feature)

# feature selection
## Filter
### 相关系数法
### 卡方检验

In [None]:
from sklearn.feature_selection import SelectKBest,SelectPercentile
from sklearn.feature_selection import chi2

# Chi-squared filter: score the numeric features against the target and keep
# the positions of the best ones.
# `train` and `target` come from the getData(...) call above; tradeMoney has
# already been popped out of `train`, so there is no target column to drop.
X = train.select_dtypes(include='number')  # leaves out category and text columns
y = target

# chi2 only accepts non-negative, finite input, so columns containing a
# negative, NaN or infinite value cannot be scored and are left out.
usable = (X >= 0).all() & np.isfinite(X).all()
X = X.loc[:, usable]

# NOTE(review): chi2 is documented for classification targets; confirm that
# tradeMoney is meant to be treated as class labels here.
# k is capped so the selector still works when fewer than 43 columns remain.
# X_new holds positions into X.columns (the filtered frame), not into train.
X_new = SelectKBest(chi2, k=min(43, X.shape[1])).fit(X, y).get_support(indices=True)

## Wrapper
### 递归特征消除法(RFE)


## Embedded
### 基于惩罚项的特征选择法
### 基于树模型的特征选择法