In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder

In [13]:
# 读取数据
rent_train = pd.read_csv('../../dataset/RentForecast/train_data.csv')
rent_test = pd.read_csv('../../dataset/RentForecast/test_a.csv')

target_train = rent_train.iloc[:, -1]
# target_test = rent_test.pop('tradeMoney')

In [16]:
# 数据预处理
def ProcessingData(data):
    # 缺失值处理
    data['rentType'][data['rentType'] == '--'] = '未知方式'
    
    # 类别型数据转换
    columns = ['rentType', 'houseType', 'houseFloor', 'houseToward', 'houseDecoration', 'communityName', 'region', 'plate']
    
    for co in columns:
        data[co] = LabelEncoder().fit_transform(data[co])
        
    # 将buildYear转换为整型
    buildYearMode = pd.DataFrame(data[data['buildYear'] != '暂无信息']['buildYear'].mode())
    data.loc[data[data['buildYear'] == '暂无信息'].index, 'buildYear'] = buildYearMode.iloc[0, 0]
    data['buildYear'] = data['buildYear'].astype('int')
    
    # 处理pv和uv的空值，用该特征的均值替换
    data['pv'].fillna(data['pv'].mean(), inplace=True)
    data['uv'].fillna(data['uv'].mean(), inplace=True)
    data['pv'] = data['pv'].astype('int')
    data['uv'] = data['uv'].astype('int')
    
    # 分割交易时间，将交易时间‘2018-1-12'分割为年、月、日
    data['tradeTime'] = pd.to_datetime(data['tradeTime'])
    data['tradeYear'] = data['tradeTime'].dt.year
    data['tradeMonth'] = data['tradeTime'].dt.month
    data['tradeDay'] = data['tradeTime'].dt.day
    
    # 去除作用不大的特征
    data.drop(['ID', 'city', 'tradeTime'], axis=1, inplace=True)
    
    return data

In [11]:
# 采用IsolationForest处理tradeMoney特征的异常值
from sklearn.ensemble import IsolationForest
def IF_drop(data):
    IForest = IsolationForest(contamination=0.01)
    IForest.fit(data['tradeMoney'].values.reshape(-1, 1))
    y_pred = IForest.predict(data['tradeMoney'].values.reshape(-1, 1))
    drop_index = data.loc[y_pred==-1].index
    # print(drop_index)
    data.drop(drop_index, inplace=True)
    return data

rent_train = IF_drop(rent_train)

# 从area、tradeMoney、tradeFloor等特征处理异常值
def dropData(data):
    data = data[data['area'] <= 200]
    data = data[(data['tradeMoney'] <= 16000) & (data['tradeMoney'] >= 700)]
    data.drop(data[(data['totalFloor'] == 0)].index, inplace=True)
    return data

rent_train = dropData(rent_train)

In [18]:
rent_train = ProcessingData(rent_train)
rent_test = ProcessingData(rent_test)
rent_train.head()

Unnamed: 0,area,rentType,houseType,houseFloor,totalFloor,houseToward,houseDecoration,communityName,region,plate,...,totalWorkers,newWorkers,residentPopulation,pv,uv,lookNum,tradeMoney,tradeYear,tradeMonth,tradeDay
0,68.06,2,12,1,16,6,0,50,0,63,...,28248,614,111546,1124,284,0,2000.0,2018,11,28
1,125.55,2,28,0,14,6,2,129,1,48,...,14823,148,157552,701,22,1,2000.0,2018,12,16
2,132.0,2,28,1,32,6,0,178,1,49,...,77645,520,131744,57,20,1,16000.0,2018,12,22
3,57.0,2,4,0,17,6,3,312,1,50,...,8750,1665,253337,888,279,9,1600.0,2018,12,21
4,129.0,2,29,1,2,6,1,1256,2,43,...,800,117,125309,2038,480,0,2900.0,2018,11,18


In [20]:
# 相关系数法特征选择
from sklearn.feature_selection import SelectKBest

print(rent_train.shape)

sk = SelectKBest(k=all)
new_train = sk.fit_transform(rent_train, target_train)
print(new_train.shape)

(41440, 51)


TypeError: '<=' not supported between instances of 'int' and 'builtin_function_or_method'

In [17]:
list_a = [1, 2, 3, 4, 5, 10]
num = list_a.pop(5)

In [18]:
num

10