In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import r2_score
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import warnings
import datetime
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']

In [2]:
def parseData(df):
    """
    处理数据
    """
#     '--'转为'未知方式'
    df['rentType'][df['rentType']=='--'] = '未知方式'
    
    # 转换object类型数据
    columns = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration', 'communityName', 'region', 'plate']
    for col in columns:
        df[col] = df[col].astype('category')
        
    # 将buildYear列转换为整型数据
#     使用众数1994填充
    df['buildYear'] = np.where(df['buildYear'] == '暂无信息', 1994, df['buildYear'])
    df['buildYear'] = df['buildYear'].astype('int')
    
#     处理缺失值
    # 处理pv和uv的空值,缺失值用平均值代替
    df['pv'].fillna(df['pv'].mean(),inplace=True)
    df['uv'].fillna(df['uv'].mean(),inplace=True)
#     df['pv'] = df['pv'].astype('int')
#     df['uv'] = df['uv'].astype('int')
    
    # 去掉部分特征
    df.drop('ID',axis=1, inplace=True)
    df.drop('city',axis=1,inplace=True)
    
    
    return df


In [3]:
def washData(df_train, df_test):
    """
    清洗数据
    """
    df_train = df_train[(df_train['area']<=700) & (df_train['tradeMoney']<=30000)]
#     df_train = df_train[df_train['tradeMoney']<=30000]
    
    return df_train, df_test

In [4]:
def feature(df):
    """
    特征
    """
    # 将houseType转化为‘房间数room’，‘厅数parlor’，‘卫生间数bathroom’
    def parseRoom(info, index):
        res = int(info[index*2])
        return res
    room='室'
    bathroom='厅'
    parlor='卫'
    df.insert(3,room,None)
    df.insert(4, parlor, None)
    df.insert(5, bathroom, None)
    df[room] = df['houseType'].apply(parseRoom, index=0)
    df[parlor] = df['houseType'].apply(parseRoom, index=1)
    df[bathroom] = df['houseType'].apply(parseRoom, index=2)
    df.drop('houseType', axis=1, inplace=True)
    
    # 房间总数
    df['roomsNum'] = df[room] + df[parlor] + df[bathroom]
    
#     交易月份 eg:2018/10/25
    df['_tradeMonth'] = df['tradeTime'].apply(lambda x: int(x.split('/')[1]))
    df.drop('tradeTime', axis=1, inplace=True)
    
    df['_trafficStationNums'] = df['subwayStationNum'] + df['busStationNum']
    df['_schoolNums'] = df['interSchoolNum'] + df['schoolNum'] + df['privateSchoolNum']
    df['_lifeHouseNums'] = df['gymNum'] + df['parkNum'] + df['bankNum'] + df['shopNum'] + df['mallNum'] + df['superMarketNum']
    
    
    categorical_feats = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration', 'region', 'plate']
    return df, categorical_feats

In [5]:
def getData():
    """
    获取数据
    """
    train = pd.read_csv('G:/compete/ai/city_rent_precent/data_set/train_data.csv')
    test = pd.read_csv('G:/compete/ai/city_rent_precent/data_set/test_a.csv')
    
    train, test = washData(train, test)
    train = parseData(train)
    test = parseData(test)
   
    train, col = feature(train)
    test, col = feature(test)
    
    target = train.pop('tradeMoney')
    features = train.columns
    categorical_feats = col
    
    return train, test, target, features, categorical_feats

In [6]:
def getInitData():
    """
    获取原始数据
    """
    train = pd.read_csv('G:/compete/ai/city_rent_precent/data_set/train_data.csv')
    test = pd.read_csv('G:/compete/ai/city_rent_precent/data_set/test_a.csv')
    return train, test

In [7]:
# 原始数据分析
itrain, itest = getInitData()

In [8]:
# 之后为清洗后数据
train, test, target, features, categorical_feats = getData()

In [9]:
# 数值特征
print(train.columns)
num_features=list()
for i in features:
    if train[i].dtype in ['int64','float64','int32']:
        num_features.append(i)
num_features

Index(['area', 'rentType', '室', '卫', '厅', 'houseFloor', 'totalFloor',
       'houseToward', 'houseDecoration', 'communityName', 'region', 'plate',
       'buildYear', 'saleSecHouseNum', 'subwayStationNum', 'busStationNum',
       'interSchoolNum', 'schoolNum', 'privateSchoolNum', 'hospitalNum',
       'drugStoreNum', 'gymNum', 'bankNum', 'shopNum', 'parkNum', 'mallNum',
       'superMarketNum', 'totalTradeMoney', 'totalTradeArea', 'tradeMeanPrice',
       'tradeSecNum', 'totalNewTradeMoney', 'totalNewTradeArea',
       'tradeNewMeanPrice', 'tradeNewNum', 'remainNewNum', 'supplyNewNum',
       'supplyLandNum', 'supplyLandArea', 'tradeLandNum', 'tradeLandArea',
       'landTotalPrice', 'landMeanPrice', 'totalWorkers', 'newWorkers',
       'residentPopulation', 'pv', 'uv', 'lookNum', 'roomsNum', '_tradeMonth',
       '_trafficStationNums', '_schoolNums', '_lifeHouseNums'],
      dtype='object')


['area',
 '室',
 '卫',
 '厅',
 'totalFloor',
 'buildYear',
 'saleSecHouseNum',
 'subwayStationNum',
 'busStationNum',
 'interSchoolNum',
 'schoolNum',
 'privateSchoolNum',
 'hospitalNum',
 'drugStoreNum',
 'gymNum',
 'bankNum',
 'shopNum',
 'parkNum',
 'mallNum',
 'superMarketNum',
 'totalTradeMoney',
 'totalTradeArea',
 'tradeMeanPrice',
 'tradeSecNum',
 'totalNewTradeMoney',
 'totalNewTradeArea',
 'tradeNewMeanPrice',
 'tradeNewNum',
 'remainNewNum',
 'supplyNewNum',
 'supplyLandNum',
 'supplyLandArea',
 'tradeLandNum',
 'tradeLandArea',
 'landTotalPrice',
 'landMeanPrice',
 'totalWorkers',
 'newWorkers',
 'residentPopulation',
 'pv',
 'uv',
 'lookNum',
 'roomsNum',
 '_tradeMonth',
 '_trafficStationNums',
 '_schoolNums',
 '_lifeHouseNums']

In [10]:

from sklearn.feature_selection import SelectKBest
# 回归问题:f_regression,mutual_info_regression 
from sklearn.feature_selection import f_regression
X, y = train[num_features], target
print(X.shape)
X_new = SelectKBest(f_regression, k=20).fit_transform(X, y)
print(X_new.shape)

(41271, 47)
(41271, 20)


In [11]:
params = {
    'num_leaves': 100,
    'min_data_in_leaf': 20,
    'min_child_samples':20,
    'objective': 'regression',
    'learning_rate': 0.01,
    "boosting": "gbdt",
    "feature_fraction": 0.8,
    "bagging_freq": 1,
    "bagging_fraction": 0.85,
    "bagging_seed": 23,
    "metric": 'rmse',
    "lambda_l1": 0.2,
    "nthread": 4,
}

In [12]:
folds = KFold(n_splits=5, shuffle=True, random_state=2333)

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 200)
    
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions_lgb += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits
    
print("CV Score: {:<8.5f}".format(r2_score(target, oof_lgb)))

fold 0
Training until validation scores don't improve for 200 rounds.
[500]	training's rmse: 804.351	valid_1's rmse: 1018.78
[1000]	training's rmse: 664.596	valid_1's rmse: 999.306
[1500]	training's rmse: 589.621	valid_1's rmse: 997.178
Early stopping, best iteration is:
[1318]	training's rmse: 613.081	valid_1's rmse: 996.645
fold 1
Training until validation scores don't improve for 200 rounds.
[500]	training's rmse: 789.32	valid_1's rmse: 1116.61
[1000]	training's rmse: 654.529	valid_1's rmse: 1104.79
Early stopping, best iteration is:
[1125]	training's rmse: 632.933	valid_1's rmse: 1103.53
fold 2
Training until validation scores don't improve for 200 rounds.
[500]	training's rmse: 789.715	valid_1's rmse: 1124.01
[1000]	training's rmse: 652.017	valid_1's rmse: 1105.78
Early stopping, best iteration is:
[1277]	training's rmse: 606.119	valid_1's rmse: 1105.1
fold 3
Training until validation scores don't improve for 200 rounds.
[500]	training's rmse: 797.793	valid_1's rmse: 1069.13
[1000

In [13]:
from sklearn.metrics import r2_score
def online_score(pred):
    print("预测结果最大值：{},预测结果最小值：{}".format(pred.max(),pred.min()))
    # a榜测分
    conmbine1 = pd.read_csv("G:/compete/ai/city_rent_precent/data_set/sub_a_913.csv",engine = "python",header=None)
    score1 = r2_score(list(map(lambda x:int(x),pred)), conmbine1.values)
    print("对比913分数:{}".format(score1))

In [21]:
from xgboost import XGBRegressor
from hyperopt import hp
from hyperopt import fmin
from hyperopt import tpe
from hyperopt import Trials
from sklearn.model_selection import cross_val_score
xgb_params = {'learning_rate': 0.01, 'n_estimators': 10000, 'max_depth': 8, 'seed': 2019,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0,'reg_alpha':0.7,
              'min_child_weight':0,
             }
# "min_child_weight"=3,'reg_alpha': 0.1487,
def XGB(xgb_params=xgb_params):
    
#     gamma 叶子分裂的最小损失，值越大，分裂越保守
# colsample_bytree 特征采样率
# reg_alpha L1
    xgb = XGBRegressor(**xgb_params)
    metric = cross_val_score(xgb,train,target,cv=5,scoring="r2").mean()
    return metric

xgb_space = {
#     "reg_alpha":hp.uniform("reg_alpha",0.1,1.0),#代表样本个数
#          "n_estimators":hp.randint("n_estimators",10), 
#          "learning_rate":hp.randint("learning_rate",6),  
#          "subsample":hp.randint("subsample",4),#[0,1,2,3] -> [0.7,0.8,0.9,1.0]
         "gamma":hp.randint("gamma",5) #
        }
# algo = partial(tpe.suggest,n_startup_jobs=1)
# best = fmin(GBM,space,algo=algo,max_evals=4)#max_evals表示想要训练的最大模型数量，越大越容易找到最优解

# Optimize
best = fmin(fn = XGB, space = xgb_space, algo = tpe.suggest, 
           max_evals = 4, trials = Trials())
print(best)

  0%|                                                                              | 0/4 [00:00<?, ?it/s, best loss: ?]


ValueError: DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields rentType, houseFloor, houseToward, houseDecoration, communityName, region, plate

In [23]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(train, target, train_size=0.9, random_state=1234)
model=CatBoostRegressor(iterations=900, depth=7, learning_rate=0.01, loss_function='RMSE')
model.fit(X_train, y_train,cat_features=categorical_feats,eval_set=(X_validation, y_validation),plot=True)
pred = model.predict(test)

CatBoostError: Bad value for num_feature[0,9]="XQ04178": Cannot convert 'b'XQ04178'' to float

In [None]:
有些参数没搞明白