In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy import sparse
import warnings
import time
import sys
import os
import re
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
from sklearn.metrics import mean_squared_error

In [2]:
def get_drop_speed(V, t):
    try:
        return V / t
    except:
        return 0

def id_features(s):
    return float(s.split('_')[1])

def minus_temperature(t1, t2):
    return t2 - t1


def TimeTransHours(StartTime, EndTime, DefaultHours):
    try:
        StartHour, StartMinute, _ = StartTime.split(':')
        EndHour, EndMinute, _ = EndTime.split(':')
        Hours = (int(EndHour) * 60 + int(EndMinute) - int(StartMinute) - int(
            StartHour) * 60) / 60.0
        Hours = Hours % 24
    except:
        Hours = DefaultHours
    return Hours


def GetTimeFeatures(se, DefaultHours):
    try:
        sh, sm, eh, em = re.findall(r"\d+\.?\d*", se)
    except:
        return DefaultHours

    try:
        if int(sh) > int(eh):
            tm = (int(eh) * 3600 + int(em) * 60 - int(sm) * 60 - int(sh) * 3600) / 3600.0 + 24
        else:
            tm = (int(eh) * 3600 + int(em) * 60 - int(sm) * 60 - int(sh) * 3600) / 3600.0
    except:
        return DefaultHours

    return tm

In [3]:
def FeatureEngineering(train):

    # 删除类别唯一的特征
    train.drop(['A1', 'A4', 'A8', 'A13', 'A16', 'A18', 'A23', 'B3'], axis=1, inplace=True)

    # 填充空缺特征
    train.fillna(0, inplace=True)

    # 转换时间特征
    train['A20'] = train.apply(lambda df: GetTimeFeatures(df['A20'], DefaultHours=0.0), axis=1)
    # train['A24'] = train.apply(lambda df: GetTimeFeatures(df['A24'], DefaultHours=0.0), axis=1)
    train['A28'] = train.apply(lambda df: GetTimeFeatures(df['A28'], DefaultHours=0.0), axis=1)
    train['B4'] = train.apply(lambda df: GetTimeFeatures(df['B4'], DefaultHours=0.0), axis=1)
    train['B9'] = train.apply(lambda df: GetTimeFeatures(df['B9'], DefaultHours=0.001), axis=1)
    train['B10'] = train.apply(lambda df: GetTimeFeatures(df['B10'], DefaultHours=0.001), axis=1)
    train['B11'] = train.apply(lambda df: GetTimeFeatures(df['B11'], DefaultHours=0.001), axis=1)

    # 添加时长特征
    train['A9'] = train.apply(lambda df: TimeTransHours(df['A9'], '00:00:00', DefaultHours=12.0), axis=1)

    # 删除无用的时间特征
    train.drop(['A5', 'A7', 'A24', 'A11', 'A14', 'A26', 'B5', 'B7'], axis=1, inplace=True)

    # 添加温度变化特征
    train['delta_temprature_1'] = train.apply(lambda df: minus_temperature(df['A10'], df['A15']), axis=1)
    train['delta_temprature_2'] = train.apply(lambda df: minus_temperature(df['A15'], df['A17']), axis=1)
    train['delta_temprature_4'] = train.apply(lambda df: minus_temperature(df['A25'], df['A27']), axis=1)
    train['delta_temprature_5'] = train.apply(lambda df: minus_temperature(df['A27'], df['B6']), axis=1)

    # 删除无用的温度特征
    train.drop(['A15', 'A17', 'A25'], axis=1, inplace=True)

    # 添加物质的量特征
    train['n_B1_B2'] = train.apply(lambda df: df['B1'] * df['B2'], axis=1)
    train['n_B13_B14'] = train.apply(lambda df: df['B13'] * df['B14'], axis=1)

    # 添加滴加速率
    train['v_B9'] = train.apply(lambda df: df['B12'] / df['B9'], axis=1) # 体积/时长=速率
    train['v_B10'] = train.apply(lambda df: df['B12'] / df['B10'], axis=1)
    train['v_B11'] = train.apply(lambda df: df['B12'] / df['B11'], axis=1)

    # 添加样本id特征
    train['id'] = train.apply(lambda df: id_features(df['样本id']), axis=1)
    train.drop(['样本id'], axis=1, inplace=True)

    return train


In [4]:
def FeaturesUnion(train, test, feature_categories):
    '''
    添加新特征，将收率进行分箱，然后构造每个特征中的类别对应不同收率的均值
    '''
    train['intTarget'] = pd.cut(train['收率'], 5, labels=False)
    train = pd.get_dummies(train, columns=['intTarget'])
    li = ['intTarget_' + str(idx) for idx in range(5)]
    mean_features = []

    target = train['收率']
    train.drop(['收率'], axis=1, inplace=True)

    # for data in [train, test]:
    #     for f in feature_categories:
    #         data[f] = data[f].map(dict(zip(data[f].unique(), range(0, data[f].nunique()))))

    for f1 in feature_categories:
        for f2 in li:
            col_name = f1 + "_" + f2 + '_mean'
            mean_features.append(col_name)
            order_label = train.groupby([f1])[f2].mean()
            # train[col_name] = train[f1].map(order_label)
            for df in [train, test]:
                df[col_name] = df[f1].map(order_label)

    # 保持train和test特征数目一致
    train.drop(li, axis=1, inplace=True)

    return train, test, target, mean_features

In [6]:
train = pd.read_csv('jinnan_round1_train_20181227.csv', encoding = 'gb18030')
test = pd.read_csv('jinnan_round1_testA_20181227.csv', encoding = 'gb18030')
# Feature Engineering
train = FeatureEngineering(train)
test = FeatureEngineering(test)
feature_categories = [name for name in train.columns if name != '收率' and name != 'id']
train, test, target, mean_features = FeaturesUnion(train, test, feature_categories)


    # 分离变量
X_train = train.values
y_train = target.values
X_test = test.values

TypeError: ("unsupported operand type(s) for -: 'int' and 'str'", 'occurred at index 0')