In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ccf-data/ccf_data/test_1.csv
/kaggle/input/ccf-data/ccf_data/train_all.csv
/kaggle/input/ccf-data/ccf_data/train_2.csv
/kaggle/input/ccf-data/ccf_data/test_2.csv
/kaggle/input/ccf-data/ccf_data/w2v/1_total_fee.csv
/kaggle/input/ccf-data/ccf_data/w2v/4_total_fee.csv
/kaggle/input/ccf-data/ccf_data/w2v/2_total_fee.csv
/kaggle/input/ccf-data/ccf_data/w2v/3_total_fee.csv


In [2]:
import os
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.metrics import f1_score

# Root of the competition data and the sub-directory holding the w2v tables.
path = "/kaggle/input/ccf-data/ccf_data/"
w2v_path = path + "w2v/"

# Second-round train/test files plus the first-round training file.
train = pd.read_csv(path + "train_2.csv")
test = pd.read_csv(path + "test_2.csv")
train_first = pd.read_csv(path + "train_all.csv")

# data_type marks where a row came from: 0 for the second-round files,
# 1 for the first-round training file.
for frame, origin_tag in ((train, 0), (test, 0), (train_first, 1)):
    frame["data_type"] = origin_tag

# Stack everything into a single frame.  fillna(0) is applied to every column;
# per the original note its purpose is to give the test rows current_service == 0.
data = pd.concat([train, test, train_first], ignore_index=True)
data = data.fillna(0)
data["label"] = data.current_service.astype(int)
# Replace the literal string "\N" with the sentinel 999 everywhere, which lets
# gender be cast to int.
data = data.replace("\\N", 999)
data["gender"] = data.gender.astype(int)

# Original categorical features.
origin_cate_feature = [
    'service_type',
    'complaint_level',
    'contract_type',
    'gender',
    'is_mix_service',
    'is_promise_low_consume',
    'many_over_bill',
    'net_service',
]

# Original numeric features.
origin_num_feature = [
    '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
    'age', 'contract_time',
    'former_complaint_fee', 'former_complaint_num',
    'last_month_traffic', 'local_caller_time', 'local_trafffic_month', 'month_traffic',
    'online_time', 'pay_num', 'pay_times', 'service1_caller_time', 'service2_caller_time',
]

# Cast every numeric feature to float.
for num_col in origin_num_feature:
    data[num_col] = data[num_col].astype(float)

# Join the per-fee w2v tables onto the data set, keyed on the fee value itself.
w2v_features = []
for fee_col in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    embedding = pd.read_csv(f"{w2v_path}/{fee_col}.csv")
    # Keep one row per fee value so the left merge cannot multiply rows.
    embedding = embedding.drop_duplicates([fee_col])
    # Every column except the join key is a new feature.
    embedding_cols = list(embedding)
    embedding_cols.remove(fee_col)
    w2v_features.extend(embedding_cols)
    data = pd.merge(data, embedding, on=fee_col, how="left")
# Names of the count features created so far, in creation order.  The list is
# concatenated into the numeric feature list further down the notebook.
count_feature_list = []


def feature_count(data, features=None):
    """Add a column holding how many rows share each value combination of `features`.

    The new column is named ``count_<f1>_<f2>...`` (any ``add_`` substring is
    stripped from the feature names) and is registered once in the module-level
    ``count_feature_list``.

    Args:
        data: DataFrame containing every column named in `features`.
        features: list of column names to group by.

    Returns:
        A new DataFrame with the count column attached via a left merge.  If
        `features` contains duplicates, a message is printed and `data` is
        returned unchanged.

    Raises:
        ValueError: if `features` is empty or omitted.
    """
    features = list(features) if features is not None else []
    if not features:
        raise ValueError("feature_count needs at least one feature column")
    if len(set(features)) != len(features):
        print("equal feature !!!")
        return data

    new_feature = "_".join(["count"] + [name.replace("add_", "") for name in features])

    # Drop a stale copy of the column (e.g. when the cell is re-run) without
    # mutating the caller's frame.
    if new_feature in data.columns:
        data = data.drop(columns=new_feature)

    # One row per distinct key combination, with its frequency.
    counts = data.groupby(features).size().reset_index(name=new_feature)
    data = data.merge(counts, how="left", on=features)

    # Register the feature only once so re-runs do not duplicate it downstream.
    if new_feature not in count_feature_list:
        count_feature_list.append(new_feature)
    return data

# Columns that get a frequency ("count") feature, in the order the features
# are created: the four monthly fees, the former complaint fee, the payment
# amount, the contract duration, last month's carried-over traffic and the
# time on the network.
count_base_columns = [
    '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
    'former_complaint_fee',
    'pay_num',
    'contract_time',
    'last_month_traffic',
    'online_time',
]

# Single-column counts.
for base_col in count_base_columns:
    data = feature_count(data, [base_col])

# The same counts again, within each service type and then each contract type.
for group_col in ['service_type', 'contract_type']:
    for base_col in count_base_columns:
        data = feature_count(data, [group_col, base_col])

    
# Difference / ratio features derived from the raw numeric columns.
diff_feature_list = ['diff_total_fee_1', 'diff_total_fee_2', 'diff_total_fee_3', 'last_month_traffic_rest',
                     'rest_traffic_ratio',
                     'total_fee_mean', 'total_fee_max', 'total_fee_min', 'total_caller_time', 'service2_caller_ratio',
                     'local_caller_ratio',
                     'total_month_traffic', 'month_traffic_ratio', 'last_month_traffic_ratio', 'pay_num_1_total_fee',
                     '1_total_fee_call_fee', '1_total_fee_call2_fee', '1_total_fee_trfc_fee']

# Difference of the billed amount between consecutive months.
data['diff_total_fee_1'] = data['1_total_fee'] - data['2_total_fee']
data['diff_total_fee_2'] = data['2_total_fee'] - data['3_total_fee']
data['diff_total_fee_3'] = data['3_total_fee'] - data['4_total_fee']

# Difference between the payment amount and the current month's billed amount.
data['pay_num_1_total_fee'] = data['pay_num'] - data['1_total_fee']

# Traffic difference, floored at zero.  clip() replaces the former chained
# assignment `data[col][mask] = 0`, which raised SettingWithCopyWarning and is
# not guaranteed to write back into `data`.
data['last_month_traffic_rest'] = (data['month_traffic'] - data['last_month_traffic']).clip(lower=0)
# NOTE(review): 15 / 1024 looks like a price per traffic unit - confirm.
# Rows with 1_total_fee == 0 produce inf/NaN here.
data['rest_traffic_ratio'] = (data['last_month_traffic_rest'] * 15 / 1024) / data['1_total_fee']

total_fee = [str(month) + '_total_fee' for month in range(1, 5)]

# Mean, max and min of the billed amount over the four months.
data['total_fee_mean'] = data[total_fee].mean(axis=1)
data['total_fee_max'] = data[total_fee].max(axis=1)
data['total_fee_min'] = data[total_fee].min(axis=1)

# Call durations: total, and the share taken by service2 and by local calls.
data['total_caller_time'] = data['service2_caller_time'] + data['service1_caller_time']
data['service2_caller_ratio'] = data['service2_caller_time'] / data['total_caller_time']
data['local_caller_ratio'] = data['local_caller_time'] / data['total_caller_time']

# Cumulative traffic and the share of each component.
data['total_month_traffic'] = data['local_trafffic_month'] + data['month_traffic']
data['month_traffic_ratio'] = data['month_traffic'] / data['total_month_traffic']
data['last_month_traffic_ratio'] = data['last_month_traffic'] / data['total_month_traffic']

# Billed amount minus an estimated charge for call time / traffic.
# NOTE(review): 0.15 and 0.3 are presumably per-unit prices - confirm.
data['1_total_fee_call_fee'] = data['1_total_fee'] - data['service1_caller_time'] * 0.15
data['1_total_fee_call2_fee'] = data['1_total_fee'] - data['service2_caller_time'] * 0.15
data['1_total_fee_trfc_fee'] = data['1_total_fee'] - (
        data['month_traffic'] - 2 * data['last_month_traffic']) * 0.3

# For service_type 1 the traffic-fee difference is blanked out (set to missing).
data.loc[data.service_type == 1, '1_total_fee_trfc_fee'] = None

# Categorical features (same list object as origin_cate_feature).
cate_feature = origin_cate_feature
# Numeric features: raw columns, then count, difference and w2v features.
num_feature = origin_num_feature + count_feature_list + diff_feature_list + w2v_features

# Give every feature the dtype the model expects.
for cat_col in cate_feature:
    data[cat_col] = data[cat_col].astype('category')
for num_col in num_feature:
    data[num_col] = data[num_col].astype(float)

# Full feature list, categorical columns first.
feature = cate_feature + num_feature
print(len(feature), feature)

# Drop the rows whose current service is 999999.
data = data[data.label != 999999]

# First-round training set: this is what the model is fitted on.
first_round_rows = data.data_type == 1
train_x = data.loc[first_round_rows, feature]
train_y = data.loc[first_round_rows, 'label']

# Second-round training set (labelled rows only): held out under the test_* names.
second_round_labelled_rows = (data.data_type == 0) & (data.label != 0)
test_x = data.loc[second_round_labelled_rows, feature]
test_y = data.loc[second_round_labelled_rows, 'label']

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


110 ['service_type', 'complaint_level', 'contract_type', 'gender', 'is_mix_service', 'is_promise_low_consume', 'many_over_bill', 'net_service', '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'age', 'contract_time', 'former_complaint_fee', 'former_complaint_num', 'last_month_traffic', 'local_caller_time', 'local_trafffic_month', 'month_traffic', 'online_time', 'pay_num', 'pay_times', 'service1_caller_time', 'service2_caller_time', 'count_1_total_fee', 'count_2_total_fee', 'count_3_total_fee', 'count_4_total_fee', 'count_former_complaint_fee', 'count_pay_num', 'count_contract_time', 'count_last_month_traffic', 'count_online_time', 'count_service_type_1_total_fee', 'count_service_type_2_total_fee', 'count_service_type_3_total_fee', 'count_service_type_4_total_fee', 'count_service_type_former_complaint_fee', 'count_service_type_pay_num', 'count_service_type_contract_time', 'count_service_type_last_month_traffic', 'count_service_type_online_time', 'count_contract_type_1_total_f

训练

In [3]:
# Hyper-parameters of the multiclass LightGBM model, gathered in one place.
lgb_params = dict(
    boosting_type="gbdt",
    objective='multiclass',
    metric="None",
    num_leaves=120,
    max_depth=-1,
    n_estimators=2500,
    learning_rate=0.035,
    reg_alpha=0,
    reg_lambda=0.,
    subsample=0.9,
    subsample_freq=1,
    colsample_bytree=0.5,
    random_state=2018,
    n_jobs=10,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)

# Fit on the first-round data.  No eval_set is passed, and the recorded output
# of the print below is an empty dict.
lgb_model.fit(train_x, train_y, categorical_feature=cate_feature)
print(lgb_model.best_score_)



defaultdict(<class 'collections.OrderedDict'>, {})


stacking feature

In [4]:
# Output directory for the stacking features.  It has its own variable so the
# input-data `path` defined earlier is no longer overwritten.
stacking_path = "/kaggle/working/stack"
stacking_train_file = os.path.join(stacking_path, 'train.csv')
stacking_test_file = os.path.join(stacking_path, 'test.csv')

# Regenerate unless both CSVs already exist.  Checking the files rather than
# the directory means an empty or half-written directory no longer skips the export.
if not (os.path.exists(stacking_train_file) and os.path.exists(stacking_test_file)):
    print(stacking_path)
    os.makedirs(stacking_path, exist_ok=True)

    labelled_second_round = (data.data_type == 0) & (data.label != 0)
    unlabelled_rows = data.label == 0

    train_proba = lgb_model.predict_proba(test_x[feature])  # second-round training set
    test_proba = lgb_model.predict_proba(data.loc[unlabelled_rows, feature])  # second-round test set
    print(len(train_proba), len(test_proba))

    # .copy() so the probability columns are added to independent frames
    # rather than to slices of `data`.
    stacking_train = data.loc[labelled_second_round, ['user_id']].copy()
    stacking_test = data.loc[unlabelled_rows, ['user_id']].copy()

    # One column per predicted class, taken from the probability matrix width
    # instead of a hard-coded 11.
    for class_idx in range(train_proba.shape[1]):
        stacking_train['stacking_' + str(class_idx)] = train_proba[:, class_idx]
        stacking_test['stacking_' + str(class_idx)] = test_proba[:, class_idx]

    stacking_train.to_csv(stacking_train_file, index=False)
    stacking_test.to_csv(stacking_test_file, index=False)

# Per-class F1 of the first-round model on the labelled second-round data.
score = f1_score(y_true=test_y, y_pred=lgb_model.predict(test_x), average=None)
print(score)

/kaggle/working/stack
374653 160566
[0.81838353 0.94105106 0.94755196 0.99769692 0.99688882 0.96329085
 0.93226959 0.82596785 0.76647419 0.84641275 0.53931993]


In [5]:
# Show what the notebook has written to the working directory.
working_dir_entries = os.listdir("/kaggle/working")
print(working_dir_entries)

['__notebook__.ipynb', 'stack']
