# main2.py -- forked from joleo/pingan_behavior_predicting_driving_risk
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: main2
Description :
Author : Administrator
date: 2018/5/21 0021
-------------------------------------------------
Change Activity:
2018/5/21 0021:
-------------------------------------------------
"""
__author__ = 'joleo'
import operator
import xgboost as xgb
from feature_integrate2 import *
# import warnings
from config import *
# warnings.filterwarnings('ignore')
# Feature builder used further down for both the training and the test frame.
# NOTE(review): FeatureIntegrate is not imported by name in this file;
# presumably it arrives through `from feature_integrate2 import *` -- confirm.
fi = FeatureIntegrate()
# Wall-clock start time; the script's last line prints the elapsed time.
# NOTE(review): `time` is likewise expected to come from a star import -- confirm.
start = time.time()
# Optimized (narrow) column dtypes for the raw CSV files.
# The test map lists every raw column except Y; the training map is the
# same mapping with the Y column appended as float32.
test_dtypes = {
    'TERMINALNO': 'int32',
    'TIME': 'int32',
    'TRIP_ID': 'int16',
    'LONGITUDE': 'float32',
    'LATITUDE': 'float32',
    'DIRECTION': 'int16',
    'HEIGHT': 'float32',
    'SPEED': 'float32',
    'CALLSTATE': 'int8',
}
train_dtypes = dict(test_dtypes, Y='float32')
# Load the raw data.
train_data = pd.read_csv(path_train01, dtype=train_dtypes)
# Original note: 23,734,760 (presumably the total row count of the training
# file -- confirm). Only the rows labelled 0..15000000 are kept.
# `.loc` replaces the removed `DataFrame.ix` indexer: on the default integer
# index produced by read_csv both are label-based and include the end label,
# so the selected rows are the same.
train_data = train_data.loc[:15000000, :]
test_data = pd.read_csv(path_test01, dtype=test_dtypes)
# Build the model-ready frames from the raw rows with the project's
# FeatureIntegrate helper.
train = fi.train_feature_integrate(train_data)
test = fi.test_feature_integrate(test_data)
# Columns that must not be fed to the model: TERMINALNO, Y, and a
# hand-picked list of feature columns.
excluded_columns = {
    'TERMINALNO',
    'Y',
    'hour_count_max',
    'night_count_max',
    'user_direction_std',
    'call_unknow_state_per',
    'user_call_num_per',
    'user_lon_std',
    'user_lon_mean',
}
# Keep the remaining columns in their original order.
feature = [column for column in train.columns if column not in excluded_columns]
print(feature)
print(train[feature].shape)
# print(train[feature].info())
# XGBoost training parameters (gradient-boosted trees, RMSE-evaluated regression).
# NOTE(review): newer XGBoost releases spell this objective
# 'reg:squarederror' and replace `silent` with `verbosity`; left as-is here --
# confirm against the installed version before changing.
# An L2 setting ('lambda': 3) was left disabled in the original.
param = dict(
    objective='reg:linear',
    eval_metric='rmse',
    seed=27,
    booster='gbtree',
    min_child_weight=6,
    gamma=0.1,
    max_depth=5,
    eta=0.009,
    silent=1,
    subsample=0.65,
    colsample_bytree=0.35,
    scale_pos_weight=0.9005,
)
# Disabled variants kept from the original for reference:
#   ss_x = preprocessing.StandardScaler()
#   train_x_disorder = ss_x.fit_transform(train[feature].fillna(-1))
#   df_test = ss_x.transform(test[feature].fillna(-1))
#   df_train = xgb.DMatrix(train[feature].fillna(-1), train['Y'])

# One label per terminal: the first Y value found for each TERMINALNO.
# NOTE(review): this assumes `train` has exactly one row per TERMINALNO, in
# the same order as this groupby result -- confirm against FeatureIntegrate.
train_y = train_data.groupby(['TERMINALNO'])['Y'].first()

# Missing feature values are replaced by -1 before building the DMatrix inputs.
df_train = xgb.DMatrix(data=train[feature].fillna(-1), label=train_y)
df_test = xgb.DMatrix(data=test[feature].fillna(-1))

# Fit for a fixed 800 boosting rounds, then score the test matrix.
model = xgb.train(params=param, dtrain=df_train, num_boost_round=800)
y_pred = model.predict(df_test)
# Feature importance: (feature, fscore) pairs sorted by ascending score,
# with the scores normalized so that they sum to 1.
importance = sorted(model.get_fscore().items(), key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
fscore_total = df['fscore'].sum()
df['fscore'] = df['fscore'] / fscore_total
print(list(df['feature']), list(df['fscore']))
# Submission: one (Id, Pred) row per terminal in the test data.
# The groupby is used only to obtain one row per TERMINALNO; the TRIP_ID
# value it carries is not written out.
# NOTE(review): y_pred is attached positionally, so this assumes `test` holds
# one row per TERMINALNO in the same order as this groupby -- confirm.
# (Renamed from `id` so the builtin id() is no longer shadowed.)
terminal_ids = test_data.groupby(['TERMINALNO'], as_index=False)['TRIP_ID'].first()
result = pd.DataFrame(terminal_ids['TERMINALNO'])
result['Pred'] = y_pred
result = result.rename(columns={'TERMINALNO': 'Id'})
# The original's no-op self-assignment of the Pred column has been dropped.
result[['Id', 'Pred']].to_csv('model/result.csv', header=True, index=False)
# Report total wall-clock time since `start` was taken.
print("cost time: " + str(time.time() - start))