-
Notifications
You must be signed in to change notification settings - Fork 7
/
cnn_model.py
190 lines (157 loc) · 7.31 KB
/
cnn_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: cnn_model
Description :
Author : joleo
date: 2018/5/9 0009
-------------------------------------------------
Change Activity:
2018/5/9 0009:
-------------------------------------------------
"""
__author__ = 'joleo'
import time
from sklearn import preprocessing
import tensorflow as tf
import numpy as np
from feature_integrate import *
from config import *
fi = FeatureIntegrate()
start = time.time()
start_time = time.time()
# 变厚矩阵
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
# 偏置
def bias_variable(shape):
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
# 卷积处理 变厚过程
def conv2d(x, W):
# stride [1, x_movement, y_movement, 1] x_movement、y_movement就是步长
# Must have strides[0] = strides[3] = 1 padding='SAME'表示卷积后长宽不变
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def normalize_cols(m):
col_max = m.max(axis=0)
col_min = m.min(axis=0)
return (m - col_min) / (col_max - col_min)
def fit(pre_features_list, act_Y, test_features, start_time):
train_x_disorder = pre_features_list
train_y_disorder = act_Y.reshape(-1, 1)
# train_x_disorder_3 = train_x_disorder[:, 3:6]
# train_x_disorder = np.column_stack([train_x_disorder, train_x_disorder_3]) # 随意给x增加了1列,x变为16列,可以reshape为4*4矩阵了 没啥用,就是凑个正方形
# test_features_3 = test_features[:, 3:6]
# test_features = np.column_stack([test_features, test_features_3]) # 随意给x增加了1列,x变为16列,可以reshape为4*4矩阵了 没啥用,就是凑个正方形
print(test_features.shape)
# 随机挑选
# train_x_disorder, test_x_disorder, train_y_disorder, test_y_disorder = train_test_split(x, y, train_size=0.8, random_state=33)
# # 数据标准化
ss_x = preprocessing.StandardScaler()
train_x_disorder = ss_x.fit_transform(train_x_disorder)
test_features = ss_x.transform(test_features)
# train_x_disorder = normalize_cols(train_x_disorder)
# test_features = normalize_cols(test_features)
# define placeholder for inputs to network
xs = tf.placeholder(tf.float32, [None, 36]) # 原始数据的维度:16
ys = tf.placeholder(tf.float32, [None, 1]) # 输出数据为维度:1
keep_prob = tf.placeholder(tf.float32) # dropout的比例
x_image = tf.reshape(xs, [-1, 6, 6, 1]) # 原始数据16变成二维图片4*4
# 初始化算法模型
## conv1 layer ##第一卷积层
W_conv1 = weight_variable([2, 2, 1, 72]) # patch 2x2, in size 1, out size 32,每个像素变成32个像素,就是变厚的过程
b_conv1 = bias_variable([72])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) # output size 2x2x32,长宽不变,高度为32的三维图像
# h_pool1 = max_pool_2x2(h_conv1) # output size 2x2x32 长宽缩小一倍
## conv2 layer ##第二卷积层
W_conv2 = weight_variable([2, 2, 72, 144]) # patch 2x2, in size 32, out size 64
b_conv2 = bias_variable([144])
h_conv2 = tf.nn.relu(conv2d(h_conv1, W_conv2) + b_conv2) # 输入第一层的处理结果 输出shape 4*4*64
# 池化层
# max_pool2 = tf.nn.max_pool(h_conv2, ksize=[1, 2, 2, 1],
# strides=[1, 2, 2, 1], padding='SAME')
#
# # 转换为1*n层,进行平整化
# final_conv_shape = max_pool2.get_shape().as_list()
# final_shape = final_conv_shape[1] * final_conv_shape[2] * final_conv_shape[3]
# flat_output = tf.reshape(max_pool2, [final_conv_shape[0], final_shape])
## fc1 layer ## full connection 全连接层
W_fc1 = weight_variable([6 * 6 * 144, 648]) # 4x4 ,高度为64的三维图片,然后把它拉成512长的一维数组
b_fc1 = bias_variable([648])
h_pool2_flat = tf.reshape(h_conv2, [-1, 6 * 6 * 144]) # 把4*4,高度为64的三维图片拉成一维数组 降维处理
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) # 把数组中扔掉比例为keep_prob的元素
## fc2 layer ## full connection
W_fc2 = weight_variable([648, 1]) # 512长的一维数组压缩为长度为1的数组
b_fc2 = bias_variable([1]) # 偏置
# 最后的计算结果
prediction = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
# prediction = tf.nn.relu(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
# 计算 predition与y 差距 所用方法很简单就是用 suare()平方,sum()求和,mean()平均值
cross_entropy = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction), reduction_indices=[1]))
# 0.01学习效率,minimize(loss)减小loss误差
train_step = tf.train.AdamOptimizer(0.0005).minimize(cross_entropy)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# 训练200次
for i in range(3000):
sess.run(train_step, feed_dict={xs: train_x_disorder, ys: train_y_disorder, keep_prob: 0.7})
loss = sess.run(cross_entropy,feed_dict={xs: train_x_disorder, ys: train_y_disorder, keep_prob: 0.7})
print(i, 'the loss =', loss) # 输出loss值
if (time.time() - start_time) > 20000:
break
if loss < 4:
break
prediction_value = sess.run(prediction, feed_dict={xs: test_features, keep_prob: 0.7})
return prediction_value
# 为了后面复现结果,为numpy和tensorflow设置随机种子
seed = 1514
np.random.seed(seed)
tf.set_random_seed(seed)
# 载入数据
train_data = pd.read_csv(path_train01)
train_data = train_data.ix[:15000000, :]
test_data = pd.read_csv(path_test01)
train = fi.train_feature_integrate(train_data)
test = fi.test_feature_integrate(test_data)
# 36个特征
# feature = [x for x in train.columns if x not in ['TERMINALNO','Y','hour_count_max','night_count_max',
# 'user_speed_skew', 'user_height_skew', 'direction_skew',
# ' hour_2_per','user_distance']]
feature = [x for x in train.columns if x not in ['TERMINALNO','Y','hour_count_max'
,'hour_2_per','night_count_max','user_distance']]
print(feature)
print(train[feature].shape) # 33
train_x = train[feature].fillna(-1)
train_y = train['Y']
test_x = test[feature].fillna(-1)
train_x = train_x.values
train_y = train_y.values
test_x = test_x.values
prediction_value = fit(train_x, train_y, test_x, start_time)
# pre_result = {
# "Id": [],
# "Pred": []
# }
pred = []
result = pd.DataFrame(test['TERMINALNO'])
user_id = list(set(result['TERMINALNO'].tolist()))
# for i in range(len(user_id)):
# user = user_id[i]
# pre_result['Id'].append(user)
# y1 = round(prediction_value[i][0], 9)
# pre_result['Pred'].append(y1)
# df = pd.DataFrame(pre_result, columns=['Id', 'Pred'])
#
# df['Pred'].map(lambda x: abs(x))#map(lambda x: abs(x)/10 if x < 0 else x )
for i in range(len(user_id)):
user = user_id[i]
y1 = round(prediction_value[i][0], 9)
pred.append(y1)
result['Pred'] = pred
result = result.rename(columns={'TERMINALNO':'Id'})
result.loc[:, 'Pred'] = result['Pred'].map(lambda x: abs(x)/10 if x < 0 else x )
# df = pd.DataFrame(pre_result, columns=['Id', 'Pred'])
result.to_csv(path_test_out01,header=True,index=False)
print("cost time: " + str(time.time() - start))