########Tradeshift Classification Challenge##############
#
# Code from Dmitry Dryomov with my comments and some tweaks, along with my tuning
#
# This was originally written in an Ipython notebook
#
##########################################################
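#
# The overall scheme, in two stacked levels:
#   1. Split the labeled data 50/50. On the "base" half, fit one random
#      forest on the numerical columns and one linear SVM on the one-hot
#      encoded categorical columns, per label.
#   2. Use those base models' outputs on the "meta" half as new features,
#      and fit a second-level random forest per label for the final
#      probabilities. Labels that are (nearly) constant get a constant
#      prediction instead.
#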
# Imports
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.externals import joblib
from sklearn.metrics import roc_auc_score, f1_score, log_loss, make_scorer
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
from datetime import datetime
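# Note: this script is Python 2 and targets the sklearn API of its era.
# sklearn.cross_validation and sklearn.externals.joblib have since been
# removed from the library; on a modern stack you would import
# train_test_split/cross_val_score from sklearn.model_selection and use
# the standalone joblib package instead.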
print "*"*50
print ""
print ""
print "########Tradeshift Classification Challenge##############"
print ""
######## Data Import and File Management ############
start = datetime.now()
print "Importing data and munging..."
train = pd.read_csv('train.csv', engine = 'c')
# Taking a sample of data for CV
sample_fraction = 1. # Set by hand for now; 1. means use the full training set
sample_size = int(sample_fraction * train.shape[0]) # currently unused, kept for the sampling code below
#ratio = 1/sample_fraction #redundant, but makes things easier to read
#train_sample = train[[hash(id) % ratio == 0 for id in train['id']]]
#train_sample.to_csv('train_sample.csv', index = False) # index False to prevent the extra ,
#del train #free some memory
# Now we merge labels with our file
labels = pd.read_csv('trainLabels.csv', engine = 'c')
#train_with_labels = pd.merge(train_sample, labels, on = 'id')
train_with_labels = pd.merge(train, labels, on = 'id')
# cleanup
del labels
#del train_sample # might be able to have this all pre-made before sending to ec2
del train
# loading test data
test = pd.read_csv('test.csv', engine = 'c')
######### Data Wrangling and Feature Encoding ################
# Initializations
X_numerical = []
X_test_numerical = []
vec = DictVectorizer()
names_categorical = []
# Encode yes/no's as numerical
train_with_labels.replace('YES', 1, inplace = True)
train_with_labels.replace('NO', 0, inplace = True)
train_with_labels.replace('nan', np.NaN, inplace = True)
test.replace('YES', 1, inplace = True)
test.replace('NO', 0, inplace = True)
test.replace('nan', np.NaN, inplace = True)
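# (YES/NO columns become 1/0 so they land in the numerical matrix below;
# literal 'nan' strings become real NaNs so the fillna calls can catch them.)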
# Encode rest of features
for name in train_with_labels.columns:
    if name.startswith('x'): # only selecting features, not labels
        # Find the dominant data type in a column: count the Python type of
        # every value and keep the most common one. A column that is mostly
        # strings is treated as categorical; everything else as numerical.
        column_type, _ = max(Counter(map(lambda x: str(type(x)), train_with_labels[name])).items(),
                             key = lambda x: x[1])
        if column_type == str(str):
            train_with_labels[name] = map(str, train_with_labels[name])
            test[name] = map(str, test[name])
            names_categorical.append(name)
            #print name, len(np.unique(train_with_labels[name]))
        else:
            X_numerical.append(train_with_labels[name].fillna(-999))
            X_test_numerical.append(test[name].fillna(-999))
# Creating arrays from the numerical columns and categorical
X_numerical = np.column_stack(X_numerical)
X_test_numerical = np.column_stack(X_test_numerical)
# One-hot encode the categorical columns: each row becomes a {column: value}
# dict and DictVectorizer turns those into a sparse indicator matrix. (The
# .T.to_dict().values() trick relies on the default integer index to keep
# the rows in their original order.)
X_sparse = vec.fit_transform(train_with_labels[names_categorical].T.to_dict().values())
X_test_sparse = vec.transform(test[names_categorical].T.to_dict().values())
X_numerical = np.nan_to_num(X_numerical)
X_test_numerical = np.nan_to_num(X_test_numerical)
print "Munging complete... compressing."
print "Time so far:", str(datetime.now() - start)
print "-"*20
# Compression
joblib.dump((X_numerical, X_sparse, X_test_numerical, X_test_sparse),
            'X.dump', compress = 1)
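# The dump can be reloaded in a later session to skip the munging step above:
#   X_numerical, X_sparse, X_test_numerical, X_test_sparse = joblib.load('X.dump')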
######### Base Classifier Level #############
# greater_is_better is left at its default, so cross_val_score returns raw
# log-loss values here (lower is better), which is what the printouts assume
log_loss_scorer = make_scorer(log_loss, needs_proba = True)
#some definitions
y_columns = [name for name in train_with_labels.columns if name.startswith('y')]
X_numerical_base, X_numerical_meta, X_sparse_base, X_sparse_meta, y_base, y_meta = \
    train_test_split(X_numerical, X_sparse, train_with_labels[y_columns].values,
                     test_size = 0.5) # Note these are random splits 50/50
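# The 50/50 split is what keeps the stacking honest: the base models are fit
# only on the "base" half, so the meta-features they produce for the "meta"
# half are out-of-sample and the second-level forest never sees predictions
# a model made on its own training data.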
X_meta = []
X_test_meta = []
# training base layer and building meta layer
print "Training base layer, then building meta layer...."
for i in range(y_base.shape[1]): # a prediction for each label
    print i
    y = y_base[:,i]
    if len(np.unique(y)) == 2: # skip labels that are constant in the base half
        # Random forest on numerical features
        rf = RFC(n_estimators = 70, max_features = 15, n_jobs = -1)
        rf.fit(X_numerical_base, y)
        X_meta.append(rf.predict_proba(X_numerical_meta)[:,1])
        X_test_meta.append(rf.predict_proba(X_test_numerical)[:,1])
        # SVC on categorical features
        svm = LinearSVC()
        svm.fit(X_sparse_base, y)
        X_meta.append(svm.decision_function(X_sparse_meta))
        X_test_meta.append(svm.decision_function(X_test_sparse))
    print " -- Total time so far is:", str(datetime.now() - start)
X_meta = np.column_stack(X_meta)
X_test_meta = np.column_stack(X_test_meta)
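# X_meta now has two columns per non-constant label: the random forest's
# predicted probability and the SVM's decision margin. These are the stacked
# features the second-level forests train on, alongside the raw numerical
# columns.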
print " Meta layer built! Starting training..."
print "-" * 40
######## Meta Level Training and Prediction #########
p_test = []
score_tot = []
for i in range(y_base.shape[1]):
    y = y_meta[:,i]
    # A label is treated as (nearly) constant when either class has fewer
    # than 4 examples in the meta half
    constant = Counter(y)
    constant = constant[0] < 4 or constant[1] < 4
    predicted = None
    if constant:
        # Best constant: when almost all the labels are the same, just
        # predict the empirical rate of the positive class
        constant_pred = np.mean(list(y_base[:,i]) + list(y_meta[:,i]))
        predicted = np.ones(X_test_meta.shape[0]) * constant_pred
        print "%d is constant like: %f" % (i, constant_pred)
    else: # fit a random forest to the meta level
        rf = RFC(n_estimators = 100, max_features = 17, n_jobs = -1) # estimators were at 20... maybe go way higher
        rf.fit(np.hstack([X_meta, X_numerical_meta]), y)
        predicted = rf.predict_proba(np.hstack([X_test_meta, X_test_numerical]))
        predicted = predicted[:,1]
        scores = cross_val_score(rf, np.hstack([X_meta, X_numerical_meta]), y,
                                 cv = 4, n_jobs = -1, scoring = log_loss_scorer)
        print i, 'RF log-loss: %.4f +/- %.4f, mean = %.6f' % (np.mean(scores), np.std(scores), np.mean(predicted))
        print " "
        # only scored labels contribute to the CV average; constant labels
        # add essentially no loss anyway
        score_tot.append(np.mean(scores))
        ######################
        # Feature Importance check
        ######################
        importances = rf.feature_importances_
        # create table of the most important
        indices = np.argsort(importances)[::-1]
        print("Feature Ranking:")
        for f in range(10):
            print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
    p_test.append(predicted)
    print " -- Total time so far is:", str(datetime.now() - start)
    print " "
#--------------------
p_test = np.column_stack(p_test)
print '-' * 40
# 33 is the number of target labels (y1..y33); constant labels contribute
# (essentially) zero loss, so they are counted in the denominator anyway
print "Overall log loss on CV is: ", np.sum(score_tot) / 33
print "Total time so far is:", str(datetime.now() - start)
print '... now saving predictions...'
######## Save Results ######################
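# Submission format: one row per (document id, label) pair, e.g. "12_y3,0.250000",
# with exact 0/1 predictions written as plain integers.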
import gzip
def save_predictions(name, ids, predictions):
    out = gzip.open(name, 'w')
    print >>out, 'id_label,pred'
    # use the arguments rather than the globals so the helper is reusable
    for id, id_predictions in zip(ids, predictions):
        for y_id, pred in enumerate(id_predictions):
            if pred == 0 or pred == 1:
                pred = str(int(pred))
            else:
                pred = '%.6f' % pred
            print >>out, '%d_y%d,%s' % (id, y_id + 1, pred)
    out.close()

save_predictions('quick_start.csv.gz', test['id'].values, p_test)
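# Optional sanity check (a quick sketch, not part of the original pipeline):
# re-read the first few lines of the gzipped submission to confirm the header
# and the id_label,pred row format before uploading.
check = gzip.open('quick_start.csv.gz', 'r')
for _ in range(3):
    print check.readline().strip()
check.close()
print "Done. Total run time:", str(datetime.now() - start)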