-
Notifications
You must be signed in to change notification settings - Fork 7
/
blending.py
104 lines (90 loc) · 3.87 KB
/
blending.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: blending
Description :
Author : Administrator
date: 2018/5/10 0010
-------------------------------------------------
Change Activity:
2018/5/10 0010:
-------------------------------------------------
"""
__author__ = 'Administrator'
from __future__ import division
import numpy as np
# import load_data
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from config import *
from feature_integrate import *
fi = FeatureIntegrate()
def loadData():
    """Convenience function to load train / validation / test data.

    Reads the pre-computed feature CSVs and returns five objects:
        X_train      -- DataFrame of training features (columns = `features`)
        y_train      -- Series of training labels (column 'label')
        X_test       -- DataFrame of validation features
        y_test       -- Series of validation labels
        X_submission -- DataFrame of unlabeled test-set features

    NOTE(review): in the original every assignment in this body was
    commented out while the return statement still referenced
    X_train/y_train/X_test/y_test/test -- a guaranteed NameError on every
    call. Restored the CSV-loading path that matches the `features` list
    and the return expression. The alternative fi.train_feature_integrate
    path that was also commented out has been dropped; re-enable it from
    version control if that pipeline is the intended one.
    """
    print("Loading data...")
    # Feature columns expected in all three CSVs.
    features = ['zan', 'answer', 'perfect',
                'perf_ans', 'unperf_ans', 'q_index', 'q_inviteNum', 'q_answerNum',
                'q_answerRate', 'q_unanswerRate', 'q_perfectRate', 'q_unperfectRate',
                'qlabel_rate', 'u_index', 'u_inviteTimes',
                'u_invitelabel', 'u_answerlabel', 'u_answerTimes', 'u_answerRate',
                'u_noAnswerTimes', 'u_labelNum', 'common_word',
                'common_alpha', 'common_label']
    train = pd.read_csv("data/train_data_features.csv")
    X_train = train[features]
    y_train = train['label']
    val = pd.read_csv("data/val_data_features.csv")
    X_test = val[features]
    y_test = val['label']
    test = pd.read_csv("data/test_features.csv")
    return X_train, y_train, X_test, y_test, test[features]
if __name__ == '__main__':
    np.random.seed(0)  # seed to shuffle the train set

    X_train, y_train, X_test, y_test, X_submission = loadData()

    # Level-0 base learners; each contributes one probability column to the
    # meta-learner's input.
    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                       max_depth=6, n_estimators=50)]

    print("Creating train and test sets for blending.")
    # Column j holds model j's class-1 probability: holdout rows feed the
    # meta-learner fit; submission rows feed the final prediction.
    dataset_blend_train = np.zeros((X_test.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))
    for j, clf in enumerate(clfs):
        print(j, clf)
        clf.fit(X_train, y_train)
        # [:, 1] assumes a binary problem: probability of the positive class.
        dataset_blend_train[:, j] = clf.predict_proba(X_test)[:, 1]
        dataset_blend_test[:, j] = clf.predict_proba(X_submission)[:, 1]
        # Original had a bare `print` (Python 2 statement remnant) which is a
        # no-op expression in Python 3 -- fixed to actually emit a blank line.
        print()

    print("Blending.")
    # Level-1 meta-learner: fit on holdout predictions vs holdout labels.
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y_test)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print("Linear stretch of predictions to [0,1]")
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    print("Saving Results.")
    # np.arange instead of range() so vstack receives a proper ndarray row.
    tmp = np.vstack([np.arange(1, len(y_submission) + 1), y_submission]).T
    np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f',
               header='MoleculeId,PredictedProbability', comments='')
    to_sub = pd.read_csv("data/test_nolabel.txt")
    to_sub['label'] = y_submission
    to_sub.to_csv('submission_blend.csv', index=False)