In [1]:
"""
__file__
    UserAnalysis.ipynb
__author__
    Xu Xiaoming< xuxiaoming@mobike.com >
"""

import os
import time,datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
###################
## Load raw data ##
###################
# Read the training and test CSV files into DataFrames and report their
# (rows, columns) shapes so the reader can sanity-check the load.
train_csv_path = '../data/stock_train_data_20170910.csv'
test_csv_path = '../data/stock_test_data_20170910.csv'

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

print("Dimension of df_train {}".format(df_train.shape))
print("Dimension of df_test {}".format(df_test.shape))

Dimension of df_train (321674, 93)
Dimension of df_test (202757, 90)


In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of
# the raw training frame; the table is shown as the cell's rich output.
print("Basic statistical description:")
train_summary = df_train.describe()
train_summary

Basic statistical description:


Unnamed: 0,id,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature82,feature83,feature84,feature85,feature86,feature87,weight,label,group,era
count,321674.0,321674.0,321674.0,321674.0,321674.0,321674.0,321674.0,321674.0,321674.0,321674.0,...,321674.0,321674.0,321674.0,321674.0,321674.0,321674.0,321674.0,321674.0,321674.0,321674.0
mean,160836.5,-0.011686,0.060095,-0.232937,0.023156,0.04478,-0.044353,-0.012175,-0.189911,0.142191,...,0.020911,0.013494,-0.005148,-0.066604,0.045275,-0.092397,4.017835,0.529987,13.133169,10.062529
std,92859.429583,1.005595,0.830666,0.88828,0.997032,0.915609,0.917459,0.346067,0.57763,0.838974,...,0.994632,0.992614,0.955955,0.661531,0.837193,0.685269,3.722541,0.499101,7.210285,5.724583
min,0.0,-1.509611,-5.019445,-8.092439,-1.812636,-1.32035,-1.36024,-33.60758,-1.20172,-5.093346,...,-1.563695,-3.992317,-1.487627,-3.377006,-9.075827,-3.802815,0.0,0.0,1.0,1.0
25%,80418.25,-0.980688,-0.41385,-0.768747,-0.654777,-0.526413,-0.670985,-0.007072,-0.584645,-0.369214,...,-0.648786,-0.631479,-0.6666,-0.432611,-0.344524,-0.50176,1.0,0.0,7.0,5.0
50%,160836.5,0.034485,0.160273,-0.156013,-0.143727,-0.150132,-0.265433,0.016758,-0.317542,0.279495,...,-0.176873,0.08769,-0.216436,-0.240765,0.19919,-0.208628,3.0,1.0,13.0,10.0
75%,241254.75,0.897501,0.562477,0.371126,0.505312,0.364521,0.339597,0.02442,0.054169,0.815006,...,0.462036,0.723823,0.419545,0.236219,0.588676,0.239939,6.0,1.0,18.0,15.0
max,321673.0,1.814713,7.05981,5.609212,140.64794,96.090794,52.296501,25.543277,8.503426,1.363585,...,106.727015,2.625545,66.661489,9.926261,4.638628,8.71073,63.0,1.0,28.0,20.0


In [4]:
#########################
## Data pre-processing ##
#########################
# Pull the target and the per-row weight out of the frame before the
# corresponding columns are removed from the feature matrix.
labels = df_train['label']
weights = df_train['weight']

# Remove the non-feature columns and turn `group` into a string so that
# get_dummies expands it into one indicator column per distinct group value.
non_feature_columns = ['id', 'weight', 'label', 'era']
df_train = (
    df_train
    .drop(columns=non_feature_columns)
    .astype({'group': str})
)
df_train = pd.get_dummies(df_train, columns=['group'])

# Two-column one-hot target, matching the 2-unit softmax output of the model.
labels = np_utils.to_categorical(labels, num_classes=2)

In [5]:
# Start the submission table from the test-set ids. This has to run before
# the `id` column is dropped from df_test during test pre-processing.
submission = pd.DataFrame({'id': df_test['id']})

In [6]:
# Apply the same feature preparation to the test set as to the training set:
# drop the identifier and one-hot encode `group`.
df_test = df_test.drop(columns=['id'])
df_test['group'] = df_test['group'].astype(str)
df_test = pd.get_dummies(df_test, columns=['group'])

# get_dummies only creates columns for the group values present in the frame
# it is given, so the test frame is not guaranteed to have the same columns,
# in the same order, as the training frame. The model is fed `.values`
# (purely positional), so force the exact training layout here: indicator
# columns missing from the test set are added as all-zero, and columns the
# model never saw are discarded.
# NOTE(review): assumes the feature columns carry the same names in both CSV
# files and that df_train has already been pre-processed - confirm.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

In [7]:
#X_train, X_test, y_train, y_test = train_test_split(df_train, labels,test_size = 0.25 ,random_state=0)
#y_train = np_utils.to_categorical(y_train, num_classes=2)
#y_test = np_utils.to_categorical(y_test, num_classes=2)

In [None]:
#################
## Build model ##
#################

# Input width is taken from the prepared training matrix rather than being
# hard-coded, so it stays correct if the number of one-hot `group` columns
# (or any other feature column) changes.
n_features = df_train.shape[1]

# Fully connected classifier: two 256-unit ReLU hidden layers, each followed
# by 20% dropout, and a 2-unit softmax output matching the one-hot labels.
model = Sequential([
    Dense(256, input_dim=n_features),
    Activation('relu'),
    Dropout(0.2),
    Dense(256),
    Activation('relu'),
    Dropout(0.2),
    Dense(2),
    Activation('softmax'),
])

# Explicit optimizer instance so its hyper-parameters are visible and tunable.
rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

# Accuracy is reported alongside the cross-entropy loss during training.
model.compile(optimizer=rmsprop,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
####################
## Training model ##
####################
# Stop once the validation loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=1,
)

# 25% of the training rows are held out for validation via validation_split.
# NOTE(review): `weights` is extracted during pre-processing but never passed
# to fit (e.g. as sample_weight) - confirm whether that is intentional.
model.fit(
    df_train.values,
    labels,
    batch_size=100,
    epochs=10,
    validation_split=0.25,
    callbacks=[early_stopping],
)

Train on 241255 samples, validate on 80419 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [None]:
# Evaluate on the held-out part of the training data.
#
# X_test / y_test are only assigned in a commented-out cell, so referencing
# them raises NameError on a fresh kernel. Instead, rebuild the same hold-out
# set that model.fit used: Keras' validation_split takes the trailing
# fraction of the arrays (before shuffling), splitting at
# int(n_samples * (1 - validation_split)).
validation_fraction = 0.25  # keep in sync with validation_split in model.fit
split_at = int(len(df_train) * (1 - validation_fraction))

X_val = df_train.values[split_at:]
y_val = labels[split_at:]

loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
print("Accuracy = {:.2f}".format(accuracy))

In [None]:
#########################
## Combine predictions ##
#########################
# The softmax output has one column per class; column 1 is the predicted
# probability of label 1, which is what gets stored in the submission.
class_probabilities = model.predict(df_test.values)
submission['proba'] = pd.Series(class_probabilities[:, 1])

In [None]:
##################
## Write result ##
##################
# Write the submission to ./submission/sub<YYYYmmdd_HHMMSS>.csv.
#
# The timestamp comes from the `time` module imported at the top of the
# notebook, which avoids a mid-notebook `from datetime import datetime` that
# would rebind the already-imported `datetime` module to the class.
submission_dir = './submission'

# to_csv does not create missing parent directories, so make sure it exists.
if not os.path.isdir(submission_dir):
    os.makedirs(submission_dir)

timestamp = time.strftime('%Y%m%d_%H%M%S')
submission.to_csv('{}/sub{}.csv'.format(submission_dir, timestamp), index=False)

In [None]:
# Display the raw two-column class-probability array for the test set.
test_matrix = df_test.values
model.predict(test_matrix)