In [21]:
import pandas as pd
import numpy as np
import json

## Mean-Shift Clustering
from sklearn.cluster import MeanShift, estimate_bandwidth

## feature
import datetime

## train-test split
from sklearn.cross_validation import train_test_split

## Data Preprocessing

In [22]:
######## Select 2k points
def select_data_trajecory(df,k):
    ## Sort POLYLINE
    df.POLYLINE = df.POLYLINE.apply(json.loads)
    ## Find records > 10 times
    return df[df.POLYLINE.apply(len)>(2*k)]

## Extract Destination
def extract_dest_row(s):
    ## return new rows
    return pd.Series({'trip_id': s[0], 'longitude': s[4][-1][1], 'latitude': s[4][-1][0]})

## Mean-Shift Clustering

In [23]:
def trajecory_meanshift(df_destination):    
    ## Generate data
    X = df_destination.iloc[:,0:2].values
    
    # Compute clustering with MeanShift

    # The following bandwidth can be automatically detected using
    bandwidth = estimate_bandwidth(X, quantile=0.05, n_samples=1000)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)

    print("number of estimated clusters : %d" % n_clusters_)
    
    ## Add Label
    df_destination = pd.concat([df_destination.reset_index().drop(['index'],axis=1), 
                                pd.Series(ms.labels_)], axis=1)
    ## Sort Order
    df_destination.columns = ['latitude','longitude','trip_id','label']
    df_destination = df_destination[['trip_id','latitude','longitude','label']]

    return df_destination
    # ###############################################################################
    # # Plot result
    # import matplotlib.pyplot as plt
    # from itertools import cycle

    # plt.figure(1)
    # plt.clf()

    # colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    # for k, col in zip(range(n_clusters_), colors):
    #     my_members = labels == k
    #     cluster_center = cluster_centers[k]
    #     plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    #     plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
    #              markeredgecolor='k', markersize=14)
    # plt.title('Estimated number of clusters: %d' % n_clusters_)
    # plt.show()


## Feature Engineering

In [24]:
######## Categorical Feature Function
def feature_sort_datetime(df):    
    #### FDT_DATE change to date format
    df.FDT_DATE = df.FDT_DATE.apply(lambda x:datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
    ## extract datetime value
    df['monthday'] = df.FDT_DATE.apply(lambda x:x.day)
    df['weekday'] = df.FDT_DATE.apply(lambda x:x.isocalendar()[2])
    df['quater'] = df.FDT_DATE.apply(lambda x:(x.hour*4 + x.minute/15))
    df['hour'] = df.FDT_DATE.apply(lambda x:x.hour)
    df['minute'] = df.FDT_DATE.apply(lambda x:x.minute)
    df['second'] = df.FDT_DATE.apply(lambda x:x.second)
    ## add some feature tricks
    return df
    
## One-Hot Encoder
def feature_ohencoder(df, n_category):
    ## Converts categorical variables which less than n_categories into dummy variables.
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = df.index)

    # Investigate each feature column for the data
    for col, col_data in df.iteritems():

        # If data type is categorical, convert to dummy variables
        if len(np.unique(col_data)) < n_category:
            # Example: 'school' => 'school_GP' and 'school_MS'
            col_data = pd.get_dummies(col_data, prefix = col)  
        
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

## Embedding
def feature_embedding(df, n_category):
    ## Converts categorical variables which more than n_categories with embedding matrix.
    
    # Initialize new output DataFrame
    df = df.reset_index().drop('index',axis=1)
    output = pd.DataFrame(index = df.index)

    # Investigate each feature column for the data
    for col, col_data in df.iteritems():

        # If data type is categorical, convert to dummy variables
        n_dim = len(np.unique(col_data))
        if n_dim > n_category:
            # Example: 'school' => 'school_GP' and 'school_MS'
            col_data = pd.get_dummies(col_data, prefix = col)
            # Embedding Matrix
            np.random.seed(42)
            matrix_emd = np.random.random((n_dim, n_category))
            col_data = pd.DataFrame(np.dot(col_data, matrix_emd))
        # Collect the revised columns
        output = output.join(col_data)
    
    return output


In [25]:
######## Continues Feature Function
## Selecting 10 POLYLINE points as trajecory （５ first, 5 last） and drop destination points
def extract_points(s):
    ## s is one polyline
    ## return 10 points, except destination
    return pd.Series({'lat0': s[0][0], 'lat1': s[1][0], 'lat2': s[2][0], 'lat3': s[3][0], 'lat4': s[4][0], 
                      'lat5': s[-6][0], 'lat6': s[-5][0], 'lat7': s[-4][0], 'lat8': s[-3][0], 'lat9': s[-2][0], 
                      'lon0': s[0][1], 'lon1': s[1][1], 'lon2': s[2][1], 'lon3': s[3][1], 'lon4': s[4][1], 
                      'lon5': s[-6][1], 'lon6': s[-5][1], 'lon7': s[-4][1], 'lon8': s[-3][1], 'lon9': s[-2][1]
                     })

def feature_points(df):
    ## Extract POLYLINE points
    return df.apply(extract_points)



In [26]:
######## Normalization and Other Process
## drop index and label
def feature_cleaning(df):
    return df.drop(['GROUP_ID','FSTR_ID','FDT_DATE','POLYLINE'], axis=1)

## calculate normalization
def normalize(feature):
    return (feature-np.min(feature))/(np.max(feature)-np.min(feature))

## apply normalization to numerical variables
def feature_normalization(df):
    return df.apply(normalize,axis=0)
   

## Data pipeline

In [27]:
## Load all trajecory data
df_all_trajecory = pd.read_csv('df_all_trajecory.csv')

## Select dataframe which has at least 2k points in POLYLINE
df_trajecory = select_data_trajecory(df_all_trajecory, k=5)

## Extract label and save
df_destination = df_trajecory.apply(lambda s:extract_dest_row(s), axis=1)
#df_destination.to_csv('all_label.csv',index=False)

## Use Meanshift to cluster destinations, use trajecory_meanshift to add cluster columns
df_destination = trajecory_meanshift(df_destination)

######## Categorical Feature
## Sort feature: datetime
df_trajecory = feature_sort_datetime(df_trajecory)
df_trajecory_categorical = df_trajecory.drop(['GROUP_ID','FSTR_ID','FDT_DATE','POLYLINE',
                                           'hour','minute','second'],axis=1)
## Sort feature: Categorical, less than 10 catercory turn to one-hot encoder
## and espescially for meta data, use embedding convert to 10 dims
df_trajecory_categorical = feature_ohencoder(df_trajecory_categorical, 10)
## Sort feature: Embedding, only for meta data
df_trajecory_categorical = feature_embedding(df_trajecory_categorical, 10)

######## Numerical Feature
## Sort feature: Extract points
df_trajecory_numerical = feature_points(df_trajecory.POLYLINE)
## Sort feature: Other numerical data
df_trajecory_numerical = pd.concat([df_trajecory_numerical, 
                                    df_trajecory.loc[:,['hour','minute','second']]],axis=1)
## Sort feature: Normalization
df_trajecory_numerical = feature_normalization(df_trajecory_numerical)
df_trajecory_numerical = df_trajecory_numerical.reset_index().drop('index',axis=1)

# ######## Feature Cleaning
# ## Sort feature: drop index
# df_trajecory = feature_cleaning(df_trajecory)

######## Merge Categorical and Numerical Feature
## Merge 
df_trajecory = pd.concat([df_trajecory_categorical,
                          df_trajecory_numerical],axis=1)


number of estimated clusters : 161


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-

In [28]:
######## Train-Test Split
X = df_trajecory.values
y = df_destination.label.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [31]:
######## Save Data
import h5py
# Train data
f = h5py.File("Train.hd5", "w")
f.create_dataset("data", data=X_train,  compression="gzip", compression_opts=4)
f.create_dataset("label", data=y_train,  compression="gzip", compression_opts=4)
f.close()
 
#Test data
 
f = h5py.File("Test.hd5", "w")
f.create_dataset("data", data=X_test,  compression="gzip", compression_opts=4)
f.create_dataset("label", data=y_test,  compression="gzip", compression_opts=4)
f.close()

In [1]:
# ######## Load Data
# import h5py
# # Train data
# # 读方式打开文件
# file=h5py.File('./hdf5/Train.hd5','r')
# # 尽管后面有 '[:]', 但是矩阵怎么进去的就是怎么出来的，不会被拉长（matlab后遗症）
# X_train = file['data'][:]
# y_train = file['label'][:]
# file.close()
    

# # Test data
# # 读方式打开文件
# file=h5py.File('./hdf5/Test.hd5','r')
# # 尽管后面有 '[:]', 但是矩阵怎么进去的就是怎么出来的，不会被拉长（matlab后遗症）
# X_test = file['data'][:]
# y_test = file['label'][:]
# file.close()

## Train Multilayer Perceptron

In [32]:
from keras.models import Sequential  
from keras.layers.core import Dense, Dropout, Activation  
from keras.optimizers import SGD    
from keras.utils import np_utils

model = Sequential()  
model.add(Dense(input_dim=X_train.shape[1], output_dim=500, init='glorot_uniform')) # 输入层， 2kpoints + embedding  
model.add(Activation('tanh')) # 激活函数是tanh  
model.add(Dropout(0.5)) # 采用50%的dropout

model.add(Dense(input_dim=500, output_dim=500, init='glorot_uniform')) # 隐层节点500个  
model.add(Activation('relu'))  
model.add(Dropout(0.5))

model.add(Dense(input_dim=500, output_dim=len(np.unique(y)), init='glorot_uniform')) # 输出结果类别数量就是维度 
# model.add(Dense(input_dim=500, output_dim=200, init='glorot_uniform')) # 输出结果类别数量就是维度 
model.add(Activation('softmax')) # 最后一层用softmax

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) # 设定学习率（lr）等参数  
model.compile(loss='categorical_crossentropy', optimizer=sgd, class_mode='categorical') # 使用Havasine距离作为loss函数

#### for softmax layer
nb_classes = len(np.unique(y))
# y_train = np_utils.to_categorical(y_train, nb_classes)
# y_test = np_utils.to_categorical(y_test, nb_classes)
y_train = (np.arange(nb_classes) == y_train[:, None]).astype(int) # 参考上一篇文章，这里需要把index转换成一个one hot的矩阵  
y_test = (np.arange(nb_classes) == y_test[:, None]).astype(int)


In [33]:
# 开始训练，这里参数比较多。batch_size就是batch_size，nb_epoch就是最多迭代的次数， shuffle就是是否把数据随机打乱之后再进行训练
# verbose是屏显模式，官方这么说的：verbose: 0 for no logging to stdout, 1 for progress bar logging, 2 for one log line per epoch.
# 就是说0是不屏显，1是显示一个进度条，2是每个epoch都显示一行数据
# show_accuracy就是显示每次迭代后的正确率
# validation_split就是拿出百分之多少用来做交叉验证
model.fit(X_train, y_train, batch_size=200, nb_epoch=100, 
          shuffle=True, verbose=1, show_accuracy=True, validation_split=0.3)  
print 'test set'  
model.evaluate(X_test, y_test, batch_size=200, show_accuracy=True, verbose=1)

Train on 219182 samples, validate on 93936 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
 21200/21

KeyboardInterrupt: 