In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn import cluster
from scipy.cluster.hierarchy import dendrogram
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import os

In [2]:
def convertToProb(df_hit,p):
    series_sid1 = pd.Series([-1]).append(df_hit['SID'][:-1],ignore_index=True)
    queryBegin = (df_hit['SID'].reset_index(drop=True) - series_sid1)!=0

    series_th = pd.Series([0]).append(df_hit['Threshold'][:-1],ignore_index=True)
    series_th[queryBegin] = 0
    diff_th = (df_hit['Threshold'].reset_index(drop=True) - series_th)

    #getting max values of threshold of each query id
    grouped_th = df_hit[['SID','Threshold']].groupby('SID')
    grouped_th_max = grouped_th.transform(np.max)
    grouped_th_max_rsp = grouped_th_max.values.reshape((grouped_th_max.shape[0],))
    hit_weight = diff_th/grouped_th_max_rsp * (1-p)
    df_prob = df_hit.drop('Threshold',axis=1)
    df_prob.loc[:,1:] = np.array(df_prob.drop('SID',axis=1)) * hit_weight[:, np.newaxis]
    df_final_prob = df_prob.groupby('SID').sum()
    df_final_prob = df_final_prob + p
    return df_final_prob

In [3]:
#read data
def concatToNumpy(strPoint, idTraj, idPoint):
    point = strPoint.split(',')
    pointFloat = np.array([float(point[0]),float(point[0])])
    pointFloat = np.insert(pointFloat, 0, idTraj)
    pointFloat = np.insert(pointFloat, 1, idPoint)
    return pointFloat

In [4]:
def generateFeatures(allTraj,pivotNum=1):
    #findEnd
    seriesId1 = allTraj['tid'].append(pd.Series([-1]),ignore_index=True)
    seriesId2 = pd.Series([-1]).append(allTraj['tid'],ignore_index=True)
    endTrip = (seriesId2 - seriesId1) != 0
    endTrip = (endTrip[1:]).reset_index(drop=True)

    #find pivot
    nextPoint = allTraj.loc[1:,['x','y']].append(pd.DataFrame(np.array([[0,0]]),columns=['x','y'])).reset_index(drop=True)
    nextPoint = nextPoint.rename(columns={'x':'x1','y':'y1'})
    pivotSearch = pd.concat([allTraj, nextPoint], axis=1)
    pivotSearch = pivotSearch[~endTrip]
    pivotSearch['dist'] = np.sqrt(np.power((pivotSearch[['x','y']].values -  pivotSearch[['x1','y1']].values),2).sum(axis=1))
    pivots = pivotSearch.sort_values(by=['tid','dist']).groupby('tid').tail(pivotNum)[['tid','pid','x','y']]
    pivots = pivots.sort_values(by=['tid','pid'])

    feature = pd.DataFrame(np.array(range(0,int(allTraj['tid'].max()+1))),columns=['tid'])
    feature[['x_head','y_head']] = allTraj.groupby(['tid']).head(1)[['x','y']].reset_index(drop=True)
    feature[['x_tail','y_tail']] = allTraj.groupby(['tid']).tail(1)[['x','y']].reset_index(drop=True)
    for i in range(pivotNum):
        feature[['x_'+str(i),'y_'+str(i)]] = pivots.groupby('tid').nth(i)[['x','y']]
        setDefault = feature['x_'+str(i)].isna()
        if i < pivotNum/2:
            feature.loc[setDefault,['x_'+str(i),'y_'+str(i)]] = feature.loc[setDefault,['x_head','y_head']].values
        else:
            feature.loc[setDefault,['x_'+str(i),'y_'+str(i)]] = feature.loc[setDefault,['x_tail','y_tail']].values
    return feature

In [5]:
def plot_dendrogram(model, **kwargs):

    # Children of hierarchical clustering
    children = model.children_

    # Distances between each pair of children
    # Since we don't have this information, we can use a uniform one for plotting
    distance = np.arange(children.shape[0])

    # The number of observations contained in each cluster level
    no_of_observations = np.arange(2, children.shape[0]+2)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

In [6]:
def prepareLearning(num_features,data):
    
    # input data 
    train_dataset = data.iloc[:,1:1+num_features]
    train_labels = data.iloc[:,1+num_features:]
    #test_dataset = mnist.test.images 
    #test_labels = mnist.test.labels 
    valid_dataset = data.iloc[:,1:1+num_features] 
    valid_labels = data.iloc[:,1+num_features:]
    return train_dataset, train_labels, valid_dataset,valid_labels

In [8]:
df = pd.read_csv("../HelloLater.csv",sep=";",index_col=False)


exampleTraj = open("../examples/src/main/resources/trajectory.txt")
i = 0
allTraj = [np.array([-1,-1,-1,-1])]
for line in exampleTraj:
    points = line.split(";")
    idTraj = np.full((len(points)),i)
    idPoints = range(len(points))
    traj = [concatToNumpy(point,i,idPoint) for point,idPoint in zip(points,idPoints)]
    i = i+1
    allTraj = np.concatenate((allTraj,traj))
allTraj = allTraj[1:]
allTraj = pd.DataFrame(allTraj,columns=['tid','pid','x','y'])

In [24]:
def buildDataset(df, max_threshold, num_clusters, numPartition, p):
    df_global = df[['SID','Threshold','GlobalID']]
    df_global_limited = df_global.loc[df_global['Threshold']<=max_threshold]
    df_global_limited = df_global_limited.reset_index(drop=True)

    df_global_limited['hit'] = 1
    df_global_threshold = df_global_limited.groupby(['SID','Threshold','GlobalID']).count().unstack().reset_index()
    df_global_threshold['hit'] = df_global_threshold['hit'].notna() * 1

    agglo = cluster.FeatureAgglomeration(n_clusters=num_clusters)
    agglo.fit( df_global_threshold.loc[df_global_threshold['Threshold']==5,['hit']]['hit'])
    agglolabel = np.array(agglo.labels_).reshape(-1,1)
    agglolabel = np.tile(agglolabel,num_clusters)
    clusters = np.array(range(num_clusters))
    clusters = np.repeat(clusters,(numPartition)).reshape(num_clusters,numPartition).transpose()
    clusterer = ((agglolabel == clusters)*1)

    clustered_hit = (np.matmul(df_global_threshold['hit'].astype(float),clusterer) >0 )*1

    df_global_thresholdc = pd.DataFrame()
    df_global_thresholdc['SID'] = df_global_threshold['SID']
    df_global_thresholdc['Threshold'] = df_global_threshold['Threshold']
    df_global_thresholdc = pd.concat([df_global_thresholdc, pd.DataFrame(clustered_hit,columns=range(num_clusters))],axis=1)

    df_globals = {}
    df_prob = {}
    
    df_globals[0] = df_global_thresholdc
    df_prob[0] = convertToProb(df_global_thresholdc,p)
    
    for c in range(num_clusters):
        col_select = np.insert(np.array(agglo.labels_==c),0,[True,True])
        df_globals[1+c] = df_global_threshold.loc[:,col_select]
        df_prob[1+c] = convertToProb(df_globals[1+c],p)
    return agglo.labels_, df_global_threshold, df_globals, df_prob

In [31]:
def create_model(train_dataset, train_labels,num_features,num_labels):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(30, input_shape=(num_features,), activation=tf.nn.relu, name="input1"),
        tf.keras.layers.Dense(num_labels, activation= tf.nn.sigmoid, name="input2")
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.0005),
                  loss=tf.keras.losses.MeanAbsoluteError(), metrics=['accuracy'])
    model.fit(train_dataset, train_labels, epochs=1000,verbose=0)
    print(model.evaluate(train_dataset, train_labels))
    return model

In [38]:
def buildModel(df_probs, num_clusters, num_pivot):

    models = {}
    
    feature = generateFeatures(allTraj,num_pivot)
    
    data = pd.concat([feature, df_probs[0]], axis=1,join="inner")
    data = data.reset_index(drop=True)
    
    # number of features 
    num_features = 4 + (num_pivot*2)
    
    # number of target labels 
    num_labels = num_clusters
    
    train_dataset, train_labels, _,_ =  prepareLearning(num_features,data)      
    
    models[0] = create_model(train_dataset, train_labels,num_features,num_labels)
    
    for c in range(num_clusters):
        data = pd.concat([feature, df_probs[1+c]], axis=1,join="inner")
        data = data.reset_index(drop=True)
        
        num_labels = df_probs[1+c].shape[1]

        train_dataset, train_labels, _,_ =  prepareLearning(num_features, data)
        
        print("training cluster "+str(c))
        models[1+c] = create_model(train_dataset, train_labels,num_features,num_labels)
        
    return models

In [165]:
def buildModelSingle(df_prob, num_labels, num_pivot):
    feature = generateFeatures(allTraj,num_pivot)
    
    data = pd.concat([feature, df_prob], axis=1,join="inner")
    data = data.reset_index(drop=True)
    
    # number of features 
    num_features = 4 + (num_pivot*2)
    
    train_dataset, train_labels, _,_ =  prepareLearning(num_features,data)      
    
    modelSingle = create_model(train_dataset, train_labels,num_features,num_labels)

    return modelSingle

In [196]:
def create_model_multilabel(train_dataset, train_labels,num_features,num_labels):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(30, input_shape=(num_features,), activation=tf.nn.relu, name="input1"),
        tf.keras.layers.Dense(num_labels, activation= tf.nn.sigmoid, name="input2")
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.0005),
                  loss=tf.nn.softmax_cross_entropy_with_logits, metrics=['accuracy'])
    model.fit(train_dataset, train_labels, epochs=1000,verbose=0)
    print(model.evaluate(train_dataset, train_labels))
    return model

In [None]:
def buildModelSingleMultiLabel(df_ml, num_labels, num_pivot):
    feature = generateFeatures(allTraj,num_pivot)
    
    data = pd.concat([feature, df_prob], axis=1,join="inner")
    data = data.reset_index(drop=True)
    
    # number of features 
    num_features = 4 + (num_pivot*2)
    
    train_dataset, train_labels, _,_ =  prepareLearning(num_features,data)      
    
    modelSingle = create_model(train_dataset, train_labels,num_features,num_labels)

    return modelSingle

In [199]:
df_main.reset_index(drop=True)

Unnamed: 0_level_0,SID,Threshold,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit
GlobalID,Unnamed: 1_level_1,Unnamed: 2_level_1,0,1,2,3,4,5,6,7,...,71,72,73,74,75,76,77,78,79,80
0,0,0.001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0.005,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0.010,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0.050,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0.100,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0.500,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1.000,0,0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
7,0,2.500,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,0,0
8,0,5.000,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,0,0
9,0,7.500,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [198]:
pd.merge(df_main.reset_index(drop=True), feature, left_on='SID', right_on='tid')



Unnamed: 0,"(SID, )","(Threshold, )","(hit, 0)","(hit, 1)","(hit, 2)","(hit, 3)","(hit, 4)","(hit, 5)","(hit, 6)","(hit, 7)",...,"(hit, 78)","(hit, 79)","(hit, 80)",tid,x_head,y_head,x_tail,y_tail,x_0,y_0
0,0,0.001,0,0,0,0,0,0,0,0,...,0,0,0,0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783
1,0,0.005,0,0,0,0,0,0,0,0,...,0,0,0,0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783
2,0,0.010,0,0,0,0,0,0,0,0,...,0,0,0,0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783
3,0,0.050,0,0,0,0,0,0,0,0,...,0,0,0,0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783
4,0,0.100,0,0,0,0,0,0,0,0,...,0,0,0,0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783
5,0,0.500,0,0,0,0,0,0,0,0,...,0,0,0,0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783
6,0,1.000,0,0,0,1,0,1,1,1,...,0,0,0,0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783
7,0,2.500,1,1,1,1,1,1,1,1,...,1,0,0,0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783
8,0,5.000,1,1,1,1,1,1,1,1,...,1,0,0,0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783
9,0,7.500,1,1,1,1,1,1,1,1,...,1,1,1,0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783


In [166]:
df_prob0 = convertToProb(df_main,0.005)
modelSingle = buildModelSingle(df_prob0, 81, 1)

[0.10982807935805618, 0.020999275]


In [35]:
agglo_labels, df_main, df_clusters, df_prob_clusters = buildDataset(df, 10, 5, 81, 0.005)
feature = generateFeatures(allTraj,num_pivot)
models_prob_dist1 = buildModel(df_prob_clusters, 5, 1)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


[0.30659685919977114, 0.59594494]
training cluster 0
[0.17574080693212477, 0.07530775]
training cluster 1
[0.19515321687522097, 0.0043446776]
training cluster 2
[0.07438285861388226, 0.020275163]
training cluster 3
[0.15727005907584582, 0.10789283]
training cluster 4
[0.23483560229452688, 0.00072411296]


In [182]:
def predict_dist0(df_sid_th,model0, df_features, max_threshold, p):
    sids = np.array(df_sid_th['SID'])
    
    df_sid_th['pThreshold'] = (max_threshold - df_sid_th['Threshold'])/max_threshold * (1-p) + p
    more_threshold = df_sid_th['Threshold'] > max_threshold
    if more_threshold.sum()>0:
        df_sid_th.loc[more_threshold, 'pThreshold'] = (df_sid_th.loc[more_threshold, 'Threshold'] - 
                                                        max_threshold)/df_sid_th.loc[more_threshold, 'Threshold'] * p
    
    data_features = df_features.iloc[sids].reset_index(drop=True)
    X_ = data_features.iloc[:,1:]
    y_predict = model0.predict(X_)
    
    
    bc_threshold = np.repeat(df_sid['pThreshold'].values.reshape(-1,1),y_predict.shape[1],axis=1)
    f_clusters = (y_predict >= bc_threshold) * 1
    
    
    print(bc_threshold[:5,:10])
    print(y_predict[:5,:10])
    print(f_clusters[:5,:10])
    
    if len(bc_threshold) > 0:
        return df_sid_th
    df_sid_th.iloc[:,2+label_in_clusters] = f_clusters
    return df_sid_th

In [183]:
df_sid = df_main.iloc[100:150]
df_sid.loc[:,2:] = 0

pred_result0 = predict_dist0(df_sid,modelSingle, feature,10,0.005)
precision_score(df_main.iloc[100:150,2:].astype(int), pred_result0.iloc[:,2:-1].astype(int),average='weighted')

[[0.75125   0.75125   0.75125   0.75125   0.75125   0.75125   0.75125
  0.75125   0.75125   0.75125  ]
 [0.5025    0.5025    0.5025    0.5025    0.5025    0.5025    0.5025
  0.5025    0.5025    0.5025   ]
 [0.25375   0.25375   0.25375   0.25375   0.25375   0.25375   0.25375
  0.25375   0.25375   0.25375  ]
 [0.005     0.005     0.005     0.005     0.005     0.005     0.005
  0.005     0.005     0.005    ]
 [0.9999005 0.9999005 0.9999005 0.9999005 0.9999005 0.9999005 0.9999005
  0.9999005 0.9999005 0.9999005]]
[[0.7498924  0.7525251  0.898261   0.75218487 0.9006065  0.8971565
  0.8997523  0.8991964  0.89904594 0.8991199 ]
 [0.7498924  0.7525251  0.898261   0.75218487 0.9006065  0.8971565
  0.8997523  0.8991964  0.89904594 0.8991199 ]
 [0.7498924  0.7525251  0.898261   0.75218487 0.9006065  0.8971565
  0.8997523  0.8991964  0.89904594 0.8991199 ]
 [0.7498924  0.7525251  0.898261   0.75218487 0.9006065  0.8971565
  0.8997523  0.8991964  0.89904594 0.8991199 ]
 [0.75278276 0.7532802  0.900

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


0.0

In [181]:
pred_result0.iloc[:5,2:11]

Unnamed: 0_level_0,hit,hit,hit,hit,hit,hit,hit,hit,hit
GlobalID,0,1,2,3,4,5,6,7,8
100,0,0,0,0,0,0,0,0,0
101,0,0,0,0,0,0,0,0,0
102,0,0,0,0,0,0,0,0,0
103,0,0,0,0,0,0,0,0,0
104,0,0,0,0,0,0,0,0,0


In [180]:
df_main.iloc[100:105,2:11]

Unnamed: 0_level_0,hit,hit,hit,hit,hit,hit,hit,hit,hit
GlobalID,0,1,2,3,4,5,6,7,8
100,0,0,1,0,0,0,0,0,0
101,0,1,1,0,0,1,0,1,1
102,1,1,1,1,1,1,1,1,1
103,1,1,1,1,1,1,1,1,1
104,0,0,0,0,0,0,0,0,0


In [171]:
pred_result0

Unnamed: 0_level_0,SID,Threshold,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,hit,pThreshold
GlobalID,Unnamed: 1_level_1,Unnamed: 2_level_1,0,1,2,3,4,5,6,7,...,72,73,74,75,76,77,78,79,80,Unnamed: 21_level_1
100,10,2.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.75125
101,10,5.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.5025
102,10,7.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.25375
103,10,10.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.005
104,11,0.001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.9999
105,11,0.005,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.999502
106,11,0.01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.999005
107,11,0.05,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.995025
108,11,0.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.99005
109,11,0.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.95025


In [134]:
def predict_dist1(df_sid_th,models, df_features, max_threshold, p):
    sids = np.array(df_sid_th['SID'])
    
    df_sid_th['pThreshold'] = (max_threshold - df_sid_th['Threshold'])/max_threshold * (1-p) + p
    more_threshold = df_sid_th['Threshold'] > max_threshold
    if more_threshold.sum()>0:
        df_sid_th.loc[more_threshold, 'pThreshold'] = (df_sid_th.loc[more_threshold, 'Threshold'] - 
                                                        max_threshold)/df_sid_th.loc[more_threshold, 'Threshold'] * p
    
    data_features = df_features.iloc[sids].reset_index(drop=True)
    X_ = data_features.iloc[:,1:]
    y_predict = models[0].predict(X_)
    
    bc_threshold = np.repeat(df_sid['pThreshold'].values.reshape(-1,1),y_predict.shape[1],axis=1)
    f_clusters = (y_predict >= bc_threshold)

    for i in range(5):
        cluster_loc = (f_clusters[:,i] == True)
        X_c = X_.loc[cluster_loc]
        df_sid_th_c = df_sid_th.loc[cluster_loc,['SID','pThreshold']]
        y_predict_c = models[i+1].predict(X_c)
        label_in_clusters = np.where(agglo_labels == i)[0]
        
        if isinstance(y_predict_c, list):
            bc_threshold_c = df_sid_th_c['pThreshold'].values.reshape(-1,1)
        else:
            bc_threshold_c = np.repeat(df_sid_th_c['pThreshold'].values
                                   .reshape(-1,1),y_predict_c.shape[1],axis=1)
            
        f_clusters_c = (y_predict_c >= bc_threshold_c) * 1
        
        if len(bc_threshold_c) == 0:
            continue
        df_sid_th.iloc[cluster_loc,2+label_in_clusters] = f_clusters_c
    return df_sid_th

In [135]:
df_sid = df_main.iloc[100:150]
df_sid.loc[:,2:] = 0

pred_result = predict_dist1(df_sid,models_prob_dist1, feature,10,0.005)
precision_score(df_main.iloc[100:150,2:].astype(int), pred_result.iloc[:,2:-1].astype(int),average='weighted')  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [138]:
df_main.iloc[100:105,5:10]

Unnamed: 0_level_0,hit,hit,hit,hit,hit
GlobalID,3,4,5,6,7
100,0,0,0,0,0
101,0,0,1,0,1
102,1,1,1,1,1
103,1,1,1,1,1
104,0,0,0,0,0


In [140]:
pred_result.iloc[:5,5:10]

Unnamed: 0_level_0,hit,hit,hit,hit,hit
GlobalID,3,4,5,6,7
100,1,1,1,1,1
101,1,1,1,1,1
102,1,1,1,1,1
103,1,1,1,1,1
104,0,0,0,0,0


In [156]:
from sklearn.metrics import precision_score, recall_score, accuracy_score



  'precision', 'predicted', average, warn_for)


0.6815619601010078

In [42]:
    
feature = generateFeatures(allTraj,1)
data = pd.concat([feature, df_prob_clusters[0]], axis=1,join="inner")
data = data.reset_index(drop=True)

# number of features 
num_features = 4 + (1*2)

# number of target labels 
num_labels = 5

train_dataset, train_labels, _,_ =  prepareLearning(num_features,data)     

X_tensor = train_dataset.loc[3:3,:].values
y_tensor = train_labels.loc[3:3,:].values

y_predict = models_prob_dist1[0].predict(train_dataset)

In [43]:
y_predict

array([[7.6950073e-01, 9.0267682e-01, 1.0000000e+00, 0.0000000e+00,
        0.0000000e+00],
       [7.7120399e-01, 9.0306771e-01, 1.0000000e+00, 0.0000000e+00,
        0.0000000e+00],
       [7.6904267e-01, 9.0219307e-01, 1.0000000e+00, 0.0000000e+00,
        0.0000000e+00],
       ...,
       [7.6769722e-01, 9.0169990e-01, 1.0000000e+00, 0.0000000e+00,
        0.0000000e+00],
       [7.7043104e-01, 9.0289736e-01, 1.0000000e+00, 0.0000000e+00,
        0.0000000e+00],
       [7.7111262e-01, 9.0311474e-01, 1.0000000e+00, 0.0000000e+00,
        2.6973174e-12]], dtype=float32)

In [275]:
for i in range(6):
    dirname = "./saved_models"+str(i)
    if os.path.exists(dirname) is False:
        os.makedirs(dirname)
    saved_model_path = tf.contrib.saved_model.save_keras_model(models[i], dirname)


Consider using a TensorFlow optimizer from `tf.train`.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['serving_default']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: ./saved_models0\1561471647\saved_model.pb

Consider using a TensorFlow optimizer from `tf.train`.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['serving_default']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INF

In [276]:
models[0].summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input1 (Dense)               (None, 30)                210       
_________________________________________________________________
input2 (Dense)               (None, 5)                 155       
Total params: 365
Trainable params: 365
Non-trainable params: 0
_________________________________________________________________



Consider using a TensorFlow optimizer from `tf.train`.
Instructions for updating:
Use tf.train.CheckpointManager to manage checkpoints rather than manually editing the Checkpoint proto.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['serving_default']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: ./saved_models0\1561466302\saved_model.pb

Consider using a TensorFlow optimizer from `tf.train`.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tens

INFO:tensorflow:Signatures INCLUDED in export for Predict: ['serving_default']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: ./saved_models6\1561466310\saved_model.pb

Consider using a TensorFlow optimizer from `tf.train`.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['serving_default']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: ./saved_models7\1561466311\saved_model.pb

Consider using a TensorFlow optimizer from `tf.train`.
INFO:tensorflow:Signatures INCLUDED in export for C

In [256]:
new_model = tf.contrib.saved_model.load_keras_model("./saved_models0/1561466302")
new_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_136 (Dense)            (None, 30)                210       
_________________________________________________________________
dense_137 (Dense)            (None, 9)                 279       
Total params: 489
Trainable params: 489
Non-trainable params: 0
_________________________________________________________________


In [216]:
checkpoint_directory = './SimpleNN/'
# Create model checkpoint
optimizer = tf.train.AdamOptimizer(0.001)
for i in range(10):
    checkpoint_directory = './SimpleNN/'+str(i)+'/'
    checkpoint = tfe.Checkpoint(optimizer=optimizer,
                                model=models[i],
                                optimizer_step=tf.train.get_or_create_global_step())
    checkpoint.save(file_prefix=checkpoint_directory)

In [121]:

model = simple_nn(5)
optimizer = tf.train.AdamOptimizer(0.001)
checkpoint_directory = './SimpleNN/0/'
checkpoint = tfe.Checkpoint(optimizer=optimizer,
                            model=model,
                            optimizer_step=tf.train.get_or_create_global_step())
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))

<tensorflow.python.training.checkpointable.util.CheckpointLoadStatus at 0x1e691a99eb8>

In [225]:
feature = generateFeatures(allTraj,1)
num_pivot=1
df_prob = convertToProb(df_global,p)
data = pd.concat([feature, df_prob], axis=1,join="inner")
data = data.reset_index(drop=True)
num_features = 4 + (num_pivot*2)

# number of target labels 
num_labels = 9

train_dataset, train_labels, valid_dataset,valid_labels =  prepareLearning(num_features,data)      



In [226]:
train_labels

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.50250,1.000000,0.95025,0.950250,0.75125,0.50250,0.990050,0.950250,0.95025
1,0.75125,0.950250,0.90050,1.000000,0.50250,0.75125,0.950250,0.950250,0.95025
2,0.50250,0.950250,0.95025,1.000000,0.50250,0.75125,0.950250,0.990050,0.95025
3,0.50250,0.900500,0.90050,1.000000,0.25375,0.75125,0.900500,0.900500,0.90050
4,0.90050,0.950250,0.90050,1.000000,0.50250,0.90050,0.900500,0.950250,0.95025
5,0.75125,0.900500,0.75125,1.000000,0.50250,0.75125,0.900500,0.950250,0.90050
6,0.50250,0.900500,0.90050,1.000000,0.25375,0.75125,0.900500,0.900500,0.90050
7,0.99005,0.751250,0.50250,1.000000,0.25375,0.50250,0.502500,0.751250,0.75125
8,0.00500,0.005000,0.00500,0.253750,0.00500,0.00500,1.000000,0.005000,0.00500
9,0.00500,0.502500,0.50250,0.502500,0.00500,0.00500,0.502500,0.005000,0.25375


In [339]:
models[0]

<tensorflow.python.keras.engine.sequential.Sequential at 0x1e7d04efcc0>

In [340]:
train_dataset

Unnamed: 0,x_head,y_head,x_tail,y_tail,x_0,y_0
0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783
1,39.90604,39.90604,39.94996,39.94996,39.92334,39.92334
2,39.93701,39.93701,39.90619,39.90619,39.93701,39.93701
3,39.96065,39.96065,39.93499,39.93499,39.95656,39.95656
4,39.85459,39.85459,39.96066,39.96066,39.87849,39.87849
5,39.91617,39.91617,39.84390,39.84390,39.90359,39.90359
6,39.73690,39.73690,39.91618,39.91618,39.82257,39.82257
7,39.62692,39.62692,39.72508,39.72508,39.62729,39.62729
8,39.62692,39.62692,39.62692,39.62692,39.62692,39.62692
9,39.62692,39.62692,39.62692,39.62692,39.62692,39.62692


In [36]:
X_tensor = train_dataset.loc[3:3,:].values
y_tensor = train_labels.loc[3:3,:].values

y_predict = models[0].predict(train_dataset)

NameError: name 'train_dataset' is not defined

In [131]:
X_tensor

<tf.Tensor: id=13888592, shape=(1, 6), dtype=float64, numpy=array([[39.96065, 39.96065, 39.93499, 39.93499, 39.95656, 39.95656]])>

In [130]:
y_predict

array([[0.78047559, 0.89978902, 0.99998868, 0.95719961, 0.51369776]])

In [138]:
y_tensor

<tf.Tensor: id=13888593, shape=(1, 5), dtype=float64, numpy=array([[0.75125, 0.9005 , 1.     , 0.9005 , 0.25375]])>

In [331]:
def predict_dist1(df_sid_th,models, data):
    sids = np.array(df_sid_th['SID'])
    data_features = data.iloc[sids].reset_index(drop=True)
    X_ = data_features.iloc[:,1:]
    y_predict = models[0].predict(X_)
    
    f_clusters = (y_predict >= threshold_prob)[0]

    hits = []
    for i in range(5):
        if f_clusters[i] is False:
            continue
        y_i_predict = models[i+1].predict(X_)
        label_in_clusters = np.where(agglo_labels == i)[0]
        hits.append(label_in_clusters[y_i_predict[0] >= threshold_prob])
    return hits

In [311]:
df_sid_thx

Unnamed: 0,SID,Threshold
0,4.0,0.1
1,5.0,0.005
2,4.0,0.3


In [314]:
feature.iloc[df_sid_thx['SID']].reset_index(drop=True)

Unnamed: 0,tid,x_head,y_head,x_tail,y_tail,x_0,y_0
0,4,39.85459,39.85459,39.96066,39.96066,39.87849,39.87849
1,5,39.91617,39.91617,39.8439,39.8439,39.90359,39.90359
2,4,39.85459,39.85459,39.96066,39.96066,39.87849,39.87849


In [332]:
predict_dist1(df_sid_thx,models, feature)

CancelledError:  [Op:StatefulPartitionedCall]

In [297]:
feature = generateFeatures(allTraj,1)
feature

Unnamed: 0,tid,x_head,y_head,x_tail,y_tail,x_0,y_0
0,0,39.95016,39.95016,39.92219,39.92219,39.93783,39.93783
1,1,39.90604,39.90604,39.94996,39.94996,39.92334,39.92334
2,2,39.93701,39.93701,39.90619,39.90619,39.93701,39.93701
3,3,39.96065,39.96065,39.93499,39.93499,39.95656,39.95656
4,4,39.85459,39.85459,39.96066,39.96066,39.87849,39.87849
5,5,39.91617,39.91617,39.84390,39.84390,39.90359,39.90359
6,6,39.73690,39.73690,39.91618,39.91618,39.82257,39.82257
7,7,39.62692,39.62692,39.72508,39.72508,39.62729,39.62729
8,8,39.62692,39.62692,39.62692,39.62692,39.62692,39.62692
9,9,39.62692,39.62692,39.62692,39.62692,39.62692,39.62692


In [296]:
df_global.columns

Index(['SID', 'Threshold', 0, 1, 2, 3, 4], dtype='object')

In [325]:
p = 0.005

In [326]:
threshold_i = 2.5
threshold_prob = (10-threshold_i)/10 * (1-p)

X_tensor = tf.constant(train_dataset.loc[3:3,:].values)
y_tensor = tf.constant(train_labels.loc[3:3,:].values)

y_predict = models[0].predict(X_tensor).numpy()

f_clusters = (y_predict >= threshold_prob)[0]

num_hit = 0
for i in range(5):
    if f_clusters[i] is False:
        continue
    y_i_predict = models[i+1].predict(X_tensor).numpy()
    label_in_clusters = np.where(agglo_labels == i)[0]
    hits = label_in_clusters[y_i_predict[0] >= threshold_prob] 
    print(label_in_clusters[y_i_predict[0] >= threshold_prob])
    num_hit = num_hit + len(hits)
    print(num_hit)
    

InvalidArgumentError: In[0] is not a matrix
	 [[{{node MatMul_48}}]] [Op:StatefulPartitionedCall]

In [195]:
df_global = df[['SID','Threshold','GlobalID']]
df_global_limited = df_global.loc[df_global['Threshold']<=max_threshold]
df_global_limited = df_global_limited.reset_index(drop=True)

df_global_limited['hit'] = 1
df_global_threshold = df_global_limited.groupby(['SID','Threshold','GlobalID']).count().unstack().reset_index()
df_global_threshold['hit'] = df_global_threshold['hit'].notna() * 1

In [209]:
df_global_threshold.loc[((df_global_threshold['SID']==3)&(df_global_threshold['Threshold']==2.5)),:].sum(axis=1)

40    41.5
dtype: float64

In [205]:
(df_global_threshold['SID']==3)and(df_global_threshold['Threshold']==2.5)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [206]:
(df_global_threshold['Threshold']==2.5)

0        False
1        False
2        False
3        False
4        False
5        False
6        False
7         True
8        False
9        False
10       False
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18        True
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29        True
         ...  
14995    False
14996    False
14997    False
14998    False
14999     True
15000    False
15001    False
15002    False
15003    False
15004    False
15005    False
15006    False
15007    False
15008    False
15009    False
15010     True
15011    False
15012    False
15013    False
15014    False
15015    False
15016    False
15017    False
15018    False
15019    False
15020    False
15021     True
15022    False
15023    False
15024    False
Name: Threshold, Length: 15025, dtype: bool

In [180]:
agglo_labels

array([1, 1, 2, 1, 1, 2, 1, 1, 3, 3, 2, 2, 3, 3, 2, 2, 3, 2, 2, 2, 0, 2,
       0, 0, 2, 2, 2, 1, 1, 2, 4, 1, 3, 1, 1, 3, 1, 3, 2, 3, 3, 2, 1, 3,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 3, 2,
       3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 3, 0, 0], dtype=int64)

In [283]:
a = df_global['SID'].unique()

In [288]:
s = a[np.random.choice(len(a), size=100, replace=False)]

In [289]:
s

array([  63,  710,  282,  683, 1456, 1460, 1698, 1858, 1028, 1934,  682,
        945,  879, 1611, 1676, 1681, 1735, 1856, 1761, 1031, 1967,  685,
        264, 1620,  824, 1961,  601,  677,  124, 1437,   57, 1779,   67,
       1467, 1533, 1847, 1092, 1738,  212, 1845, 1622, 1947,  325, 1831,
       1978,  430,   81,  462, 1806, 1527, 1455, 1424,  304, 1640,  185,
        657,  205, 1597,  756, 1033, 1286, 1438,  440, 1563, 1767, 1512,
       1579,  386, 1491,  232, 1494,  371,   91,  585,  312,  532, 1987,
        182,  414,  503, 1998, 1247,  491, 1423,  347,  979, 1878, 1874,
       1749,  536, 1144,  280,  438,  770, 1628, 1921, 1706, 1747,  287,
        164], dtype=int64)

In [185]:
max(df_main['SID'].unique())


1999

In [195]:
q = feature.loc[feature['tid']>1999,'tid'].values
q[np.random.choice(len(q), size=100, replace=False)]

array([7410, 9788, 8402, 7830, 2109, 3231, 2310, 2546, 7930, 6048, 4933,
       8119, 5249, 5917, 3846, 5800, 4692, 8151, 5794, 7334, 8282, 9002,
       8457, 7711, 7566, 6381, 8112, 5551, 5285, 4796, 6130, 8220, 5703,
       4469, 3622, 4053, 6010, 2250, 6788, 4842, 7986, 2462, 6920, 4191,
       2091, 6108, 5615, 7755, 6803, 9195, 3419, 3326, 9051, 3037, 5152,
       7640, 2699, 5501, 8978, 2353, 2875, 5257, 7147, 2731, 8475, 6949,
       7180, 3727, 9837, 8586, 9378, 9045, 7425, 9906, 7714, 4442, 4630,
       3035, 3473, 4038, 3098, 2321, 7404, 3216, 6867, 7258, 5333, 7803,
       5040, 4584, 8751, 6738, 8889, 5253, 3225, 2925, 6170, 8754, 4567,
       3790])

In [307]:
a = np.array([1,2,4,1,3])
np.isin(a,[5,6,1,3])

array([ True, False, False,  True,  True])

In [294]:
import sys
print(sys.version)

3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 18:50:55) [MSC v.1915 64 bit (AMD64)]


In [295]:
dfx = pd.DataFrame(None, columns=['a','b'])
dfx

Unnamed: 0,a,b
