In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from shapely.geometry import Polygon, Point, LinearRing, box
import matplotlib.cm as cmx
# import seaborn as sns; sns.set()
import math
%matplotlib notebook

In [None]:
poly = box(3750901.5068,-19268905.6133, 3770901.5068, -19208905.6133)

def city_distance(x, y):
    if ((x >= 3750901.5068) & (x <= 3770901.5068) & (y >= -19268905.6133) & (y <= -19208905.6133)):
        point = Point(x, y)
        pol_ext = LinearRing(poly.exterior.coords)
        d = pol_ext.project(point)
        p = pol_ext.interpolate(d)
        closest_point_coords = list(p.coords)[0]
        
        return -point.distance(Point(closest_point_coords))
    else:
        point = Point(x, y)
        pol_ext = LinearRing(poly.exterior.coords)
        d = pol_ext.project(point)
        p = pol_ext.interpolate(d)
        closest_point_coords = list(p.coords)[0]
        
        return point.distance(Point(closest_point_coords))

In [None]:
df_train = pd.read_csv("../../data/data_train.csv")
df_test = pd.read_csv("../../data/data_test.csv")
df_train['time_entry'] = pd.to_timedelta(df_train['time_entry'])
df_train['time_exit'] = pd.to_timedelta(df_train['time_exit'])
df_test['time_entry'] = pd.to_timedelta(df_test['time_entry'])
df_test['time_exit'] = pd.to_timedelta(df_test['time_exit'])
df_train['duration'] = df_train['time_exit'] - df_train['time_entry']
df_train['duration'] = df_train['duration'].dt.total_seconds()
df_test['duration'] = df_test['time_exit'] - df_test['time_entry']
df_test['duration'] = df_test['duration'].dt.total_seconds()
df_train['distance'] = df_train.apply(lambda row: Point(row['x_entry'],row['y_entry']).
                                      distance(Point(row['x_exit'],row['y_exit'])), axis=1)
df_train['velocity'] = df_train.apply(lambda row: (row['distance']/row['duration'])
                                      if row['duration']!= 0.0 else 0.0, axis=1)
df_test['distance'] = df_test.apply(lambda row: Point(row['x_entry'],row['y_entry']).
                                      distance(Point(row['x_exit'],row['y_exit'])) if ~np.isnan(row['x_exit'])
                                     else np.nan, axis=1)
df_test['velocity'] = df_test.apply(lambda row: (row['distance']/row['duration'])
                                      if (row['duration']!= 0.0 and ~np.isnan(row['x_exit'])) else 0.0, axis=1)

In [None]:
df_test['city_distance'] = df_test.apply(lambda row: city_distance(row['x_entry'], row['y_entry']), axis = 1)
df_train['city_distance'] = df_train.apply(lambda row: city_distance(row['x_entry'], row['y_entry']), axis = 1)
print("*****City Distance done*****")
df_train['ratio'] = df_train.apply(lambda row: (row['city_distance']/row['duration'])
                                      if row['duration']!= 0.0 else 0.0, axis=1)
df_test['ratio'] = df_test.apply(lambda row: (row['city_distance']/row['duration'])
                                      if row['duration']!= 0.0 else 0.0, axis=1)
print("*****Ratio done*****")
df_train['label']= -1
df_train['label'] = df_train.apply(lambda row: 1 
                                 if ((row['x_exit'] >= 3750901.5068) & (row['x_exit'] <= 3770901.5068) & 
                                     (row['y_exit'] >= -19268905.6133) & (row['y_exit'] <= -19208905.6133)) 
                                 else 0, axis=1)
df_test['label']= -1
df_test['label'] = df_test.apply(lambda row: 1 
                                 if ((row['duration'] == 0.0)&(math.isnan(row['x_exit'])) & (math.isnan(row['y_exit'])) &
                                      (row['x_entry'] >= 3750901.5068) & (row['x_entry'] <= 3770901.5068) & 
                                      (row['y_entry'] >= -19268905.6133) & (row['y_entry'] <= -19208905.6133)) 
                                 else row['label'], axis=1)
df_test['label'] = df_test.apply(lambda row: 0 
                                 if ((row['duration'] == 0.0)&(math.isnan(row['x_exit'])) & (math.isnan(row['y_exit'])) &
                                     (row['label'] == -1))
                                 else row['label'], axis=1)
print("*****Label done*****")

In [5]:
df_train.to_pickle('df_train')
df_test.to_pickle('df_test')

In [6]:
test = df_test[(df_test['duration'] != 0.0) & (df_test.x_exit.isnull()) & (df_test.y_exit.isnull())]
# start_time = '0 days 15:00:00'
# end_time = '0 days 16:00:00'
# dd = df_train[(df_train['time_exit'] >= start_time) & (df_train['time_exit'] <= end_time)]
# dd[(dd['duration'] == 0.0) & (dd['x_exit'] >= 3750901.5068) & 
#          (dd['x_exit'] <= 3770901.5068) & (dd['y_exit'] >= -19268905.6133) & 
#          (dd['y_exit'] <= -19208905.6133)].label.value_counts()
# dd[(dd['duration'] == 0.0) & ((dd['x_exit'] < 3750901.5068) | 
#          (dd['x_exit'] > 3770901.5068)) & ((dd['y_exit'] < -19268905.6133) | 
#          (dd['y_exit'] > -19208905.6133))].label.value_counts()
# dd_1 = dd[(dd['duration'] != 0.0) & (dd['x_exit'] >= 3750901.5068) & 
#          (dd['x_exit'] <= 3770901.5068) & (dd['y_exit'] >= -19268905.6133) & 
#          (dd['y_exit'] <= -19208905.6133)]
train = df_train[df_train['duration'] != 0.0]

In [None]:
train.head()

In [7]:
train.to_pickle('train')
test.to_pickle('test')

In [None]:
df = df_train.groupby('hash')
df_list = list(df)
df_2 = df_test.groupby('hash')
df_2_list = list(df_2)
# df.to_pickle('df_list')
# df_2.to_pickle('df_2_list')

In [2]:
df_train = pd.read_pickle('df_train')
df_test = pd.read_pickle('df_test')

In [4]:
df_train['count'] = df_train.groupby('hash')['hash'].transform('count')
df_test['count'] = df_test.groupby('hash')['hash'].transform('count')

In [None]:
train.shape

In [None]:
len(df_train.groupby(['hash']).agg('count')['trajectory_id'].values)

In [8]:
start_time = '0 days 15:00:00'
end_time = '0 days 16:00:00'
# df_train = pd.read_pickle('df_train')
# df_test = pd.read_pickle('df_test')
train = pd.read_pickle('train')
test = pd.read_pickle('test')

# df_train['hour'] = df_train['time_entry'] / np.timedelta64(1, 'h')
# df_test['hour'] = df_test['time_entry'] / np.timedelta64(1, 'h')

# test['count'] = df_test.loc[df_test['hash'].isin(test.hash.values)].groupby(['hash']).agg('count')['trajectory_id'].values
# train['count'] = df_train.loc[df_train['hash'].isin(train.hash.values)].groupby(['hash']).agg('count')['trajectory_id'].values

all_test = df_test.loc[df_test['hash'].isin(test.hash.values)]
all_train = df_train.loc[df_train['hash'].isin(train.hash.values)]
test_start_not_in_city = test[~(((test['x_entry'] >= 3750901.5068) &
         (test['x_entry'] <= 3770901.5068)) & ((test['y_entry'] >= -19268905.6133) &
         (test['y_entry'] <= -19208905.6133)))]
test_start_in_city = test[((test['x_entry'] >= 3750901.5068) &
         (test['x_entry'] <= 3770901.5068)) & ((test['y_entry'] >= -19268905.6133) &
         (test['y_entry'] <= -19208905.6133))]
train_not_in_city_1 = train[(~((train['x_entry'] >= 3750901.5068) &
         (train['x_entry'] <= 3770901.5068) & (train['y_entry'] >= -19268905.6133) &
         (train['y_entry'] <= -19208905.6133))&(train['label'] == 1))]
train_not_in_city_0 = train[(~((train['x_entry'] >= 3750901.5068) &
         (train['x_entry'] <= 3770901.5068) & (train['y_entry'] >= -19268905.6133) &
         (train['y_entry'] <= -19208905.6133))&(train['label'] == 0))]
train_in_city_0 = train[(((train['x_entry'] >= 3750901.5068) &
         (train['x_entry'] <= 3770901.5068) & (train['y_entry'] >= -19268905.6133) &
         (train['y_entry'] <= -19208905.6133))&(train['label'] == 0))]
train_in_city_1 = train[(((train['x_entry'] >= 3750901.5068) &
         (train['x_entry'] <= 3770901.5068) & (train['y_entry'] >= -19268905.6133) &
         (train['y_entry'] <= -19208905.6133))&(train['label'] == 1))]

In [None]:
tt = df_train[(df_train['x_exit'] >= 3750901.5068) & (df_train['x_exit'] <= 3770901.5068) & 
              (df_train['y_exit'] >= -19268905.6133) & (df_train['y_exit'] <= -19208905.6133)]
tt2 = df_test[(df_test['x_exit'] >= 3750901.5068) & (df_test['x_exit'] <= 3770901.5068) & 
              (df_test['y_exit'] >= -19268905.6133) & (df_test['y_exit'] <= -19208905.6133)]

In [None]:
print(len(all_test.groupby(['hash']).filter(lambda x: len(x)==1).groupby('hash')))
print(len(all_test.groupby(['hash']).filter(lambda x: len(x)==2).groupby('hash')))
print(len(all_test.groupby(['hash']).filter(lambda x: len(x)==3).groupby('hash')))
print(len(all_test.groupby(['hash']).filter(lambda x: len(x)==4).groupby('hash')))
print(len(all_test.groupby(['hash']).filter(lambda x: len(x)==5).groupby('hash')))

In [None]:
print(len(all_test.groupby(['hash']).filter(lambda x: len(x)==6).groupby('hash')))
print(len(all_test.groupby(['hash']).filter(lambda x: len(x)==7).groupby('hash')))
print(len(all_test.groupby(['hash']).filter(lambda x: len(x)==8).groupby('hash')))
print(len(all_test.groupby(['hash']).filter(lambda x: len(x)==9).groupby('hash')))
print(len(all_test.groupby(['hash']).filter(lambda x: len(x)==10).groupby('hash')))
print(len(all_test.groupby(['hash']).filter(lambda x: len(x)==11).groupby('hash')))

In [None]:
# plt.clf()
# (all_test.groupby(['hash']).agg('count')['trajectory_id'].hist(bins = 20))

# Plottings

In [None]:
def plot_trajectory(df_sample):
    rect = patches.Rectangle((3750901.5068,-19208905.6133), (3770901.5068 - 3750901.5068),
                             (- 19268905.6133 + 19208905.6133),
                             linewidth=1,edgecolor='g',fill = False,hatch = '\\\\\\', label = 'city center')
    city = plt.gca().add_patch(rect)
    for index, row in df_sample.iterrows():
#         lines = plt.plot(df_sample.x_exit,df_sample.y_exit, label='Lines', color = 'g')
        arrows = plt.arrow(row['x_entry'], row['y_entry'], row['x_exit'] - row['x_entry'], row['y_exit'] - row['y_entry'],
                  label ='Trajectory', color = 'b')
    entries = plt.scatter(df_sample.x_entry,df_sample.y_entry, label='Entry point', color = 'b')
    exits = plt.scatter(df_sample.x_exit,df_sample.y_exit, label='Exit point', color = 'r')
#     lines = plt.plot(df_sample.x_exit,df_sample.y_exit, label='Lines', color = 'g')
    plt.legend(handles=[city, arrows])
    plt.grid(True)
    plt.show()

In [None]:
def plot_fig(train):
    fig, ax = plt.subplots(2,1)
#     rect = patches.Rectangle((3750901.5068,-19208905.6133), (3770901.5068 - 3750901.5068),
#                                  (- 19268905.6133 + 19208905.6133),
#                                  linewidth=1,edgecolor='g',fill = False,hatch = '\\\\\\', label = 'city center')
    ax[0].scatter(train.x_entry,train.y_entry, label='Entry point', color = 'b', s = 0.05)
    ax[0].axis('off')
    ax[1].scatter(train.x_exit,train.y_exit, label='Exit point', color = 'r', s = 0.05)
    ax[1].axis('off')
#     ax[0].gca().add_patch(rect)
#     ax[1].gca().add_patch(rect)

In [None]:
def plot_fig_entry(train):
#     cmap = sns.cubehelix_palette(8, start=.5, rot=-.75, as_cmap=True)
    rect = patches.Rectangle((3750901.5068,-19208905.6133), (3770901.5068 - 3750901.5068),
                                 (- 19268905.6133 + 19208905.6133),
                                 linewidth=1,edgecolor='g',fill = False, label = 'city center')
    entries = plt.scatter(train.x_entry,train.y_entry, label='Entry point', c = 'b', s=0.05)
#     exits = plt.scatter(train.x_exit,train.y_exit, label='Exit point', c = train.velocity, s = 0.05, cmap=cmap)
#     for index, row in train.iterrows():
#         arrows = plt.arrow(row['x_entry'], row['y_entry'], row['x_exit'] - row['x_entry'], row['y_exit'] - row['y_entry'],
#                   label ='Not in City', color = 'b', linewidth = 0.01)
    city = plt.gca().add_patch(rect)
    plt.legend(handles = [city])
#     plt.colorbar(entries)
    plt.grid(True)
#     plt.savefig('foo.png', dpi=1200)
    plt.show()

In [None]:
def plot_fig_exits(train):
    cmap = sns.cubehelix_palette(8, start=.5, rot=-.75, as_cmap=True)
    rect = patches.Rectangle((3750901.5068,-19208905.6133), (3770901.5068 - 3750901.5068),
                                 (- 19268905.6133 + 19208905.6133),
                                 linewidth=1,edgecolor='g',fill = False, label = 'city center')
#     entries = plt.scatter(train.x_entry,train.y_entry, label='Entry point', c = train.velocity, s=0.05, cmap=cmap)
    exits = plt.scatter(train.x_exit,train.y_exit, label='Exit point', c = 'r', s = 0.05)
#     for index, row in train.iterrows():
#         arrows = plt.arrow(row['x_entry'], row['y_entry'], row['x_exit'] - row['x_entry'], row['y_exit'] - row['y_entry'],
#                   label ='Not in City', color = 'b', linewidth = 0.01)
    city = plt.gca().add_patch(rect)
    plt.legend(handles = [city])
    plt.colorbar(entries)
    plt.grid(True)
#     plt.savefig('foo.png', dpi=1200)
    plt.show()

In [None]:
def plot_test_fig(sample):
    rect = patches.Rectangle((3750901.5068,-19208905.6133), (3770901.5068 - 3750901.5068),
                                 (- 19268905.6133 + 19208905.6133),
                                 linewidth=1,edgecolor='g',fill = False,hatch = '\\\\\\', label = 'city center')
    entries = plt.scatter(sample.x_entry,sample.y_entry, label='Entry point', color = 'b', s = 0.002)
    city = plt.gca().add_patch(rect)
    plt.legend(handles = [city,entries])
    plt.title("All test starting point in map")
    plt.grid(True)
    plt.savefig('foo1.png', dpi=1200)
    plt.show()

In [None]:
plot_trajectory(df_list[100][1])

In [None]:
def plot_point(row):
    rect = patches.Rectangle((3750901.5068,-19208905.6133), (3770901.5068 - 3750901.5068),
                             (- 19268905.6133 + 19208905.6133),
                             linewidth=1,edgecolor='g',fill = False,hatch = '\\\\\\', label = 'city center')
    city = plt.gca().add_patch(rect)
    entries = plt.scatter(row['x_entry'],row['y_entry'], label='Entry point', color = 'b')
    plt.legend(handles=[city, entries])
    plt.grid(True)
    plt.show()

In [None]:
def plot_row(row):
    rect = patches.Rectangle((3750901.5068,-19208905.6133), (3770901.5068 - 3750901.5068),
                             (- 19268905.6133 + 19208905.6133),
                             linewidth=1,edgecolor='g',fill = False, label = 'city center')
    city = plt.gca().add_patch(rect)
#     arrows = plt.plot([row['x_entry'], row['x_exit']], [row['y_entry'], row['y_exit']], 
#                       label ='Trajectory', color = 'b')
    entries = plt.scatter(row['x_entry'],row['y_entry'], label='Entry point', color = 'b')
    exits = plt.scatter(row['x_exit'],row['y_exit'], label='Exit point', color = 'r')
    
    point = Point(row['x_entry'], row['y_entry'])
    pol_ext = LinearRing(poly.exterior.coords)
    d = pol_ext.project(point)
    p = pol_ext.interpolate(d)
    closest_point_coords = list(p.coords)[0]
    print(p.distance(point), poly.boundary.distance(point), p)
    
    cityClose = plt.scatter(closest_point_coords[0],closest_point_coords[1], label='closest boundary', color = 'y')
#     plt.legend(handles=[city,entries, exits, cityClose])
    plt.grid(True)
    plt.show()

# Test Explore

In [None]:
df_test[(df_test.x_exit.isnull()) & (df_test.y_exit.isnull())].duration.describe()

In [None]:
df_test[(df_test['duration'] > 0) & (df_test['duration'] < 5000) & (df_test.x_exit.isnull()) & (df_test.y_exit.isnull())].duration.describe()

In [None]:
plt.scatter(df_train[df_train['velocity'] != 0.0].distance, df_train[df_train['velocity'] != 0.0].duration)

In [None]:
df_test[df_test.x_exit.isnull()]['label'].value_counts()

In [None]:
df_test['label'] = df_test['label'].replace(-1,0)

In [None]:
df_test[df_test.x_exit.isnull()][['trajectory_id', 'label']].to_csv('output.csv', index=False)

# Train explore

In [None]:
df_train[(df_train['duration'] == 0.0) & ((df_train['x_entry'] != df_train['x_exit']) |
         (df_train['y_entry'] != df_train['y_exit']))]

In [None]:
df_train[(df_train['duration'] == 0.0) & (df_train['x_entry'] == df_train['x_exit']) &
         (df_train['y_entry'] == df_train['y_exit'])].shape[0]

In [None]:
df_train.label.value_counts()

In [None]:
df_train[(df_train['duration'] == 0.0) & (df_train['x_exit'] >= 3750901.5068) & 
         (df_train['x_exit'] <= 3770901.5068) & (df_train['y_exit'] >= -19268905.6133) & 
         (df_train['y_exit'] <= -19208905.6133)]

# Explorations

In [None]:
ax = sns.scatterplot(x="duration", y="city_distance", hue = 'label', data=dd[(dd['duration']!=0)&(dd['label'] == 1)])

In [None]:
plt.clf()
plot_trajectory(dd_1)

In [None]:
plt.clf()
plot_fig(train[train['label'] == 0])

In [None]:
plt.clf()
plot_test_fig(df_test[(df_test['duration'] > 0) & (df_test.x_exit.isnull()) & (df_test.y_exit.isnull())])

In [None]:
plt.scatter(df_train[df_train['duration']!= 0.0].duration, df_train[df_train['duration']!= 0.0].distance,
            label='Entry point', color = 'b', s= 0.01)

In [None]:
plt.clf()
df_train[(df_train['velocity'] != 0.0)&(df_train['velocity'] < 100.0)].velocity.hist()

In [None]:
plt.scatter(test.city_distance, test.duration, color = 'b', s = 0.01)

In [None]:
plt.clf()
plt.scatter(train[train['label'] == 0].city_distance, train[train['label'] == 0].duration, color = 'r', s = 0.0001)
plt.scatter(train[train['label'] == 1].city_distance, train[train['label'] == 1].duration, color = 'b', s = 0.0001)

In [None]:
ax = sns.scatterplot(x="city_distance", y="duration", hue = 'label', data=train, s=5)

In [None]:
print(train_in_city_1.shape[0])
plot_fig(train_not_in_city_0[train_not_in_city_0['city_distance']<16961.917751])

In [None]:
train_not_in_city_0[train_not_in_city_0['city_distance']<16961.917751]

In [None]:
plt.clf()
plot_trajectory(df_train[df_train['hash'] == '0000cf177130469eeac79f67b6bcf3df_9'])

# Write to file

In [None]:
# df_test['label']= -1
# df_test['label'] = df_test.apply(lambda row: 1 
#                                  if ((row['duration'] == 0.0)&(math.isnan(row['x_exit'])) & (math.isnan(row['y_exit'])) &
#                                       (row['x_entry'] >= 3750901.5068) & (row['x_entry'] <= 3770901.5068) & 
#                                       (row['y_entry'] >= -19268905.6133) & (row['y_entry'] <= -19208905.6133)) 
#                                  else row['label'], axis=1)
# df_test['label'] = df_test.apply(lambda row: 0 
#                                  if ((row['duration'] == 0.0)&(math.isnan(row['x_exit'])) & (math.isnan(row['y_exit'])) &
#                                      (row['label'] == -1))
#                                  else row['label'], axis=1)

df_test['label'] = df_test.apply(lambda row: 0 
                                 if ((row['duration'] != 0.0)&(math.isnan(row['x_exit'])) &
                                     (math.isnan(row['y_exit'])) &
                                     (row['label'] == -1) &
                                     ~(((row['x_entry'] >= 3750901.5068) &
                                                               (row['x_entry'] <= 3770901.5068)) &
                                       ((row['y_entry'] >= -19268905.6133) &
                                        (row['y_entry'] <= -19208905.6133))))
                                 else row['label'], axis=1)
df_test['label'] = df_test.apply(lambda row: 1
                                 if ((row['duration'] != 0.0)&(math.isnan(row['x_exit'])) &
                                     (math.isnan(row['y_exit'])) &
                                     (row['label'] == -1) &
                                     (((row['x_entry'] >= 3750901.5068) &
                                                               (row['x_entry'] <= 3770901.5068)) &
                                       ((row['y_entry'] >= -19268905.6133) &
                                        (row['y_entry'] <= -19208905.6133))))
                                 else row['label'], axis=1)

In [None]:
df_test[(df_test.x_exit.isnull())&(df_test.y_exit.isnull())].shape

In [None]:
df_test[(df_test.x_exit.isnull())&(df_test.y_exit.isnull())][['trajectory_id', 'label']].to_csv('output3.csv', index=False)

In [None]:
plt.clf()
plot_fig(train[(~((train['x_entry'] >= 3750901.5068) &
         (train['x_entry'] <= 3770901.5068) & (train['y_entry'] >= -19268905.6133) &
         (train['y_entry'] <= -19208905.6133))&(train['label'] == 1))])

# Clustering

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from collections import Counter

In [None]:
X=train_in_city_1[train_in_city_1['duration'] != 0.0].loc[:,['x_exit','y_exit']]

In [None]:
id_n=2
kmeans = KMeans(n_clusters=id_n, random_state=0).fit(X)
id_label=kmeans.labels_

In [None]:
ptsymb = np.array(['b.','r.','m.','g.','c.','k.','b*','r*','m*','r^']);
plt.figure(figsize=(12,12))
plt.ylabel('Longitude', fontsize=12)
plt.xlabel('Latitude', fontsize=12)
for i in range(id_n):
    cluster=np.where(id_label==i)[0]
    plt.plot(X.x_exit[cluster].values,X.y_exit[cluster].values,ptsymb[i])
plt.show()

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.svm import SVC

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
dbscan = DBSCAN(eps=10000.0, min_samples = 10)
clusters = dbscan.fit_predict(X_scaled)

In [None]:
# plot the cluster assignments
plt.scatter(X['x_exit'], X['y_exit'], c=clusters, cmap="plasma", s =0.01)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
# plt.colorbar(clusters)

# Classification

In [120]:
print(train_in_city_0.shape)
print(train_in_city_1.shape)
print(train_not_in_city_0.shape)
print(train_not_in_city_1.shape)

(23902, 20)
(83989, 20)
(243258, 20)
(30036, 20)


In [178]:
train_data_in_city = pd.concat([train_in_city_0[['x_entry','y_entry','duration','city_distance','count','hour','label']],
                        train_in_city_1[['x_entry','y_entry','duration','city_distance','count','hour','label']]])
train_data_not_in_city = pd.concat([train_not_in_city_0[['x_entry','y_entry','duration','city_distance','count','hour','label']],
                        train_not_in_city_1[['x_entry','y_entry','duration','city_distance','count','hour','label']]])


# train_data_in_city = pd.concat([train_in_city_0[['x_entry','y_entry','duration','city_distance','count','hour','label']],
#                         train_in_city_1[['x_entry','y_entry','duration','city_distance','count','hour','label']]])
# train_data_not_in_city = pd.concat([train_not_in_city_0[['x_entry','y_entry','duration','city_distance','label']],
#                         train_not_in_city_1[['x_entry','y_entry','duration','city_distance','label']]])

# train_data_in_city = pd.concat([train_in_city_0[['x_entry','y_entry','duration','city_distance','count','hour','label']],
#                         train_in_city_1[['x_entry','y_entry','duration','city_distance','count','hour','label']].sample(n=30036, random_state=1)])
# # train_data_not_in_city = pd.concat([train_not_in_city_0[['x_entry','y_entry','duration','city_distance','count','label']].sample(n=3957, random_state=1),
#                         train_not_in_city_1[['x_entry','y_entry','duration','city_distance','count','label']]])

In [141]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

In [179]:
x_1 = train_data_in_city.iloc[:, :-1].values
y_1 = train_data_in_city.iloc[:, -1].values
x_2 = train_data_not_in_city.iloc[:, :-1].values
y_2 = train_data_not_in_city.iloc[:, -1].values
# sc_1 = StandardScaler()  
# sc_2 = StandardScaler()  
# x_1 = sc_1.fit_transform(x_1)
# x_1 = sc_2.fit_transform(x_1)  

In [180]:
xTrain, xTest, yTrain, yTest = train_test_split(x_1, y_1, test_size = 0.1, random_state = 0)
# clf_1 = GaussianNB()
clf_1 = RandomForestClassifier(n_jobs = -1, n_estimators=100,max_depth = 40, random_state=42)
# clf_1 = AdaBoostClassifier(n_estimators=1000)
# clf_1 = SVC(gamma=2, C=1)
# clf_1 = MLPClassifier(activation= 'relu',hidden_layer_sizes=(3,10,10))
# clf_1 = GaussianProcessClassifier(1.0 * RBF(1.0))
# clf_1 = LogisticRegression(penalty ='l2',dual = True)
# clf_1 = DecisionTreeClassifier(max_depth=40)
# clf_1 = KNeighborsClassifier(10)
clf_1.fit(xTrain, yTrain)
yPred = clf_1.predict(xTest)
print(classification_report(yTest, yPred))

              precision    recall  f1-score   support

           0       0.57      0.37      0.45      2433
           1       0.83      0.92      0.87      8357

   micro avg       0.80      0.80      0.80     10790
   macro avg       0.70      0.64      0.66     10790
weighted avg       0.77      0.80      0.78     10790



In [181]:
for feature in zip(yTest, clf_1.feature_importances_):
    print(feature)

(0, 0.15561204377185395)
(1, 0.1594745770249206)
(1, 0.2926006148093666)
(1, 0.15750693553807016)
(1, 0.07886886105198077)
(1, 0.15593696780380786)


In [157]:
xTrain, xTest, yTrain, yTest = train_test_split(x_2, y_2, test_size = 0.1, random_state = 0)
# clf_2 = GaussianNB()
# clf_2 = RandomForestClassifier(n_jobs = -1, n_estimators=1000,max_depth = 40)
# clf_2 = AdaBoostClassifier(n_estimators=1000)
# clf_2 = MLPClassifier(alpha=1)
# clf_2 = GaussianProcessClassifier(1.0 * RBF(1.0))
# clf_2 = SVC(gamma=2, C=1)
# clf_2 = LogisticRegression()
clf_2 = DecisionTreeClassifier(max_depth=60, random_state=42)
# clf_2 = KNeighborsClassifier(7)
# clf_2 = MLPClassifier(activation= 'relu',hidden_layer_sizes=(3,10,10))
clf_2.fit(xTrain, yTrain)
yPred = clf_2.predict(xTest)
print(classification_report(yTest, yPred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91     24285
           1       0.27      0.28      0.28      3045

   micro avg       0.84      0.84      0.84     27330
   macro avg       0.59      0.59      0.59     27330
weighted avg       0.84      0.84      0.84     27330



In [158]:
for feature in zip(yTest, clf_2.feature_importances_):
    print(feature)

(0, 0.21993258373823257)
(0, 0.133661866665599)
(0, 0.22946301700164978)
(0, 0.13997277699415314)
(0, 0.08850666805591205)
(0, 0.18846308754445354)


In [182]:
# test_start_in_city['label'] =  clf_1.predict(sc_1.fit_transform(test_start_in_city[['x_entry','y_entry','duration','city_distance','count','hour']].values))
# test_start_not_in_city['label'] =  clf_2.predict(sc_2.fit_transform(test_start_not_in_city[['x_entry','y_entry','duration','city_distance','count','hour']].values))

test_start_in_city['label'] =  clf_1.predict(test_start_in_city[['x_entry','y_entry','duration','city_distance','count','hour']].values)
test_start_not_in_city['label'] =  clf_2.predict(test_start_not_in_city[['x_entry','y_entry','duration','city_distance','count','hour']].values)


# test_start_in_city['label'] =  clf_1.predict(sc_1.fit_transform(test_start_in_city[['x_entry','y_entry','duration','city_distance','count','hour']].values))
# test_start_not_in_city['label'] =  clf_2.predict(sc_2.fit_transform(test_start_not_in_city[['x_entry','y_entry','duration','city_distance']].values))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [99]:
test_start_not_in_city['label'] = test_start_not_in_city.apply(lambda row: 1 
                                 if (row['city_distance']<1000)
                                 else row['label'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [183]:
out = pd.concat([df_test[df_test['label']!=-1],test_start_in_city,
                 test_start_not_in_city])

In [175]:
train.label.value_counts()[1]/train.label.value_counts()[0]

0.4268041622997455

In [184]:
out.label.value_counts()[1]/out.label.value_counts()[0]

0.3883595691797846

In [185]:
test_start_in_city.label.value_counts()

1    3319
0    1364
Name: label, dtype: int64

In [186]:
test_start_not_in_city.label.value_counts()

0    10796
1     1195
Name: label, dtype: int64

In [187]:
test_start_not_in_city.label.value_counts()[1]/test_start_not_in_city.label.value_counts()[0]

0.11068914412745462

In [188]:
test_start_in_city.label.value_counts()[1]/test_start_in_city.label.value_counts()[0]

2.4332844574780057

In [189]:
train_not_in_city_1.label.value_counts()[1]/train_not_in_city_0.label.value_counts()[0]

0.12347384258688306

In [190]:
train_in_city_1.label.value_counts()[1]/train_in_city_0.label.value_counts()[0]

3.513890051041754

In [191]:
out.label.value_counts()

0    24140
1     9375
Name: label, dtype: int64

In [192]:
out.sort_values('trajectory_id',ascending=True)[['trajectory_id','label']].to_csv('output20.csv', index=False)

# Last

In [None]:
test_start_in_city.duration.describe()

In [None]:
plot_fig_entry(test_start_in_city[test_start_in_city['duration']>500])

In [None]:
test_start_in_city[test_start_in_city['duration']<500].shape