In [157]:
import numpy as np
import pandas as pd

In [158]:
df = pd.read_csv('Lekagul Sensor Data.csv', sep = ',')

In [159]:
df

Unnamed: 0,Timestamp,car-id,car-type,gate-name
0,2015-05-01 00:43:28,20154301124328-262,4,entrance3
1,2015-05-01 01:03:48,20154301124328-262,4,general-gate1
2,2015-05-01 01:06:24,20154301124328-262,4,ranger-stop2
3,2015-05-01 01:09:25,20154301124328-262,4,ranger-stop0
4,2015-05-01 01:12:36,20154301124328-262,4,general-gate2
...,...,...,...,...
171472,2016-05-31 23:40:13,20161031111001-854,6,ranger-stop2
171473,2016-05-31 23:42:08,20165831105856-579,1,general-gate4
171474,2016-05-31 23:43:13,20161031111001-854,6,general-gate1
171475,2016-05-31 23:49:45,20165831105856-579,1,general-gate7


In [160]:
df["gate-name"].replace(r'(entrance\d*)' , 'entrance', regex=True, inplace=True)
df["gate-name"].replace(r'(general-gate\d*)' , 'general gate', regex=True, inplace=True)
df["gate-name"].replace(r'(ranger-stop\d*)' , 'ranger-stop', regex=True, inplace=True)
df["gate-name"].replace(r'(camping\d*)' , 'camping', regex=True, inplace=True)
df["gate-name"].replace(r'(gate\d*)' , 'gate', regex=True, inplace=True)



In [161]:
df = df[['car-id', 'gate-name', 'Timestamp', 'car-type']]
df

Unnamed: 0,car-id,gate-name,Timestamp,car-type
0,20154301124328-262,entrance,2015-05-01 00:43:28,4
1,20154301124328-262,general gate,2015-05-01 01:03:48,4
2,20154301124328-262,ranger-stop,2015-05-01 01:06:24,4
3,20154301124328-262,ranger-stop,2015-05-01 01:09:25,4
4,20154301124328-262,general gate,2015-05-01 01:12:36,4
...,...,...,...,...
171472,20161031111001-854,ranger-stop,2016-05-31 23:40:13,6
171473,20165831105856-579,general gate,2016-05-31 23:42:08,1
171474,20161031111001-854,general gate,2016-05-31 23:43:13,6
171475,20165831105856-579,general gate,2016-05-31 23:49:45,1


In [162]:
print(df["gate-name"].unique())
print(df["car-type"].unique())
print(df["car-id"].nunique())


['entrance' 'general gate' 'ranger-stop' 'camping' 'ranger-base' 'gate']
['4' '1' '3' '5' '2' '2P' '6']
18708


# insight1

Approximately 3200 cars (17%) are “pass-throughs” – they drive through
the Preserve without stopping at campsites or ranger-related places 
along the way.


In [163]:
all_cars = df["car-id"].unique()
non_pass_through = df.loc[df['gate-name'].isin(["ranger-stop", "ranger-base", "camping"]), 'car-id'].unique()
pass_through =  np.array(list(set(all_cars) - set(non_pass_through)))

#new df
df_insight1 = df.loc[df['car-id'].isin(pass_through)]

#sanity-check
df_insight1["car-id"].nunique()



3192

In [164]:
#sample
pass_through_sample = np.random.choice(pass_through, 170, replace = False)
print(len(pass_through_sample))
#new df
df_insight1_sample = df.loc[df['car-id'].isin(pass_through_sample)]

170


# insight2

A majority of the traffic (around 68.5%) passed by at least one ranger stop, and they may be general traffic or ranger cars.
Approximately 1000 (5.5%) cars are ranger cars who left their bases to various gates that were not accessible to general traffic.


In [165]:
all_cars = df["car-id"].unique()
taken_ranger_stops = df.loc[df['gate-name'].isin(["ranger-stop"]), 'car-id'].unique()
print(len(taken_ranger_stops))
#new df
df_insight2 = df.loc[df['car-id'].isin(taken_ranger_stops)]


12808


In [166]:
#sample
taken_ranger_stops_sample = np.random.choice(taken_ranger_stops, 685, replace = False)
print(len(taken_ranger_stops_sample))
#new df
df_insight2_sample = df.loc[df['car-id'].isin(taken_ranger_stops_sample)]


685


In [167]:
all_cars = df["car-id"].unique()
crossed_gate = df.loc[df['gate-name'].isin(["gate"]), 'car-id'].unique()
print(len(crossed_gate))
#new df
df_insight2a = df.loc[df['car-id'].isin(crossed_gate)]

1021


In [168]:
#sample
crossed_gate_sample = np.random.choice(crossed_gate, 55, replace = False)
print(len(crossed_gate_sample))
#new df
df_insight2a_sample = df.loc[df['car-id'].isin(crossed_gate_sample)]


55


# insight3

Approximately 2700 (14.4%) cars are definitely campers, who entered the preserve through the entrance and went to various camp sites without stopping at any ranger stops or ranger bases.

In [169]:
# Get sequence where both entrance and camping is present
potential_camper_entrance = df.loc[df['gate-name'].isin(["entrance"]), 'car-id'].unique()
potential_camper_camping = df.loc[df['gate-name'].isin(["camping"]), 'car-id'].unique()
potential_camper = np.intersect1d(potential_camper_entrance, potential_camper_camping)
print(len(potential_camper))

# Get sequence where one of ranger-stop and ranger-base is present
stopper = df.loc[df['gate-name'].isin(["ranger-stop", "ranger-base"]), 'car-id'].unique()
print(len(stopper))
camper =  list(set(potential_camper) - set(stopper))
print(len(camper))

df_insight3 = df.loc[df['car-id'].isin(camper)]

9660
12827
2689


In [170]:
#sample
camper_sample = np.random.choice(camper, 144, replace = False)
print(len(camper_sample))
#new df
df_insight3_sample = df.loc[df['car-id'].isin(camper_sample)]


144


In [171]:
insight_df = pd.concat([df_insight1_sample, df_insight2_sample, df_insight2a_sample, df_insight3_sample], ignore_index=True)
insight_df.drop_duplicates(ignore_index=True, inplace=True)
insight_df.info()
len(insight_df)
print(insight_df)
#sanity-check
insight_df["car-id"].nunique()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10468 entries, 0 to 10467
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   car-id     10468 non-null  object
 1   gate-name  10468 non-null  object
 2   Timestamp  10468 non-null  object
 3   car-type   10468 non-null  object
dtypes: object(4)
memory usage: 327.2+ KB
                   car-id     gate-name            Timestamp car-type
0      20150802110852-346      entrance  2015-05-02 23:08:52        2
1      20150802110852-346  general gate  2015-05-02 23:22:45        2
2      20150802110852-346  general gate  2015-05-02 23:30:33        2
3      20150802110852-346      entrance  2015-05-02 23:45:01        2
4      20150903020928-497      entrance  2015-05-03 02:09:28        2
...                   ...           ...                  ...      ...
10463  20164228094248-123      entrance  2016-05-28 10:21:48        2
10464  20160730070750-227      entrance  2016-05-30 07:0

1051

In [179]:
insight_df.to_csv('vast-eventflow-shortened.txt',index=False,header=False,sep="\t")

In [180]:
insight_df.to_csv('vast-eventflow-shortened.csv',index=False,sep=",")

# Other codes

In [106]:
potential_camper = df.loc[df['gate-name'].isin(["entrance", "camping"]), 'car-id'].unique()
stopper = df.loc[df['gate-name'].isin(["ranger-stop", "ranger-base"]), 'car-id'].unique()

print(len(potential_camper))
print(len(stopper))
camper =  list(set(potential_camper) - set(stopper))#np.intersect1d(potential_camper, stopper)
print(len(camper))

18333
12827
5881


In [38]:
df.loc[~df['gate-name'].isin(["ranger-stop", "ranger-base"])]

Unnamed: 0,car-id,gate-name,Timestamp,car-type
0,20154301124328-262,entrance,2015-05-01 00:43:28,4
1,20154301124328-262,general gate,2015-05-01 01:03:48,4
4,20154301124328-262,general gate,2015-05-01 01:12:36,4
5,20154301124328-262,general gate,2015-05-01 01:24:02,4
6,20153101013141-937,entrance,2015-05-01 01:31:41,1
...,...,...,...,...
171470,20161031111001-854,general gate,2016-05-31 23:33:00,6
171473,20165831105856-579,general gate,2016-05-31 23:42:08,1
171474,20161031111001-854,general gate,2016-05-31 23:43:13,6
171475,20165831105856-579,general gate,2016-05-31 23:49:45,1


In [177]:
len(np.intersect1d(crossed_gate, taken_ranger_stops))

1002

In [None]:
df["gate-name"].isin()

In [34]:
df[df.["gate-name"].str.contains('oo', regex= True, na=False)]

AttributeError: 'DataFrame' object has no attribute 'gate'

In [12]:
df.to_csv('cleaned-eventflow.txt',index=False,header=False,sep="\t")