In [2]:
import datetime
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from functools import partial
from tqdm import tqdm

In [3]:
# Load the dockless-trip CSV export into a DataFrame and preview the first rows.
df = pd.read_csv("data/DocklessTripOpenData_9.csv")
df.head()

Unnamed: 0,TripID,StartDate,StartTime,EndDate,EndTime,TripDuration,TripDistance,StartLatitude,StartLongitude,EndLatitude,EndLongitude,DayOfWeek,HourNum
0,0000045c-2677-3a7d-4b73-cad99a57,2019-06-26,19:30,2019-06-26,19:30,3.0,0.0,38.253,-85.756,38.253,-85.755,4,19
1,0000487b-92e6-50d6-7569-42ed3818,2019-09-22,14:30,2019-09-22,14:30,5.0,0.0,38.207,-85.747,38.206,-85.748,1,14
2,00006088-2579-e0d0-6a30-a15bb878,2019-08-21,17:30,2019-08-21,17:30,6.0,0.33,38.264,-85.728,38.261,-85.73,4,17
3,00008c1a-899b-8596-970f-9f6bf495,2019-07-03,11:00,2019-07-03,11:15,6.0,0.64,38.217,-85.757,38.221,-85.763,4,11
4,000096c8-bac3-4d6f-7ebb-30b957ee,2019-05-04,21:15,2019-05-04,21:30,7.0,0.684,38.221,-85.762,38.223,-85.764,7,21


In [3]:
# (rows, columns) of the frame as loaded.
df.shape

(434582, 13)

In [4]:
# Summary statistics of the start coordinates. In the saved output the min/max
# lie far from the quartiles, so the data contains a few geographic outliers.
df[["StartLatitude", "StartLongitude"]].describe()

Unnamed: 0,StartLatitude,StartLongitude
count,434582.0,434582.0
mean,38.241514,-85.746324
std,0.036914,0.088154
min,25.775,-122.657
25%,38.224,-85.758
50%,38.251,-85.752
75%,38.256,-85.74
max,45.573,-73.969


In [4]:
def time_merge(row):
    """Join a trip's date and time fields into full timestamp strings.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``StartDate``, ``StartTime``, ``EndDate`` and ``EndTime``.

    Returns:
        pd.Series with two entries, ``started_at`` and ``ended_at``, each
        formatted as ``"<date> <time>:00"``.
    """
    stamps = {}
    for prefix, key in (("Start", "started_at"), ("End", "ended_at")):
        # The source times carry no seconds field, so ":00" is appended.
        stamps[key] = f"{row[prefix + 'Date']} {row[prefix + 'Time']}:00"
    return pd.Series(stamps)

In [5]:
# Build the combined started_at / ended_at strings for every trip by applying
# time_merge row by row (axis=1), then preview the result.
subdf = df.apply(time_merge, axis=1)
subdf.head()

Unnamed: 0,started_at,ended_at
0,2019-06-26 19:30:00,2019-06-26 19:30:00
1,2019-09-22 14:30:00,2019-09-22 14:30:00
2,2019-08-21 17:30:00,2019-08-21 17:30:00
3,2019-07-03 11:00:00,2019-07-03 11:15:00
4,2019-05-04 21:15:00,2019-05-04 21:30:00


In [6]:
# Append the new started_at / ended_at columns next to the original columns.
df = pd.concat([df, subdf], axis=1)
df.head()

Unnamed: 0,TripID,StartDate,StartTime,EndDate,EndTime,TripDuration,TripDistance,StartLatitude,StartLongitude,EndLatitude,EndLongitude,DayOfWeek,HourNum,started_at,ended_at
0,0000045c-2677-3a7d-4b73-cad99a57,2019-06-26,19:30,2019-06-26,19:30,3.0,0.0,38.253,-85.756,38.253,-85.755,4,19,2019-06-26 19:30:00,2019-06-26 19:30:00
1,0000487b-92e6-50d6-7569-42ed3818,2019-09-22,14:30,2019-09-22,14:30,5.0,0.0,38.207,-85.747,38.206,-85.748,1,14,2019-09-22 14:30:00,2019-09-22 14:30:00
2,00006088-2579-e0d0-6a30-a15bb878,2019-08-21,17:30,2019-08-21,17:30,6.0,0.33,38.264,-85.728,38.261,-85.73,4,17,2019-08-21 17:30:00,2019-08-21 17:30:00
3,00008c1a-899b-8596-970f-9f6bf495,2019-07-03,11:00,2019-07-03,11:15,6.0,0.64,38.217,-85.757,38.221,-85.763,4,11,2019-07-03 11:00:00,2019-07-03 11:15:00
4,000096c8-bac3-4d6f-7ebb-30b957ee,2019-05-04,21:15,2019-05-04,21:30,7.0,0.684,38.221,-85.762,38.223,-85.764,7,21,2019-05-04 21:15:00,2019-05-04 21:30:00


In [7]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [9]:
# Earliest/latest start and end timestamps. The columns still hold strings
# here (see the output), so min/max compare lexicographically.
df["started_at"].min(), df["started_at"].max(), df["ended_at"].min(), df["ended_at"].max()

('2018-08-09 10:30:00',
 '2019-10-31 21:15:00',
 '2018-08-09 10:30:00',
 '2019-10-31 21:30:00')

In [10]:
def fix_time(row, time_col):
    """Normalize a ``"YYYY-MM-DD HH:MM:SS"`` string whose fields may overflow.

    Values such as minute ``60``, hour ``24`` or a day one past the end of
    the month are carried into the next larger unit. The carry is done with
    real calendar arithmetic, so leap years and multi-step overflows are
    handled correctly. The previous hand-written month-length table treated
    every February as 28 days and therefore pushed a valid ``2020-02-29``
    to March 1.

    Args:
        row: Mapping-like record (e.g. a DataFrame row or a dict).
        time_col: Key in ``row`` holding the timestamp string.

    Returns:
        The normalized timestamp formatted as ``"%Y-%m-%d %H:%M:%S"``.

    Raises:
        ValueError: If the string does not have the expected shape, a field
            is not an integer, or the year/month is out of range.
    """
    date_part, time_part = row[time_col].split(" ")
    year, month, day = (int(field) for field in date_part.split("-"))
    hour, minute, second = (int(field) for field in time_part.split(":"))

    # Anchor on the first day of the month, then add everything else as an
    # offset so that any overflow rolls over through the real calendar.
    month_start = datetime.datetime(year=year, month=month, day=1)
    t = month_start + datetime.timedelta(
        days=day - 1, hours=hour, minutes=minute, seconds=second
    )
    return t.strftime("%Y-%m-%d %H:%M:%S")

In [11]:
# Normalize both timestamp columns with fix_time (row-wise apply), storing the
# results in new started_at2 / ended_at2 columns, then preview.
df["started_at2"] = df.apply(partial(fix_time, time_col="started_at"), axis=1)
df["ended_at2"] = df.apply(partial(fix_time, time_col="ended_at"), axis=1)
df.head()

Unnamed: 0,TripID,StartDate,StartTime,EndDate,EndTime,TripDuration,TripDistance,StartLatitude,StartLongitude,EndLatitude,EndLongitude,DayOfWeek,HourNum,started_at,ended_at,started_at2,ended_at2
0,0000045c-2677-3a7d-4b73-cad99a57,2019-06-26,19:30,2019-06-26,19:30,3.0,0.0,38.253,-85.756,38.253,-85.755,4,19,2019-06-26 19:30:00,2019-06-26 19:30:00,2019-06-26 19:30:00,2019-06-26 19:30:00
1,0000487b-92e6-50d6-7569-42ed3818,2019-09-22,14:30,2019-09-22,14:30,5.0,0.0,38.207,-85.747,38.206,-85.748,1,14,2019-09-22 14:30:00,2019-09-22 14:30:00,2019-09-22 14:30:00,2019-09-22 14:30:00
2,00006088-2579-e0d0-6a30-a15bb878,2019-08-21,17:30,2019-08-21,17:30,6.0,0.33,38.264,-85.728,38.261,-85.73,4,17,2019-08-21 17:30:00,2019-08-21 17:30:00,2019-08-21 17:30:00,2019-08-21 17:30:00
3,00008c1a-899b-8596-970f-9f6bf495,2019-07-03,11:00,2019-07-03,11:15,6.0,0.64,38.217,-85.757,38.221,-85.763,4,11,2019-07-03 11:00:00,2019-07-03 11:15:00,2019-07-03 11:00:00,2019-07-03 11:15:00
4,000096c8-bac3-4d6f-7ebb-30b957ee,2019-05-04,21:15,2019-05-04,21:30,7.0,0.684,38.221,-85.762,38.223,-85.764,7,21,2019-05-04 21:15:00,2019-05-04 21:30:00,2019-05-04 21:15:00,2019-05-04 21:30:00


In [None]:
# Earliest and latest normalized start time (this cell has no saved output).
df.started_at2.min(), df.started_at2.max()

In [12]:
# Parse the normalized strings into pandas datetime values so that they can be
# compared against datetime objects in the filters below.
df["started_at2"] = pd.to_datetime(df["started_at2"])
df["ended_at2"] = pd.to_datetime(df["ended_at2"])

In [13]:
# Earliest/latest start and end times after parsing (Timestamps in the output).
df["started_at2"].min(), df["started_at2"].max(), df["ended_at2"].min(), df["ended_at2"].max()

(Timestamp('2018-08-09 10:30:00'),
 Timestamp('2019-10-31 21:15:00'),
 Timestamp('2018-08-09 10:30:00'),
 Timestamp('2019-10-31 21:30:00'))

In [14]:
# Count trips that started in January 2019: half-open window [s, e).
s = datetime.datetime(year=2019, month=1, day=1)
e = datetime.datetime(year=2019, month=2, day=1)

df[(df["started_at2"] < e) & (df["started_at2"] >= s)].shape

(11276, 17)

In [15]:
# Count trips that started in February 2019: half-open window [s, e).
s = datetime.datetime(year=2019, month=2, day=1)
e = datetime.datetime(year=2019, month=3, day=1)

df[(df["started_at2"] < e) & (df["started_at2"] >= s)].shape

(12536, 17)

In [59]:
# Count trips that started in April-June 2019: half-open window [s, e).
s = datetime.datetime(year=2019, month=4, day=1)
e = datetime.datetime(year=2019, month=7, day=1)

df[(df["started_at2"] < e) & (df["started_at2"] >= s)].shape

(136525, 17)

In [60]:
# Count trips that started in July-September 2019: half-open window [s, e).
s = datetime.datetime(year=2019, month=7, day=1)
e = datetime.datetime(year=2019, month=10, day=1)

df[(df["started_at2"] < e) & (df["started_at2"] >= s)].shape

(164084, 17)

In [26]:
# (rows, columns) after cleaning.
# NOTE(review): the saved output shows 15 columns while the filter cells above
# report 17, so this output looks stale -- re-run the notebook to refresh it.
df.shape

(434578, 15)

In [16]:
def generate_streaming_data(df, col="start", first_month=(2018, 10),
                            last_month=(2019, 7), out_dir="parsed/month1"):
    """Write one "create" event file per calendar month of trip data.

    For every month from ``first_month`` to ``last_month`` (both inclusive)
    the trips whose ``{col}ed_at2`` timestamp falls inside that month are
    sorted by time and written, one per line, as::

        create <pid> <lat> <lng> <time>

    ``pid`` is a running counter that starts at 0 and keeps increasing
    across months, so ids are unique within one call.

    Args:
        df: DataFrame holding ``{col}ed_at2`` (datetime values) and the
            ``{Col}Latitude`` / ``{Col}Longitude`` columns.
        col: ``"start"`` or ``"end"``; selects which trip endpoint is used.
        first_month: ``(year, month)`` of the first month to export.
        last_month: ``(year, month)`` of the last month to export.
        out_dir: Directory receiving the files; created if it is missing.

    Returns:
        None. The function only writes files.
    """
    import os

    time_colname = f"{col}ed_at2"
    lat_colname = f"{col.capitalize()}Latitude"
    lng_colname = f"{col.capitalize()}Longitude"

    # Opening a file does not create parent directories, so create them here.
    os.makedirs(out_dir, exist_ok=True)

    pid = 0
    year, month = first_month

    # Tuple comparison bounds the loop even if last_month precedes first_month.
    while (year, month) <= tuple(last_month):
        next_year, next_month = (year + 1, 1) if month == 12 else (year, month + 1)

        # Half-open window [s, e) covering exactly one calendar month.
        s = datetime.datetime(year=year, month=month, day=1)
        e = datetime.datetime(year=next_year, month=next_month, day=1)

        subdf = df[(df[time_colname] >= s) & (df[time_colname] < e)]
        subdf = subdf.sort_values(by=time_colname)

        # NOTE(review): "Lousiville" is spelled as in the original file names;
        # kept unchanged so existing consumers of these files keep working.
        out_path = f"{out_dir}/city_Lousiville_escooter_trip_{col}_streaming-{year}-{month}.txt"
        with open(out_path, "w") as f:
            # Iterate the three columns in lockstep instead of building a
            # Series per row with .iloc, which dominates the runtime.
            records = zip(subdf[time_colname], subdf[lat_colname], subdf[lng_colname])
            for time, lat, lng in tqdm(records, total=len(subdf)):
                f.write(f"create {pid} {lat} {lng} {time}\n")
                pid += 1

        year, month = next_year, next_month

In [17]:
# Write the monthly streaming files, first for trip starts and then for trip ends.
generate_streaming_data(df, "start")
generate_streaming_data(df, "end")

100%|██████████| 15346/15346 [00:02<00:00, 7661.55it/s]
100%|██████████| 17182/17182 [00:02<00:00, 7607.22it/s]
100%|██████████| 13066/13066 [00:01<00:00, 7887.42it/s]
100%|██████████| 11276/11276 [00:01<00:00, 7864.54it/s]
100%|██████████| 12536/12536 [00:01<00:00, 7590.46it/s]
100%|██████████| 19776/19776 [00:02<00:00, 7752.49it/s]
100%|██████████| 34422/34422 [00:04<00:00, 7595.33it/s]
100%|██████████| 50333/50333 [00:06<00:00, 7560.21it/s]
100%|██████████| 51770/51770 [00:06<00:00, 7591.57it/s]
100%|██████████| 63096/63096 [00:08<00:00, 7607.18it/s]
100%|██████████| 15346/15346 [00:02<00:00, 7431.51it/s]
100%|██████████| 17181/17181 [00:02<00:00, 7669.40it/s]
100%|██████████| 13062/13062 [00:01<00:00, 7701.29it/s]
100%|██████████| 11280/11280 [00:01<00:00, 7656.18it/s]
100%|██████████| 12535/12535 [00:01<00:00, 6942.28it/s]
100%|██████████| 19778/19778 [00:02<00:00, 7498.91it/s]
100%|██████████| 34415/34415 [00:04<00:00, 7539.61it/s]
100%|██████████| 50326/50326 [00:06<00:00, 7703.

In [None]:
import folium, io
from PIL import Image


# Column names for the trip-start timestamp and coordinates. These are plain
# string literals: the previous f-prefixes had no placeholders to fill.
time_colname = "started_at2"
lat_colname = "StartLatitude"
lng_colname = "StartLongitude"

# Mean start position, used below as the center of the folium map.
mean_lat = df[lat_colname].mean()
mean_long = df[lng_colname].mean()

# pid = 0

# year_start, month_start, day_start = 2018, 10, 1

# while True:
#     m = folium.Map(location=[mean_lat, mean_long], zoom_start=14)
#     year_end, month_end, day_end = year_start, month_start + 1, 1
#     if month_end > 12:
#         year_end += 1
#         month_end -= 12
        
#     s = datetime.datetime(year=year_start, month=month_start, day=day_start)
#     e = datetime.datetime(year=year_end, month=month_end, day=day_end)
    
#     subdf = df[(df[time_colname] >= s) & (df[time_colname] < e)]
#     subdf = subdf.sort_values(by=time_colname)

#     for i in tqdm(range(len(subdf))):
#         lat, lng = subdf.iloc[i][[lat_colname, lng_colname]]
#         print(lat, lng)
#         folium.CircleMarker(
#             location=[lat, lng],
#             radius=5
#         ).add_to(m)

#     map_image = m._to_png(delay=5)
#     image = Image.open(io.BytesIO(map_image))
#     image.save(f"vis/test_2020_week_{year_start}-{month_start}-{day_start}.png")

#     if year_start == 2019 and month_start == 3: 
#         break
    
#     month_start += 1
#     if month_start > 12:
#         year_start += 1
#         month_start -= 12


In [50]:
# Preview map: start points of the first (up to) 500 trips, each shifted by a
# small Gaussian jitter so that markers at identical coordinates stay visible.
m = folium.Map(location=[mean_lat, mean_long], zoom_start=14)

# Seeded generator so that re-running the notebook draws the same map.
rng = np.random.default_rng(0)

# head(500) copes with frames shorter than 500 rows, where df.iloc[i] over a
# fixed range(500) would raise IndexError.
preview = df[["StartLatitude", "StartLongitude"]].head(500).to_numpy(dtype=float)
jitter = rng.normal(scale=1e-4, size=preview.shape)

for lat, lng in (preview + jitter).tolist():
    folium.CircleMarker(
        location=[lat, lng],
        radius=5
    ).add_to(m)

m