# df_points with UNIX Timestamp #
## Creates df_points.csv with all points and idletimes #
## Checks the valid geo location
## deletes the invalid points: 

2018 - invalides: 41743 from 139171 that is 29.994036113845556 %

2019 - invalides: 52523 from 550887 that is 9.534260202183024 % - 05:40:19

2020 - invalides: 24163 from 539064 that is 4.482399121440126 % - 05:40:06

2021 - invalides: 21382 from 552253 that is 3.8717761605640897 % - 05:51:53

### ca 1h for 1 year

In [1]:
def import_data(source):
    return pd.read_csv(source)

In [2]:
def dic_out_of_df(df01):
    import pandas as pd
    from tqdm import tqdm
    # Dict out of DataFrames of Trips per Bike

    # init DFs for every bike
    myDFs = {}

    for ind in tqdm(df01.index):
        if df01['bike_id'][ind] not in myDFs:
            myDFs[df01['bike_id'][ind]] = pd.DataFrame(
                columns=['bike_id',
                         'start_time',
                         'end_time',
                         'start_lat',
                         'start_lng',
                         'end_lat',
                         'end_lng',
                         'end_station_number'])
    return myDFs

In [3]:
def add_rentals_to_df(df01,myDFs):
    from tqdm import tqdm
    # Add all rentals to their bike_id DF
    for ind in tqdm(df01.index):
        if df01['bike_id'][ind] in myDFs:
            myDFs[df01['bike_id'][ind]].loc[df01.index[ind]] = df01.iloc[ind]
    return myDFs

In [4]:
def sort_dfs(myDFs):
    # sort every DF
    for df in myDFs.values():
        df[['bike_id', 'start_time','end_time']] = df[['bike_id', 'start_time','end_time']].astype(int)
        df[['start_lat','start_lng', 'end_lat', 'end_lng']] = df[['start_lat','start_lng', 'end_lat', 'end_lng']].astype(float)
        df.sort_values(by=['start_time'], inplace=True)
        df.reset_index(drop=True, inplace=True)
    return myDFs

In [5]:
# returns distance in meter
# source: https://www.it-swarm.com.de/de/python/wie-kann-ich-die-entfernung-zwischen-zwei-punkten-breitengrad-laengengrad-schnell-schaetzen/1072488907/
def get_distance(Lat1, Long1, Lat2, Long2):
    x = Lat2 - Lat1
    y = (Long2 - Long1)*cos((Lat2 + Lat1)*0.00872664626)
    return 111.138*sqrt(x*x+y*y)*1000

In [6]:
def create_df_points(myDFs,radius):
    # DF of all points with stoodtime
    in_val = 0
    val = 0 
    
    df_points = pd.DataFrame(columns=['bike_id', 'lat', 'lng', 'idle_time', 'time_start', 'time_end', 'end_station_number'])
    #df_points[['bike_id', 'idle_time', 'time_start', 'time_end']] = df_points[['bike_id', 'idle_time', 'time_start', 'time_end']].astype(int)
    #df_points[['lng', 'lat']] = df_points[['lng', 'lat']].astype(float)
    
    for df in tqdm(myDFs.values()):
        for ind in df.index[1:]:
            #in SECOUNDS
            _idle_time = (df['start_time'][ind] - df['end_time'][ind - 1]).astype(int)
            distance = get_distance(df['end_lat'][ind - 1], df['end_lng'][ind - 1], df['start_lat'][ind], df['start_lng'][ind])
            if distance > radius:
                in_val +=1
                continue
            val +=1
            dict = {'bike_id': df['bike_id'][ind], 'lng': df['start_lng'][ind], 'lat': df['start_lat'][ind],
                    'idle_time': _idle_time, 'time_start': df['end_time'][ind - 1], 'time_end': df['start_time'][ind], 'end_station_number': df['end_station_number'][ind-1]}
            df_points = df_points.append(dict, ignore_index=True)
    print(f"invalides: {in_val} from {val} that is {in_val/val * 100} %")
    return df_points

In [7]:
def keplerMap(df_points):
    heatmap = KeplerGl()
    heatmap.add_data(data=df_points, name='points')
    heatmap.save_to_html(file_name='heatmap_test.html')

# Main 
## Create map

In [8]:
from keplergl import KeplerGl
import pandas as pd
pd.options.mode.chained_assignment = None
import time

ts1 = time.time()

source = '../data/raw_lendings/raw_lendings_2021.csv'

df = import_data(source)
myDfs = dic_out_of_df(df)
myDfs = add_rentals_to_df(df, myDfs)
myDfs = sort_dfs(myDfs)


100%|██████████| 574838/574838 [00:05<00:00, 96933.88it/s] 
100%|██████████| 574838/574838 [13:18<00:00, 719.79it/s]


# Create DF_points
## without invalid points

In [9]:
from math import cos, sqrt
from tqdm import tqdm

radius = 20
df_points = create_df_points(myDfs,radius)

100%|██████████| 1203/1203 [50:44<00:00,  2.53s/it]

invalides: 21382 from 552253 that is 3.8717761605640897 %





In [10]:
df_points.tail()

Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number
552248,102511.0,51.346111,12.357207,121093.0,1640614000.0,1640735000.0,
552249,102511.0,51.34612,12.357187,73570.0,1640736000.0,1640809000.0,
552250,102511.0,51.355333,12.370821,29032.0,1640810000.0,1640839000.0,
552251,102511.0,51.340578,12.366774,39283.0,1640839000.0,1640879000.0,
552252,102511.0,51.337324,12.358628,22694.0,1640879000.0,1640902000.0,


# Split Points at midnight

## save feature idle_time_next_day

In [11]:
df_points['idle_time'] = pd.to_timedelta(df_points['idle_time'],unit='s')
df_points['time_start'] = pd.to_datetime(df_points['time_start'],unit='s')
df_points['time_end'] = pd.to_datetime(df_points['time_end'],unit='s')

In [12]:
from tqdm import tqdm
from datetime import timedelta
import numpy as np

changed = 1
iterations = 0

df_points['idle_time_next_day'] = np.nan
df_points['over_night'] = 0

while changed:
    iterations += 1
    changed = 0
    for ind in tqdm(df_points.index):
        ts = df_points['time_start'][ind]
        te = df_points['time_end'][ind]
        t0 = ts.replace(hour=23, minute=59, second=59)
        
        # is timestood greater than same date 23:59:59
        #   -> over midnight
        if ts + df_points['idle_time'][ind] > t0:
            changed = 1
            
            underhang = t0 - ts
            overhang = te - t0
            
            # fix current day
            df_points['time_end'][ind] = ts.replace(hour=23, minute=59, second=59)
            df_points['idle_time'][ind] = underhang
            df_points['over_night'][ind] = 1
            df_points['idle_time_next_day'][ind] = overhang
            
            # add new row for the next day
            time_start = ts.replace(hour=0, minute=0, second=0)
            time_start += timedelta(days=1)
            flag = 0
            #if time_start + overhang > time_start.replace(hour=23, minute=59, second=59):
            #    flag = 1
            
            dict = {'bike_id': df_points['bike_id'][ind], 'lng': df_points['lng'][ind], 'lat': df_points['lat'][ind],
                    'idle_time': overhang, 'time_start': time_start, 'time_end': te,'over_night': flag, 'idle_time_next_day': np.nan, 'end_station_number': df_points['end_station_number'][ind]}
            df_points = df_points.append(dict, ignore_index=True)

print(iterations)

100%|██████████| 552253/552253 [1:57:07<00:00, 78.58it/s]  
100%|██████████| 691473/691473 [25:14<00:00, 456.49it/s]  
100%|██████████| 717034/717034 [12:10<00:00, 981.21it/s]  
100%|██████████| 728352/728352 [06:49<00:00, 1776.80it/s] 
100%|██████████| 734491/734491 [04:31<00:00, 2705.80it/s] 
100%|██████████| 738304/738304 [03:11<00:00, 3851.35it/s] 
100%|██████████| 740834/740834 [02:24<00:00, 5126.75it/s] 
100%|██████████| 742600/742600 [01:57<00:00, 6346.33it/s] 
100%|██████████| 743897/743897 [01:37<00:00, 7608.06it/s] 
100%|██████████| 744886/744886 [01:22<00:00, 9000.66it/s] 
100%|██████████| 745651/745651 [01:15<00:00, 9859.18it/s] 
100%|██████████| 746245/746245 [01:04<00:00, 11591.44it/s]
100%|██████████| 746714/746714 [00:57<00:00, 12877.76it/s]
100%|██████████| 747080/747080 [00:54<00:00, 13656.01it/s]
100%|██████████| 747387/747387 [00:50<00:00, 14766.50it/s]
100%|██████████| 747639/747639 [00:48<00:00, 15386.11it/s]
100%|██████████| 747842/747842 [00:45<00:00, 16328.66it

113





In [13]:
df_points

Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night
0,93487.0,51.326737,12.358117,0 days 00:33:09,2021-01-03 08:38:21,2021-01-03 09:11:30,4034.0,,0
1,93487.0,51.318782,12.368816,0 days 14:41:06,2021-01-03 09:18:53,2021-01-03 23:59:59,,1 days 14:02:38,1
2,93487.0,51.328800,12.371326,0 days 09:53:19,2021-01-05 14:06:40,2021-01-05 23:59:59,,0 days 20:03:08,1
3,93487.0,51.329124,12.371294,0 days 10:33:40,2021-01-07 13:26:19,2021-01-07 23:59:59,,0 days 11:46:52,1
4,93487.0,51.335262,12.339411,0 days 01:11:45,2021-01-08 22:48:14,2021-01-08 23:59:59,,0 days 20:59:28,1
...,...,...,...,...,...,...,...,...,...
749344,97516.0,51.324809,12.336997,0 days 23:59:59,2021-10-09 00:00:00,2021-10-09 23:59:59,4051.0,3 days 18:16:18,1
749345,97516.0,51.324809,12.336997,0 days 23:59:59,2021-10-10 00:00:00,2021-10-10 23:59:59,4051.0,2 days 18:16:18,1
749346,97516.0,51.324809,12.336997,0 days 23:59:59,2021-10-11 00:00:00,2021-10-11 23:59:59,4051.0,1 days 18:16:18,1
749347,97516.0,51.324809,12.336997,0 days 23:59:59,2021-10-12 00:00:00,2021-10-12 23:59:59,4051.0,0 days 18:16:18,1


idle time in min

In [14]:
def idle_time_to_min(row):
    return row['idle_time'].seconds / 60

df_points['idle_time'] = df_points.apply(idle_time_to_min, axis=1)

add idle_time_next_day feature

In [15]:
df_points['idle_time_next_day'] = pd.to_timedelta(df_points['idle_time_next_day'])

def idle_time_next_day_to_min(row):
    if row['over_night'] == 1:
        t = row['idle_time_next_day'].total_seconds() / 60
        if t < 1439:
            return t
        else: return 1439
    else:
        return 0

df_points['idle_time_next_day'] = df_points.apply(idle_time_next_day_to_min, axis=1)


In [16]:
df_points

Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night
0,93487.0,51.326737,12.358117,33.150000,2021-01-03 08:38:21,2021-01-03 09:11:30,4034.0,0.000000,0
1,93487.0,51.318782,12.368816,881.100000,2021-01-03 09:18:53,2021-01-03 23:59:59,,1439.000000,1
2,93487.0,51.328800,12.371326,593.316667,2021-01-05 14:06:40,2021-01-05 23:59:59,,1203.133333,1
3,93487.0,51.329124,12.371294,633.666667,2021-01-07 13:26:19,2021-01-07 23:59:59,,706.866667,1
4,93487.0,51.335262,12.339411,71.750000,2021-01-08 22:48:14,2021-01-08 23:59:59,,1259.466667,1
...,...,...,...,...,...,...,...,...,...
749344,97516.0,51.324809,12.336997,1439.983333,2021-10-09 00:00:00,2021-10-09 23:59:59,4051.0,1439.000000,1
749345,97516.0,51.324809,12.336997,1439.983333,2021-10-10 00:00:00,2021-10-10 23:59:59,4051.0,1439.000000,1
749346,97516.0,51.324809,12.336997,1439.983333,2021-10-11 00:00:00,2021-10-11 23:59:59,4051.0,1439.000000,1
749347,97516.0,51.324809,12.336997,1439.983333,2021-10-12 00:00:00,2021-10-12 23:59:59,4051.0,1096.300000,1


change type to int

In [17]:
df_points['idle_time'] = df_points['idle_time'].astype(int)
df_points['idle_time_next_day'] = df_points['idle_time_next_day'].astype(int)
df_points

Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night
0,93487.0,51.326737,12.358117,33,2021-01-03 08:38:21,2021-01-03 09:11:30,4034.0,0,0
1,93487.0,51.318782,12.368816,881,2021-01-03 09:18:53,2021-01-03 23:59:59,,1439,1
2,93487.0,51.328800,12.371326,593,2021-01-05 14:06:40,2021-01-05 23:59:59,,1203,1
3,93487.0,51.329124,12.371294,633,2021-01-07 13:26:19,2021-01-07 23:59:59,,706,1
4,93487.0,51.335262,12.339411,71,2021-01-08 22:48:14,2021-01-08 23:59:59,,1259,1
...,...,...,...,...,...,...,...,...,...
749344,97516.0,51.324809,12.336997,1439,2021-10-09 00:00:00,2021-10-09 23:59:59,4051.0,1439,1
749345,97516.0,51.324809,12.336997,1439,2021-10-10 00:00:00,2021-10-10 23:59:59,4051.0,1439,1
749346,97516.0,51.324809,12.336997,1439,2021-10-11 00:00:00,2021-10-11 23:59:59,4051.0,1439,1
749347,97516.0,51.324809,12.336997,1439,2021-10-12 00:00:00,2021-10-12 23:59:59,4051.0,1096,1


# Save V2 df_points split

In [18]:
df_points['idle_time'].max()

1439

In [19]:
df_points.head()

Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night
0,93487.0,51.326737,12.358117,33,2021-01-03 08:38:21,2021-01-03 09:11:30,4034.0,0,0
1,93487.0,51.318782,12.368816,881,2021-01-03 09:18:53,2021-01-03 23:59:59,,1439,1
2,93487.0,51.3288,12.371326,593,2021-01-05 14:06:40,2021-01-05 23:59:59,,1203,1
3,93487.0,51.329124,12.371294,633,2021-01-07 13:26:19,2021-01-07 23:59:59,,706,1
4,93487.0,51.335262,12.339411,71,2021-01-08 22:48:14,2021-01-08 23:59:59,,1259,1


## Add H3 Index

In [20]:
import h3

df_points['hex_id'] = 0

for ind in tqdm(df_points.index):
    df_points['hex_id'][ind] = h3.geo_to_h3(df_points['lat'][ind], df_points['lng'][ind], 8)
df_points.head()

100%|██████████| 749349/749349 [00:13<00:00, 56694.86it/s]


Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night,hex_id
0,93487.0,51.326737,12.358117,33,2021-01-03 08:38:21,2021-01-03 09:11:30,4034.0,0,0,881f1a164dfffff
1,93487.0,51.318782,12.368816,881,2021-01-03 09:18:53,2021-01-03 23:59:59,,1439,1,881f1a1647fffff
2,93487.0,51.3288,12.371326,593,2021-01-05 14:06:40,2021-01-05 23:59:59,,1203,1,881f1a164bfffff
3,93487.0,51.329124,12.371294,633,2021-01-07 13:26:19,2021-01-07 23:59:59,,706,1,881f1a164bfffff
4,93487.0,51.335262,12.339411,71,2021-01-08 22:48:14,2021-01-08 23:59:59,,1259,1,881f1a8ca5fffff


# Add weather Data

In [21]:
dfw = pd.read_csv('../data/weather_data.csv')
dfw['datetime'] = pd.to_datetime(dfw['dt'], unit='s')
dfw = dfw.set_index(['datetime'])
dfw.head()

df_points['temp'] = np.nan
df_points['rain'] = np.nan
df_points['snow'] = np.nan
df_points['wind_speed'] = np.nan
df_points['humidity'] = np.nan

from tqdm import tqdm
pd.options.mode.chained_assignment = None

for ind in tqdm(df_points.index):
    wint = df_points['time_start'][ind].replace(minute=0,second=0)
    df_points['temp'][ind] = dfw['temp'][wint].copy()
    df_points['rain'][ind] = dfw['rain_1h'][wint].copy()
    df_points['snow'][ind] = dfw['snow_1h'][wint].copy()
    df_points['wind_speed'][ind] = dfw['wind_speed'][wint].copy()
    df_points['humidity'][ind] = dfw['humidity'][wint].copy()

df_points

100%|██████████| 749349/749349 [16:05<00:00, 776.04it/s]


Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night,hex_id,temp,rain,snow,wind_speed,humidity
0,93487.0,51.326737,12.358117,33,2021-01-03 08:38:21,2021-01-03 09:11:30,4034.0,0,0,881f1a164dfffff,0.22,,0.51,5.81,94.0
1,93487.0,51.318782,12.368816,881,2021-01-03 09:18:53,2021-01-03 23:59:59,,1439,1,881f1a1647fffff,0.38,0.51,,0.89,95.0
2,93487.0,51.328800,12.371326,593,2021-01-05 14:06:40,2021-01-05 23:59:59,,1203,1,881f1a164bfffff,0.55,,0.25,3.13,76.0
3,93487.0,51.329124,12.371294,633,2021-01-07 13:26:19,2021-01-07 23:59:59,,706,1,881f1a164bfffff,1.91,,,7.15,77.0
4,93487.0,51.335262,12.339411,71,2021-01-08 22:48:14,2021-01-08 23:59:59,,1259,1,881f1a8ca5fffff,0.42,,0.16,0.45,92.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749344,97516.0,51.324809,12.336997,1439,2021-10-09 00:00:00,2021-10-09 23:59:59,4051.0,1439,1,881f1a8d91fffff,5.64,,,2.68,86.0
749345,97516.0,51.324809,12.336997,1439,2021-10-10 00:00:00,2021-10-10 23:59:59,4051.0,1439,1,881f1a8d91fffff,4.00,,,0.45,79.0
749346,97516.0,51.324809,12.336997,1439,2021-10-11 00:00:00,2021-10-11 23:59:59,4051.0,1439,1,881f1a8d91fffff,4.59,,,2.24,86.0
749347,97516.0,51.324809,12.336997,1439,2021-10-12 00:00:00,2021-10-12 23:59:59,4051.0,1096,1,881f1a8d91fffff,8.17,,,4.47,92.0


# Fill NaNS

In [22]:
df_points['rain'] = df_points['rain'].fillna(0)
df_points['snow'] = df_points['snow'].fillna(0)
df_points['end_station_number'] = df_points['end_station_number'].fillna(0)

df_points['wind_speed'] = df_points['wind_speed'].fillna(df_points['wind_speed'].mean())
df_points['humidity'] = df_points['humidity'].fillna(df_points['humidity'].mean())
df_points = df_points.sort_values(by=['time_start']).reset_index(drop=True)

add unix timestamps

In [23]:
df_points['dt_start'] = pd.to_datetime(df_points['time_start']).map(pd.Timestamp.timestamp).astype(int)
df_points['dt_end'] = pd.to_datetime(df_points['time_end']).map(pd.Timestamp.timestamp).astype(int)
df_points['bike_id'] = df_points['bike_id'].astype(int)
df_points['end_station_number'] = df_points['end_station_number'].astype(int)

In [24]:
df_points


Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night,hex_id,temp,rain,snow,wind_speed,humidity,dt_start,dt_end
0,72165,51.351547,12.382658,38,2020-12-31 23:21:34,2020-12-31 23:59:59,0,1439,1,881f1a8cb9fffff,2.72,0.00,0.0,4.02,72.0,1609456894,1609459199
1,75838,51.331022,12.316226,37,2020-12-31 23:22:04,2020-12-31 23:59:59,0,1439,1,881f1a8dd7fffff,2.72,0.00,0.0,4.02,72.0,1609456924,1609459199
2,74466,51.355200,12.370664,36,2020-12-31 23:23:09,2020-12-31 23:59:59,0,49,1,881f1a8c87fffff,2.72,0.00,0.0,4.02,72.0,1609456989,1609459199
3,73806,51.328693,12.346637,34,2020-12-31 23:25:41,2020-12-31 23:59:59,0,852,1,881f1a164dfffff,2.72,0.00,0.0,4.02,72.0,1609457141,1609459199
4,75854,51.331707,12.371241,22,2020-12-31 23:37:01,2020-12-31 23:59:59,0,239,1,881f1a164bfffff,2.72,0.00,0.0,4.02,72.0,1609457821,1609459199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749344,72058,51.317569,12.382730,112,2021-12-30 20:53:13,2021-12-30 22:46:08,0,0,0,881f1a1609fffff,13.19,7.62,0.0,8.94,83.0,1640897593,1640904368
749345,71949,51.334711,12.356763,25,2021-12-30 20:57:15,2021-12-30 21:22:23,0,0,0,881f1a1649fffff,13.19,7.62,0.0,8.94,83.0,1640897835,1640899343
749346,97785,51.367276,12.369792,48,2021-12-30 21:01:26,2021-12-30 21:49:45,0,0,0,881f1a8c8bfffff,13.05,2.03,0.0,8.05,84.0,1640898086,1640900985
749347,41738,51.366942,12.370322,43,2021-12-30 21:48:38,2021-12-30 22:31:56,0,0,0,881f1a8c8bfffff,13.05,2.03,0.0,8.05,84.0,1640900918,1640903516


# Add start time Feature

In [25]:
def add_start_min_feature(row):
    return ((row['dt_start'] % 86400) / 60).__round__(0)

df_points['start_min'] = df_points.apply(add_start_min_feature,axis=1)
df_points['start_min'] = df_points['start_min'].astype(int)

In [26]:
df_points

Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night,hex_id,temp,rain,snow,wind_speed,humidity,dt_start,dt_end,start_min
0,72165,51.351547,12.382658,38,2020-12-31 23:21:34,2020-12-31 23:59:59,0,1439,1,881f1a8cb9fffff,2.72,0.00,0.0,4.02,72.0,1609456894,1609459199,1402
1,75838,51.331022,12.316226,37,2020-12-31 23:22:04,2020-12-31 23:59:59,0,1439,1,881f1a8dd7fffff,2.72,0.00,0.0,4.02,72.0,1609456924,1609459199,1402
2,74466,51.355200,12.370664,36,2020-12-31 23:23:09,2020-12-31 23:59:59,0,49,1,881f1a8c87fffff,2.72,0.00,0.0,4.02,72.0,1609456989,1609459199,1403
3,73806,51.328693,12.346637,34,2020-12-31 23:25:41,2020-12-31 23:59:59,0,852,1,881f1a164dfffff,2.72,0.00,0.0,4.02,72.0,1609457141,1609459199,1406
4,75854,51.331707,12.371241,22,2020-12-31 23:37:01,2020-12-31 23:59:59,0,239,1,881f1a164bfffff,2.72,0.00,0.0,4.02,72.0,1609457821,1609459199,1417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749344,72058,51.317569,12.382730,112,2021-12-30 20:53:13,2021-12-30 22:46:08,0,0,0,881f1a1609fffff,13.19,7.62,0.0,8.94,83.0,1640897593,1640904368,1253
749345,71949,51.334711,12.356763,25,2021-12-30 20:57:15,2021-12-30 21:22:23,0,0,0,881f1a1649fffff,13.19,7.62,0.0,8.94,83.0,1640897835,1640899343,1257
749346,97785,51.367276,12.369792,48,2021-12-30 21:01:26,2021-12-30 21:49:45,0,0,0,881f1a8c8bfffff,13.05,2.03,0.0,8.05,84.0,1640898086,1640900985,1261
749347,41738,51.366942,12.370322,43,2021-12-30 21:48:38,2021-12-30 22:31:56,0,0,0,881f1a8c8bfffff,13.05,2.03,0.0,8.05,84.0,1640900918,1640903516,1309


# Add day Feature

In [27]:
df_points.dtypes

bike_id                        int64
lat                          float64
lng                          float64
idle_time                      int64
time_start            datetime64[ns]
time_end              datetime64[ns]
end_station_number             int64
idle_time_next_day             int64
over_night                     int64
hex_id                        object
temp                         float64
rain                         float64
snow                         float64
wind_speed                   float64
humidity                     float64
dt_start                       int64
dt_end                         int64
start_min                      int64
dtype: object

In [28]:
def add_day_feature(row):
    return row['time_start'].dayofweek

df_points['day'] = df_points.apply(add_day_feature,axis=1)

In [29]:
df_points

Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night,hex_id,temp,rain,snow,wind_speed,humidity,dt_start,dt_end,start_min,day
0,72165,51.351547,12.382658,38,2020-12-31 23:21:34,2020-12-31 23:59:59,0,1439,1,881f1a8cb9fffff,2.72,0.00,0.0,4.02,72.0,1609456894,1609459199,1402,3
1,75838,51.331022,12.316226,37,2020-12-31 23:22:04,2020-12-31 23:59:59,0,1439,1,881f1a8dd7fffff,2.72,0.00,0.0,4.02,72.0,1609456924,1609459199,1402,3
2,74466,51.355200,12.370664,36,2020-12-31 23:23:09,2020-12-31 23:59:59,0,49,1,881f1a8c87fffff,2.72,0.00,0.0,4.02,72.0,1609456989,1609459199,1403,3
3,73806,51.328693,12.346637,34,2020-12-31 23:25:41,2020-12-31 23:59:59,0,852,1,881f1a164dfffff,2.72,0.00,0.0,4.02,72.0,1609457141,1609459199,1406,3
4,75854,51.331707,12.371241,22,2020-12-31 23:37:01,2020-12-31 23:59:59,0,239,1,881f1a164bfffff,2.72,0.00,0.0,4.02,72.0,1609457821,1609459199,1417,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749344,72058,51.317569,12.382730,112,2021-12-30 20:53:13,2021-12-30 22:46:08,0,0,0,881f1a1609fffff,13.19,7.62,0.0,8.94,83.0,1640897593,1640904368,1253,3
749345,71949,51.334711,12.356763,25,2021-12-30 20:57:15,2021-12-30 21:22:23,0,0,0,881f1a1649fffff,13.19,7.62,0.0,8.94,83.0,1640897835,1640899343,1257,3
749346,97785,51.367276,12.369792,48,2021-12-30 21:01:26,2021-12-30 21:49:45,0,0,0,881f1a8c8bfffff,13.05,2.03,0.0,8.05,84.0,1640898086,1640900985,1261,3
749347,41738,51.366942,12.370322,43,2021-12-30 21:48:38,2021-12-30 22:31:56,0,0,0,881f1a8c8bfffff,13.05,2.03,0.0,8.05,84.0,1640900918,1640903516,1309,3


# Add month feature

In [30]:
def add_month_feature(row):
    return row['time_start'].strftime("%m")

df_points['month'] = df_points.apply(add_month_feature,axis=1)

In [31]:
#del df_points['time_start']
#del df_points['time_end']

Encode Hex_id

from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
df_points['hex_enc']= label_encoder.fit_transform(df_points['hex_id'])

Sort DF

In [32]:
df_points

Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night,hex_id,temp,rain,snow,wind_speed,humidity,dt_start,dt_end,start_min,day,month
0,72165,51.351547,12.382658,38,2020-12-31 23:21:34,2020-12-31 23:59:59,0,1439,1,881f1a8cb9fffff,2.72,0.00,0.0,4.02,72.0,1609456894,1609459199,1402,3,12
1,75838,51.331022,12.316226,37,2020-12-31 23:22:04,2020-12-31 23:59:59,0,1439,1,881f1a8dd7fffff,2.72,0.00,0.0,4.02,72.0,1609456924,1609459199,1402,3,12
2,74466,51.355200,12.370664,36,2020-12-31 23:23:09,2020-12-31 23:59:59,0,49,1,881f1a8c87fffff,2.72,0.00,0.0,4.02,72.0,1609456989,1609459199,1403,3,12
3,73806,51.328693,12.346637,34,2020-12-31 23:25:41,2020-12-31 23:59:59,0,852,1,881f1a164dfffff,2.72,0.00,0.0,4.02,72.0,1609457141,1609459199,1406,3,12
4,75854,51.331707,12.371241,22,2020-12-31 23:37:01,2020-12-31 23:59:59,0,239,1,881f1a164bfffff,2.72,0.00,0.0,4.02,72.0,1609457821,1609459199,1417,3,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749344,72058,51.317569,12.382730,112,2021-12-30 20:53:13,2021-12-30 22:46:08,0,0,0,881f1a1609fffff,13.19,7.62,0.0,8.94,83.0,1640897593,1640904368,1253,3,12
749345,71949,51.334711,12.356763,25,2021-12-30 20:57:15,2021-12-30 21:22:23,0,0,0,881f1a1649fffff,13.19,7.62,0.0,8.94,83.0,1640897835,1640899343,1257,3,12
749346,97785,51.367276,12.369792,48,2021-12-30 21:01:26,2021-12-30 21:49:45,0,0,0,881f1a8c8bfffff,13.05,2.03,0.0,8.05,84.0,1640898086,1640900985,1261,3,12
749347,41738,51.366942,12.370322,43,2021-12-30 21:48:38,2021-12-30 22:31:56,0,0,0,881f1a8c8bfffff,13.05,2.03,0.0,8.05,84.0,1640900918,1640903516,1309,3,12


df_points = df_points.reindex(columns=['bike_id', 'lat', 'lng', 'dt_start', 'dt_end', 'hex_id', 'temp', 'rain', 'snow', 'wind_speed', 'humidity', 'month', 'day', 'start_min', 'over_night', 'idle_time_next_day', 'idle_time'])


df_points.sort_values(by=['dt_start'])

In [33]:
df_points

Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night,hex_id,temp,rain,snow,wind_speed,humidity,dt_start,dt_end,start_min,day,month
0,72165,51.351547,12.382658,38,2020-12-31 23:21:34,2020-12-31 23:59:59,0,1439,1,881f1a8cb9fffff,2.72,0.00,0.0,4.02,72.0,1609456894,1609459199,1402,3,12
1,75838,51.331022,12.316226,37,2020-12-31 23:22:04,2020-12-31 23:59:59,0,1439,1,881f1a8dd7fffff,2.72,0.00,0.0,4.02,72.0,1609456924,1609459199,1402,3,12
2,74466,51.355200,12.370664,36,2020-12-31 23:23:09,2020-12-31 23:59:59,0,49,1,881f1a8c87fffff,2.72,0.00,0.0,4.02,72.0,1609456989,1609459199,1403,3,12
3,73806,51.328693,12.346637,34,2020-12-31 23:25:41,2020-12-31 23:59:59,0,852,1,881f1a164dfffff,2.72,0.00,0.0,4.02,72.0,1609457141,1609459199,1406,3,12
4,75854,51.331707,12.371241,22,2020-12-31 23:37:01,2020-12-31 23:59:59,0,239,1,881f1a164bfffff,2.72,0.00,0.0,4.02,72.0,1609457821,1609459199,1417,3,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749344,72058,51.317569,12.382730,112,2021-12-30 20:53:13,2021-12-30 22:46:08,0,0,0,881f1a1609fffff,13.19,7.62,0.0,8.94,83.0,1640897593,1640904368,1253,3,12
749345,71949,51.334711,12.356763,25,2021-12-30 20:57:15,2021-12-30 21:22:23,0,0,0,881f1a1649fffff,13.19,7.62,0.0,8.94,83.0,1640897835,1640899343,1257,3,12
749346,97785,51.367276,12.369792,48,2021-12-30 21:01:26,2021-12-30 21:49:45,0,0,0,881f1a8c8bfffff,13.05,2.03,0.0,8.05,84.0,1640898086,1640900985,1261,3,12
749347,41738,51.366942,12.370322,43,2021-12-30 21:48:38,2021-12-30 22:31:56,0,0,0,881f1a8c8bfffff,13.05,2.03,0.0,8.05,84.0,1640900918,1640903516,1309,3,12


# Save

df_points.to_csv('../data/df_points/df_points_2018.csv', index=False)

import pandas as pd
df = pd.read_csv('../data/df_points/final_df_points_18_21.csv')
df = df.sort_values(by=['dt_start'])

# Add year Feature

In [34]:
def add_year_feature(row):
    return row['time_start'].year

df_points['year'] = df_points.apply(add_year_feature,axis=1)

# Add In_zone Feature

In [35]:
import os
import json
from shapely.geometry import shape, Point

flexzones_0 = []
flexzones_1 = []

def save_flexzones():
    # Flexzone 0 Euro
    directory_name_0 = f'../flexzones/0/'
    for file_name in os.listdir(directory_name_0):
        path = os.path.join(directory_name_0, file_name)
        if os.path.isfile(path):
           with open(path) as f:
                js = json.load(f)
                for feature in js['features']:
                    flexzones_0.append(feature)
    # Flexzone 1 Euro
    directory_name_1 = f'../flexzones/1/'
    for file_name in os.listdir(directory_name_1):
        path = os.path.join(directory_name_1, file_name)
        if os.path.isfile(path):
           with open(path) as f:
                js = json.load(f)
                for feature in js['features']:
                    flexzones_1.append(feature)


def point_in_polygons(row):
    y = row['lat']  # y = 51.331305
    x = row['lng']  # x = 12.344334

    # switch x and y-axis bc. scapely woks on x plane
    point = Point(x,y)

    for feature0 in flexzones_0:
        polygon = shape(feature0['geometry'])
        if polygon.contains(point):
            return feature0['properties']['name']

    for feature1 in flexzones_1:
        polygon = shape(feature1['geometry'])
        if polygon.contains(point):
            return feature1['properties']['name']
    else: return 0

In [36]:
save_flexzones()

In [37]:
df_points['zone_name'] = df_points.apply(point_in_polygons,axis=1)

In [38]:
df_points

Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night,hex_id,...,snow,wind_speed,humidity,dt_start,dt_end,start_min,day,month,year,zone_name
0,72165,51.351547,12.382658,38,2020-12-31 23:21:34,2020-12-31 23:59:59,0,1439,1,881f1a8cb9fffff,...,0.0,4.02,72.0,1609456894,1609459199,1402,3,12,2020,0
1,75838,51.331022,12.316226,37,2020-12-31 23:22:04,2020-12-31 23:59:59,0,1439,1,881f1a8dd7fffff,...,0.0,4.02,72.0,1609456924,1609459199,1402,3,12,2020,Pinke Zone Leipzig West 1€
2,74466,51.355200,12.370664,36,2020-12-31 23:23:09,2020-12-31 23:59:59,0,49,1,881f1a8c87fffff,...,0.0,4.02,72.0,1609456989,1609459199,1403,3,12,2020,Pinke Zone Leipzig Ost 1€
3,73806,51.328693,12.346637,34,2020-12-31 23:25:41,2020-12-31 23:59:59,0,852,1,881f1a164dfffff,...,0.0,4.02,72.0,1609457141,1609459199,1406,3,12,2020,0
4,75854,51.331707,12.371241,22,2020-12-31 23:37:01,2020-12-31 23:59:59,0,239,1,881f1a164bfffff,...,0.0,4.02,72.0,1609457821,1609459199,1417,3,12,2020,Blaue Zone Leipzig Ost 0€
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749344,72058,51.317569,12.382730,112,2021-12-30 20:53:13,2021-12-30 22:46:08,0,0,0,881f1a1609fffff,...,0.0,8.94,83.0,1640897593,1640904368,1253,3,12,2021,Blaue Zone Leipzig Ost 0€
749345,71949,51.334711,12.356763,25,2021-12-30 20:57:15,2021-12-30 21:22:23,0,0,0,881f1a1649fffff,...,0.0,8.94,83.0,1640897835,1640899343,1257,3,12,2021,Pinke Zone Leipzig Ost 1€
749346,97785,51.367276,12.369792,48,2021-12-30 21:01:26,2021-12-30 21:49:45,0,0,0,881f1a8c8bfffff,...,0.0,8.05,84.0,1640898086,1640900985,1261,3,12,2021,Blaue Zone Leipzig Ost 0€
749347,41738,51.366942,12.370322,43,2021-12-30 21:48:38,2021-12-30 22:31:56,0,0,0,881f1a8c8bfffff,...,0.0,8.05,84.0,1640900918,1640903516,1309,3,12,2021,Pinke Zone Leipzig Ost 1€


In [39]:
def in_zone(row):
    if row['zone_name'] != 0:
        return 1
    else: return 0

df_points['in_zone'] = df_points.apply(in_zone,axis=1)

In [40]:
df_points

Unnamed: 0,bike_id,lat,lng,idle_time,time_start,time_end,end_station_number,idle_time_next_day,over_night,hex_id,...,wind_speed,humidity,dt_start,dt_end,start_min,day,month,year,zone_name,in_zone
0,72165,51.351547,12.382658,38,2020-12-31 23:21:34,2020-12-31 23:59:59,0,1439,1,881f1a8cb9fffff,...,4.02,72.0,1609456894,1609459199,1402,3,12,2020,0,0
1,75838,51.331022,12.316226,37,2020-12-31 23:22:04,2020-12-31 23:59:59,0,1439,1,881f1a8dd7fffff,...,4.02,72.0,1609456924,1609459199,1402,3,12,2020,Pinke Zone Leipzig West 1€,1
2,74466,51.355200,12.370664,36,2020-12-31 23:23:09,2020-12-31 23:59:59,0,49,1,881f1a8c87fffff,...,4.02,72.0,1609456989,1609459199,1403,3,12,2020,Pinke Zone Leipzig Ost 1€,1
3,73806,51.328693,12.346637,34,2020-12-31 23:25:41,2020-12-31 23:59:59,0,852,1,881f1a164dfffff,...,4.02,72.0,1609457141,1609459199,1406,3,12,2020,0,0
4,75854,51.331707,12.371241,22,2020-12-31 23:37:01,2020-12-31 23:59:59,0,239,1,881f1a164bfffff,...,4.02,72.0,1609457821,1609459199,1417,3,12,2020,Blaue Zone Leipzig Ost 0€,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749344,72058,51.317569,12.382730,112,2021-12-30 20:53:13,2021-12-30 22:46:08,0,0,0,881f1a1609fffff,...,8.94,83.0,1640897593,1640904368,1253,3,12,2021,Blaue Zone Leipzig Ost 0€,1
749345,71949,51.334711,12.356763,25,2021-12-30 20:57:15,2021-12-30 21:22:23,0,0,0,881f1a1649fffff,...,8.94,83.0,1640897835,1640899343,1257,3,12,2021,Pinke Zone Leipzig Ost 1€,1
749346,97785,51.367276,12.369792,48,2021-12-30 21:01:26,2021-12-30 21:49:45,0,0,0,881f1a8c8bfffff,...,8.05,84.0,1640898086,1640900985,1261,3,12,2021,Blaue Zone Leipzig Ost 0€,1
749347,41738,51.366942,12.370322,43,2021-12-30 21:48:38,2021-12-30 22:31:56,0,0,0,881f1a8c8bfffff,...,8.05,84.0,1640900918,1640903516,1309,3,12,2021,Pinke Zone Leipzig Ost 1€,1


In [41]:
df_points = df_points.reindex(columns=['bike_id', 'lat', 'lng', 'dt_start', 'dt_end', 'hex_id', 'in_zone', 'zone_name','end_station_number','temp', 'rain', 'snow', 'wind_speed', 'humidity', 'year','month', 'day', 'start_min', 'over_night', 'idle_time_next_day', 'idle_time'])

df_points = df_points.sort_values(by=['dt_start'])
df_points

Unnamed: 0,bike_id,lat,lng,dt_start,dt_end,hex_id,in_zone,zone_name,end_station_number,temp,...,snow,wind_speed,humidity,year,month,day,start_min,over_night,idle_time_next_day,idle_time
0,72165,51.351547,12.382658,1609456894,1609459199,881f1a8cb9fffff,0,0,0,2.72,...,0.0,4.02,72.0,2020,12,3,1402,1,1439,38
1,75838,51.331022,12.316226,1609456924,1609459199,881f1a8dd7fffff,1,Pinke Zone Leipzig West 1€,0,2.72,...,0.0,4.02,72.0,2020,12,3,1402,1,1439,37
2,74466,51.355200,12.370664,1609456989,1609459199,881f1a8c87fffff,1,Pinke Zone Leipzig Ost 1€,0,2.72,...,0.0,4.02,72.0,2020,12,3,1403,1,49,36
3,73806,51.328693,12.346637,1609457141,1609459199,881f1a164dfffff,0,0,0,2.72,...,0.0,4.02,72.0,2020,12,3,1406,1,852,34
4,75854,51.331707,12.371241,1609457821,1609459199,881f1a164bfffff,1,Blaue Zone Leipzig Ost 0€,0,2.72,...,0.0,4.02,72.0,2020,12,3,1417,1,239,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749344,72058,51.317569,12.382730,1640897593,1640904368,881f1a1609fffff,1,Blaue Zone Leipzig Ost 0€,0,13.19,...,0.0,8.94,83.0,2021,12,3,1253,0,0,112
749345,71949,51.334711,12.356763,1640897835,1640899343,881f1a1649fffff,1,Pinke Zone Leipzig Ost 1€,0,13.19,...,0.0,8.94,83.0,2021,12,3,1257,0,0,25
749346,97785,51.367276,12.369792,1640898086,1640900985,881f1a8c8bfffff,1,Blaue Zone Leipzig Ost 0€,0,13.05,...,0.0,8.05,84.0,2021,12,3,1261,0,0,48
749347,41738,51.366942,12.370322,1640900918,1640903516,881f1a8c8bfffff,1,Pinke Zone Leipzig Ost 1€,0,13.05,...,0.0,8.05,84.0,2021,12,3,1309,0,0,43


# SAVE

In [42]:
dur = time.time() - ts1
print(dur)
ty_res = time.gmtime(dur)
res = time.strftime("%H:%M:%S",ty_res)
print(res)

21113.20941901207
05:51:53


In [43]:
df_points.to_csv('../data/df_points/df_points_2021.csv', index=False)