# df_points with UNIX Timestamp #
## Creates df_points.csv with all points and idletimes #
## Checks the valid geo location
## deletes the invalid points: 

2018 - invalides:   41743 from 139171 that is 29.994036113845556 %

2019 - invalides:   52523 from 550887 that is 9.534260202183024 % - 05:40:19

2020 - invalides:   18086 from 662386 that is 2.7304321045432722 %- 05:40:06

2021 - invalides:   21382 from 552253 that is 3.8717761605640897 % - 05:51:53

### ca 1h for 1 year

In [11]:
def import_data(source):
    return pd.read_csv(source)

In [12]:
def dic_out_of_df(df01):
    import pandas as pd
    from tqdm import tqdm
    # Dict out of DataFrames of Trips per Bike

    # init DFs for every bike
    myDFs = {}

    for ind in tqdm(df01.index):
        if df01['bike_id'][ind] not in myDFs:
            myDFs[df01['bike_id'][ind]] = pd.DataFrame(
                columns=['bike_id',
                         'start_time',
                         'end_time',
                         'start_lat',
                         'start_lng',
                         'end_lat',
                         'end_lng',
                         'end_station_number'])
    return myDFs

In [13]:
def add_rentals_to_df(df01,myDFs):
    from tqdm import tqdm
    # Add all rentals to their bike_id DF
    for ind in tqdm(df01.index):
        if df01['bike_id'][ind] in myDFs:
            myDFs[df01['bike_id'][ind]].loc[df01.index[ind]] = df01.iloc[ind]
    return myDFs

In [14]:
def sort_dfs(myDFs):
    # sort every DF
    for df in myDFs.values():
        df[['bike_id', 'start_time','end_time']] = df[['bike_id', 'start_time','end_time']].astype(int)
        df[['start_lat','start_lng', 'end_lat', 'end_lng']] = df[['start_lat','start_lng', 'end_lat', 'end_lng']].astype(float)
        df.sort_values(by=['start_time'], inplace=True)
        df.reset_index(drop=True, inplace=True)
    return myDFs

In [15]:
# returns distance in meter

from math import radians, cos, sin, asin, sqrt
def haversine_distance(lat1, lon1, lat2, lon2):
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    # Radius of earth in kilometers is 6371
    km = 6371 * c
    return km * 1000

In [16]:
def create_df_points(myDFs, radius):
    # DF of all points with idle_time
    in_val = 0
    val = 0
    errors = []
    
    df_points = pd.DataFrame(columns=['bike_id', 'lat', 'lng', 'idle_time', 'time_start', 'time_end', 'station_number'])
    
    for df in tqdm(myDFs.values()):
        for ind in df.index[1:]:
            # comp idle_time in SECONDS
            _idle_time = (df['start_time'][ind] - df['end_time'][ind - 1]).astype(int)

            # check if lng lat == 0.0  ->  false data
            if df['end_lat'][ind - 1] == 0.0 or df['end_lng'][ind - 1] == 0:
                in_val += 1
                continue

            # compute distance of start and end point and check if it is > than tolerance
            distance = haversine_distance(df['end_lat'][ind - 1], df['end_lng'][ind - 1], df['start_lat'][ind], df['start_lng'][ind])
            if distance > radius:
                errors.append(distance)
                in_val +=1
                continue
            val +=1
            # add new df_point entry to df
            dict = {'bike_id': df['bike_id'][ind], 'lng': df['start_lng'][ind], 'lat': df['start_lat'][ind],
                    'idle_time': _idle_time, 'time_start': df['end_time'][ind - 1], 'time_end': df['start_time'][ind], 'station_number': df['end_station_number'][ind-1]}
            df_points = df_points.append(dict, ignore_index=True)

    print(f"invalids: {in_val} from {val} that is {in_val/val * 100} %")
    print(errors)
    return df_points

# Main 
## Create map

In [18]:
from keplergl import KeplerGl
import pandas as pd
pd.options.mode.chained_assignment = None
import time

ts1 = time.time()

source = '../../data/raw_lendings/all_raw_lendings.csv'

df = import_data(source)
df.tail()

Unnamed: 0,bike_id,start_time,end_time,start_lat,start_lng,end_lat,end_lng,end_station_number
2043433,71933,1640904867,1640905631,51.320938,12.373708,51.331796,12.405827,
2043434,72257,1640904911,1640905492,51.339089,12.326971,51.330502,12.315658,
2043435,93600,1640904922,1640906099,51.33708,12.332663,51.342862,12.395077,
2043436,100236,1640905127,1640906811,51.332933,12.404402,51.305751,12.373562,
2043437,72028,1640905163,1640906794,51.332844,12.404319,51.30572,12.373641,


In [19]:
myDfs = dic_out_of_df(df)
myDfs = add_rentals_to_df(df, myDfs)
myDfs = sort_dfs(myDfs)

100%|██████████| 2043438/2043438 [00:12<00:00, 163807.49it/s]
100%|██████████| 2043438/2043438 [46:13<00:00, 736.68it/s] 


# Create DF_points
## without invalid points

In [None]:
from math import cos, sqrt
from tqdm import tqdm

# Toleranz von 20 m
radius = 20
df_points = create_df_points(myDfs,radius)
df_points.tail()

# Split Points at midnight

## save feature idle_time_next_day

In [22]:
df_points['idle_time'] = pd.to_timedelta(df_points['idle_time'],unit='s')
df_points['time_start'] = pd.to_datetime(df_points['time_start'],unit='s')
df_points['time_end'] = pd.to_datetime(df_points['time_end'],unit='s')

In [None]:
from tqdm import tqdm
from datetime import timedelta
import numpy as np

changed = 1
iterations = 0

df_points['idle_time_next_day'] = np.nan
df_points['over_night'] = 0

while changed:
    iterations += 1
    changed = 0
    for ind in tqdm(df_points.index):
        ts = df_points['time_start'][ind]
        te = df_points['time_end'][ind]
        t0 = ts.replace(hour=23, minute=59, second=59)
        
        # is timestood greater than same date 23:59:59
        #   -> over midnight
        if ts + df_points['idle_time'][ind] > t0:
            changed = 1
            
            underhang = t0 - ts
            overhang = te - t0
            
            # fix current day
            df_points['time_end'][ind] = ts.replace(hour=23, minute=59, second=59)
            df_points['idle_time'][ind] = underhang
            df_points['over_night'][ind] = 1
            df_points['idle_time_next_day'][ind] = overhang
            
            # add new row for the next day
            time_start = ts.replace(hour=0, minute=0, second=0)
            time_start += timedelta(days=1)
            flag = 0
            #if time_start + overhang > time_start.replace(hour=23, minute=59, second=59):
            #    flag = 1
            
            dict = {'bike_id': df_points['bike_id'][ind], 'lng': df_points['lng'][ind], 'lat': df_points['lat'][ind],
                    'idle_time': overhang, 'time_start': time_start, 'time_end': te,'over_night': flag, 'idle_time_next_day': np.nan, 'station_number': df_points['station_number'][ind]}
            df_points = df_points.append(dict, ignore_index=True)

print(iterations)

In [None]:
df_points.head()

### idle time in min

In [None]:
def idle_time_to_min(row):
    return row['idle_time'].seconds / 60

df_points['idle_time'] = df_points.apply(idle_time_to_min, axis=1)

### add idle_time_next_day feature

In [None]:
df_points['idle_time_next_day'] = pd.to_timedelta(df_points['idle_time_next_day'])

def idle_time_next_day_to_min(row):
    if row['over_night'] == 1:
        t = row['idle_time_next_day'].total_seconds() / 60
        if t < 1439:
            return t
        else: return 1439
    else:
        return 0

df_points['idle_time_next_day'] = df_points.apply(idle_time_next_day_to_min, axis=1)
df_points.head()

### change type to int

In [None]:
df_points['idle_time'] = df_points['idle_time'].astype(int)
df_points['idle_time_next_day'] = df_points['idle_time_next_day'].astype(int)
df_points

In [None]:
# Check max idle time
df_points['idle_time'].max()

### Add H3 Index

In [None]:
import h3

df_points['hex_id'] = 0

for ind in tqdm(df_points.index):
    df_points['hex_id'][ind] = h3.geo_to_h3(df_points['lat'][ind], df_points['lng'][ind], 8)
df_points.head()

### Add weather Data

In [None]:
dfw = pd.read_csv('../data/weather_data.csv')
dfw['datetime'] = pd.to_datetime(dfw['dt'], unit='s')
dfw = dfw.set_index(['datetime'])
dfw.head()

df_points['temp'] = np.nan
df_points['rain'] = np.nan
df_points['snow'] = np.nan
df_points['wind_speed'] = np.nan
df_points['humidity'] = np.nan

from tqdm import tqdm
pd.options.mode.chained_assignment = None

for ind in tqdm(df_points.index):
    wint = df_points['time_start'][ind].replace(minute=0,second=0)
    df_points['temp'][ind] = dfw['temp'][wint].copy()
    df_points['rain'][ind] = dfw['rain_1h'][wint].copy()
    df_points['snow'][ind] = dfw['snow_1h'][wint].copy()
    df_points['wind_speed'][ind] = dfw['wind_speed'][wint].copy()
    df_points['humidity'][ind] = dfw['humidity'][wint].copy()

df_points

### Fill NaNS weather

In [None]:
df_points['rain'] = df_points['rain'].fillna(0)
df_points['snow'] = df_points['snow'].fillna(0)
df_points['station_number'] = df_points['station_number'].fillna(0)

df_points['wind_speed'] = df_points['wind_speed'].fillna(df_points['wind_speed'].mean())
df_points['humidity'] = df_points['humidity'].fillna(df_points['humidity'].mean())
df_points = df_points.sort_values(by=['time_start']).reset_index(drop=True)

### add unix timestamps

In [None]:
df_points['dt_start'] = pd.to_datetime(df_points['time_start']).map(pd.Timestamp.timestamp).astype(int)
df_points['dt_end'] = pd.to_datetime(df_points['time_end']).map(pd.Timestamp.timestamp).astype(int)
df_points['bike_id'] = df_points['bike_id'].astype(int)
df_points['station_number'] = df_points['station_number'].astype(int)

In [None]:
df_points.head()

### Add start time Feature

In [None]:
def add_start_min_feature(row):
    return ((row['dt_start'] % 86400) / 60).__round__(0)

df_points['start_min'] = df_points.apply(add_start_min_feature,axis=1)
df_points['start_min'] = df_points['start_min'].astype(int)
df_points.head()

### Add day Feature

In [None]:
def add_day_feature(row):
    return row['time_start'].dayofweek

df_points['day'] = df_points.apply(add_day_feature,axis=1)
df_points.head()

### Add month feature

In [None]:
def add_month_feature(row):
    return row['time_start'].strftime("%m")

df_points['month'] = df_points.apply(add_month_feature,axis=1)

### Encode Hex_id

In [None]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
df_points['hex_enc']= label_encoder.fit_transform(df_points['hex_id'])

### Add year Feature

In [None]:
def add_year_feature(row):
    return row['time_start'].year

df_points['year'] = df_points.apply(add_year_feature,axis=1)

### Add Zone Name Feature

In [None]:
import os
import json
from shapely.geometry import shape, Point

flexzones_0 = []
flexzones_1 = []

def save_flexzones():
    # Flexzone 0 Euro
    directory_name_0 = f'../flexzones/0/'
    for file_name in os.listdir(directory_name_0):
        path = os.path.join(directory_name_0, file_name)
        if os.path.isfile(path):
           with open(path) as f:
                js = json.load(f)
                for feature in js['features']:
                    flexzones_0.append(feature)
    # Flexzone 1 Euro
    directory_name_1 = f'../flexzones/1/'
    for file_name in os.listdir(directory_name_1):
        path = os.path.join(directory_name_1, file_name)
        if os.path.isfile(path):
           with open(path) as f:
                js = json.load(f)
                for feature in js['features']:
                    flexzones_1.append(feature)


def point_in_polygons(row):
    y = row['lat']  # y = 51.331305
    x = row['lng']  # x = 12.344334

    # switch x and y-axis bc. scapely woks on x plane
    point = Point(x,y)

    for feature0 in flexzones_0:
        polygon = shape(feature0['geometry'])
        if polygon.contains(point):
            return feature0['properties']['name']

    for feature1 in flexzones_1:
        polygon = shape(feature1['geometry'])
        if polygon.contains(point):
            return feature1['properties']['name']
    else: return 0

In [None]:
save_flexzones()
df_points['zone_name'] = df_points.apply(point_in_polygons,axis=1)

In [None]:
# add zone name_enc
zone_encoder = preprocessing.LabelEncoder()
df_points['zone_name_enc'] = zone_encoder.fit_transform(df_points['zone_name'])

In [None]:
df_points.head()

### Add in_zone Feature

In [None]:
def in_zone(row):
    if row['zone_name'] != 0:
        return 1
    else: return 0

df_points['in_zone'] = df_points.apply(in_zone,axis=1)

### Add on Station Feature

In [None]:
# add on_station
def on_station(row):
    if row['station_number'] != 0:
        return 1
    else:
        return 0

df_points['on_station'] = df_points.apply(on_station, axis=1)

In [None]:
df_points.head()

In [None]:
df_points = df_points.reindex(columns=['bike_id', 'lat', 'lng', 'dt_start', 'dt_end', 'hex_id','hex_enc', 'in_zone', 'zone_name','zone_name_enc','station_number','on_station','temp', 'rain', 'snow', 'wind_speed', 'humidity', 'year','month', 'day', 'start_min', 'over_night', 'idle_time_next_day', 'idle_time'])

df_points = df_points.sort_values(by=['dt_start'])
df_points.head()

# SAVE

In [None]:
dur = time.time() - ts1
print(dur)
ty_res = time.gmtime(dur)
res = time.strftime("%H:%M:%S",ty_res)
print(res)

In [None]:
df_points.to_csv('../../data/df_points/df_points_2020.csv', index=False)