In [1]:
import pandas as pd
import os

# Import provided data

In [2]:
raw_data = pd.read_csv('../data/raw/bremen.csv')
raw_data.head()


Unnamed: 0.1,Unnamed: 0,p_spot,p_place_type,datetime,b_number,trip,p_uid,p_bikes,p_lat,b_bike_type,p_name,p_number,p_lng,p_bike
0,0,True,0,2019-01-20 02:06:00,11281,first,4774295,5,50.808852,15,Biegenstraße/Cineplex,5155.0,8.773134,False
1,1,True,0,2019-01-20 14:16:00,11281,last,4774295,4,50.808852,15,Biegenstraße/Cineplex,5155.0,8.773134,False
2,2,True,0,2019-01-20 00:00:00,11169,first,4774543,5,50.795224,15,Südbahnhof,5173.0,8.763266,False
3,3,True,0,2019-01-20 01:55:00,11169,start,4774543,5,50.795224,15,Südbahnhof,5173.0,8.763266,False
4,4,True,0,2019-01-20 02:06:00,11169,end,4774368,4,50.804522,15,Frankfurter Straße/Psychologie,5159.0,8.770358,False


In [3]:
# Discard the helper column 'Unnamed: 0' (it appears to be a saved row index, see head() above).
raw_data = raw_data.drop(columns=['Unnamed: 0'])

## Check location data

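The data set is not limited to Bremen: the coordinate ranges below extend far beyond the city and even contain invalid values (e.g. latitudes outside the range -90 to 90).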

In [4]:
# Print the smallest and largest value, first for latitude, then for longitude.
for column in ('p_lat', 'p_lng'):
    print(raw_data[column].min())
    print(raw_data[column].max())

-504.153076
8084778.533
-92.561901044101
20.973033333333


In [5]:
# Remove every row that has at least one missing value.
raw_data = raw_data.dropna()

## Check sum of null values

In [6]:
raw_data.isna().sum()

p_spot          0
p_place_type    0
datetime        0
b_number        0
trip            0
p_uid           0
p_bikes         0
p_lat           0
b_bike_type     0
p_name          0
p_number        0
p_lng           0
p_bike          0
dtype: int64

# Get geodata of Bremen

https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/bremen.geojson

In [7]:
import folium

In [8]:
# Base map centred on Bremen.
map_bremen = folium.Map(location=[53.122962,8.7515937], zoom_start=11)

# Overlay the district outlines. The GeoJSON is addressed relative to the notebook,
# like the raw CSV ('../data/raw/bremen.csv'), instead of through a user-specific
# absolute path that exists on one machine only.
# NOTE(review): assumes the notebook sits one level below the project root - confirm.
folium.Choropleth(geo_data='../data/bremen.geojson').add_to(map_bremen)

# Last expression of the cell: render the map.
map_bremen

In [None]:
#map_bremen.save('map_bremen.html')

https://automating-gis-processes.github.io/CSC18/lessons/L4/point-in-polygon.html

In [None]:
from shapely.geometry import Point, Polygon

In [None]:
from shapely.ops import cascaded_union

In [None]:
import json

## Save geodata in dictionary

In [None]:
import warnings

# Mapping: district name -> shapely Polygon of that district.
district_polygons = {}

# Read the GeoJSON relative to the notebook (like '../data/raw/bremen.csv') rather than
# from a user-specific absolute path, and decode it explicitly as UTF-8 (the encoding
# GeoJSON is specified in) so the result does not depend on the platform default.
# NOTE(review): assumes the notebook sits one level below the project root - confirm.
with open('../data/bremen.geojson', encoding='utf-8') as f:
    data = json.load(f)

for feature in data['features']:

    district_name = feature['properties']['name']

    # The geometries are indexed as MultiPolygons: a list of polygons, each of which
    # is a list of rings (outer boundary first, then any holes).
    parts = feature['geometry']['coordinates']

    # Only the first polygon is kept so that every value stays a plain Polygon;
    # make that visible instead of dropping further parts silently.
    if len(parts) > 1:
        warnings.warn(f"{district_name}: only the first of {len(parts)} polygons is used")

    # Pass the holes along as well; previously only the outer ring was used, so
    # points lying inside a hole were still reported as being within the district.
    outer_ring, *inner_rings = parts[0]
    district_polygons[district_name] = Polygon(outer_ring, inner_rings)

In [None]:
print(len(district_polygons))
district_polygons

## Function: get the district of a point, or False if it is not in Bremen (not necessary at this point)

In [None]:
def getDistrict(latitude, longitude, polygons):
    """Look up which district polygon contains a coordinate.

    Parameters
    ----------
    latitude, longitude
        Coordinate to test. The point is built as Point(longitude, latitude),
        i.e. x = longitude and y = latitude.
    polygons
        Mapping of district name -> geometry that is passed to Point.within.

    Returns
    -------
    The name of the first entry (in mapping order) whose geometry contains the
    point, or False when none does.
    """
    location = Point(longitude, latitude)

    # Lazily walk the districts and stop at the first one containing the point.
    containing = (district for district, outline in polygons.items()
                  if location.within(outline))

    return next(containing, False)

In [None]:
getDistrict(53.043962,8.9515937,district_polygons)

### Not necessary at all, because getDistrict also determines whether a point is inside Bremen's boundaries

In [None]:
def pointInBremen2(latitude, longitude, polygons):
    """Tell whether a coordinate lies inside the union of all given polygons.

    Parameters
    ----------
    latitude, longitude
        Coordinate to test. The point is built as Point(longitude, latitude).
    polygons
        Mapping whose values are the geometries to merge; the keys are not used.

    Returns
    -------
    True if the point is within the merged geometry, otherwise False.
    """
    location = Point(longitude, latitude)

    # Merge all outlines into a single geometry (recomputed on every call).
    city_area = cascaded_union(polygons.values())

    return True if location.within(city_area) else False

In [None]:
pointInBremen2(53.021037,8.491593,district_polygons)

## Get boundaries (min & max values for latitude & longitude) of Bremen

In [None]:
cascaded_union(district_polygons.values()).bounds

## Filter the raw data so that only data points inside Bremen's bounding box remain

In [13]:
# Rectangular bounding-box filter (strict inequalities on both axes).
# NOTE(review): the limits presumably come from the .bounds cell above - confirm.
inside_lat = (raw_data['p_lat'] > 53.011037) & (raw_data['p_lat'] < 53.228967)
inside_lng = (raw_data['p_lng'] > 8.481593) & (raw_data['p_lng'] < 8.990582)
bremen = raw_data[inside_lat & inside_lng]

print(len(bremen))

1308990


In [14]:
bremen

Unnamed: 0,p_spot,p_place_type,datetime,b_number,trip,p_uid,p_bikes,p_lat,b_bike_type,p_name,p_number,p_lng,p_bike
2241283,False,12,2019-01-20 00:00:00,20668,first,12097754,1,53.081698,71,BIKE 20668,0.0,8.812411,True
2241284,False,12,2019-01-20 01:33:00,20668,last,12097754,1,53.081767,71,BIKE 20668,0.0,8.812408,True
2241285,False,12,2019-01-20 00:00:00,20649,first,12096803,1,53.081533,71,BIKE 20649,0.0,8.790704,True
2241286,True,0,2019-01-20 23:59:00,20649,last,8322444,1,53.082464,71,WeserTower,2971.0,8.789556,False
2241287,False,12,2019-01-20 00:00:00,20964,first,12007715,1,53.100197,71,BIKE 20964,0.0,8.817565,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3585586,False,12,2019-12-31 09:17:00,20905,start,26494106,1,53.088858,71,BIKE 20905,0.0,8.810082,True
3585587,False,12,2019-12-31 09:24:00,20905,end,26501789,1,53.080680,71,BIKE 20905,0.0,8.815383,True
3585588,False,12,2019-12-31 13:55:00,20905,start,26507153,1,53.080884,71,BIKE 20905,0.0,8.815197,True
3585589,False,12,2019-12-31 14:01:00,20905,end,26507979,1,53.082618,71,BIKE 20905,0.0,8.805761,True


In [15]:
# Renumber the rows 0..n-1 and throw the old index labels away.
bremen = bremen.reset_index(drop=True)

# Add district to each row/booking

from tqdm import tqdm

# Label marking rows whose district still has to be looked up.
NOT_DETERMINED = 'Nicht bestimmt'

# The frame built from the raw CSV has no 'district' column, so create it on a
# fresh run instead of failing with a KeyError on bremen['district'].
if 'district' not in bremen.columns:
    bremen['district'] = NOT_DETERMINED

# Index labels of all rows that still need a district.
pending = bremen.index[bremen['district'] == NOT_DETERMINED]

# Rows can share the exact same coordinates (e.g. station positions), so each
# distinct position is resolved only once.
district_by_position = {}

for processed, index in enumerate(tqdm(pending), start=1):
    position = (bremen.at[index, 'p_lat'], bremen.at[index, 'p_lng'])
    if position not in district_by_position:
        district_by_position[position] = getDistrict(position[0], position[1], district_polygons)

    # .at writes into the frame itself by label; the former chained
    # bremen['district'].iloc[index] = ... assigned through an intermediate Series
    # and used the label as a position.
    bremen.at[index, 'district'] = district_by_position[position]

    # Periodic checkpoint, written in the same layout (no index column) as the final file.
    if processed % 100000 == 0:
        bremen.to_csv('bremen_data.csv', index=False)

bremen.to_csv('bremen_data.csv', index=False)

## Import data set with district

In [9]:
# Load the data set that already contains the district column.
# NOTE(review): the saved run failed here with FileNotFoundError, and the district loop
# earlier in this notebook writes 'bremen_data.csv', not 'preprocessed_data.csv' -
# confirm which file (and which directory) is intended.
bremen = pd.read_csv('preprocessed_data.csv')

FileNotFoundError: [Errno 2] File preprocessed_data.csv does not exist: 'preprocessed_data.csv'

In [None]:
bremen.sample(2)

In [None]:
len(bremen)

# Reorder the data frame columns

In [None]:
bremen.keys()

In [16]:
# Target column layout; selecting with this list reorders the frame and raises a
# KeyError if any listed column (e.g. 'district') is missing.
column_order = [
    'datetime', 'b_number', 'p_spot', 'p_place_type',
    'trip', 'p_uid', 'p_bikes', 'b_bike_type', 'p_name',
    'p_number', 'p_bike', 'p_lat', 'p_lng', 'district',
]
bremen = bremen[column_order]

#bremen.to_csv('bremen_data.csv', index=False)

KeyError: "['district'] not in index"

# Exploration

## Total number of bookings

In [None]:
len(bremen)

## Total number of null values

In [17]:
bremen.isna().sum()

p_spot          0
p_place_type    0
datetime        0
b_number        0
trip            0
p_uid           0
p_bikes         0
p_lat           0
b_bike_type     0
p_name          0
p_number        0
p_lng           0
p_bike          0
dtype: int64

## Total number of bikes - 443

In [18]:
# Distinct bike numbers in order of first appearance: print their count and the first ten.
bike_numbers = bremen.b_number.unique()
print(len(bike_numbers))
print(bike_numbers[:10])

443
[20668 20649 20964 20866 20799 20676 20855 20707 20815 20891]


### Check the trips of one specific bike - there seem to be many duplicates!

In [19]:
bremen[bremen.b_number == 20668].sort_values('datetime')[:20]

Unnamed: 0,p_spot,p_place_type,datetime,b_number,trip,p_uid,p_bikes,p_lat,b_bike_type,p_name,p_number,p_lng,p_bike
0,False,12,2019-01-20 00:00:00,20668,first,12097754,1,53.081698,71,BIKE 20668,0.0,8.812411,True
654495,False,12,2019-01-20 00:00:00,20668,first,12097754,1,53.081698,71,BIKE 20668,0.0,8.812411,True
1,False,12,2019-01-20 01:33:00,20668,last,12097754,1,53.081767,71,BIKE 20668,0.0,8.812408,True
654496,False,12,2019-01-20 01:33:00,20668,last,12097754,1,53.081767,71,BIKE 20668,0.0,8.812408,True
1583,True,0,2019-01-22 09:15:00,20668,first,7873316,5,53.083167,71,Hauptbahnhof / Übersee Museum,2946.0,8.811472,False
656078,True,0,2019-01-22 09:15:00,20668,first,7873316,5,53.083167,71,Hauptbahnhof / Übersee Museum,2946.0,8.811472,False
1584,True,0,2019-01-22 09:42:00,20668,start,7873316,5,53.083167,71,Hauptbahnhof / Übersee Museum,2946.0,8.811472,False
656079,True,0,2019-01-22 09:42:00,20668,start,7873316,5,53.083167,71,Hauptbahnhof / Übersee Museum,2946.0,8.811472,False
1585,False,12,2019-01-22 09:49:00,20668,end,12155136,1,53.080356,71,BIKE 20668,0.0,8.795718,True
656080,False,12,2019-01-22 09:49:00,20668,end,12155136,1,53.080356,71,BIKE 20668,0.0,8.795718,True


## Check for duplicates

In [None]:
bremen.duplicated(subset=['datetime', 'b_number','trip']).sum()

In [None]:
bremen.duplicated(subset=None).sum()

In [None]:
654539 - 615607

# TODO check datetime & b_number duplicates against subset=None

### Half of the data are duplicates

In [None]:
654539*2

# Drop duplicates

In [None]:
# Keep the first row of every (datetime, b_number) combination and drop the repeats.
bremen = bremen.drop_duplicates(subset=['datetime', 'b_number'], keep='first')

In [None]:
len(bremen)

In [None]:
bremen.isna().sum()

## Specifications of trip - start/end point - 'first' and 'last' do not refer to duplicates

In [None]:
bremen.trip.unique()

In [None]:
len(bremen[(bremen.trip != 'first') & (bremen.trip != 'last')])

In [None]:
bremen[bremen.b_number == 20668].sort_values('datetime')[:20]

In [None]:
bremen['datetime'] = pd.to_datetime(bremen['datetime'])

In [None]:
bremen[(bremen.b_number == 20668) & ((bremen.trip == 'first') & (bremen.datetime.dt.hour != 0))].sort_values('datetime')

## Specifications of p_bikes - # of available bikes at position/station

In [20]:
bremen.p_bikes.unique()

array([ 1,  2,  3,  5,  4,  0,  7,  6,  8,  9, 11, 10, 12, 15, 14, 13, 16,
       17, 31, 30, 32, 28, 20, 25, 26, 18, 23, 21, 27, 22, 29, 19])

In [21]:
bremen[bremen.p_spot == False].p_bikes.max()

2

## Specifications of b_bike_type - two different types of bikes?

In [7]:
bremen.b_bike_type.unique()

NameError: name 'bremen' is not defined

## Specifications of p_bike - if True, the bike is not in an official station area (opposite of p_spot)

In [None]:
bremen.p_bike.unique()


## Specifications of p_spot - if True, the bike is inside an official station area and there is a p_name (opposite of p_bike)

In [None]:
bremen.p_spot.unique()

## Total number of p_uid - ids of official locations - related to p_name

In [None]:
len(bremen.p_uid.unique())

In [None]:
len(bremen[bremen.p_spot == True].p_uid.unique())

In [None]:
# For every p_uid that occurs on a station row (p_spot == True), print the p_uid
# followed by the distinct districts of all rows carrying that p_uid.
station_uids = bremen.loc[bremen.p_spot == True, 'p_uid'].unique()
for p_uid in station_uids:
    print(p_uid)
    print(bremen.loc[bremen.p_uid == p_uid, 'district'].unique())

# TODO check with station data set

## Specifications of p_name

In [None]:
bremen.p_name.nunique()

In [None]:
bremen[bremen.p_name.str[:4] != 'BIKE'].p_name.unique()

# TODO fix names

## Total number of districts - there are officially 88

In [None]:
len(bremen.district.unique())

## Total number of p_number values - station ID numbers

In [None]:
print(len(bremen.p_number.unique()))
print(bremen.p_number.unique())

### For each p_number check all related districts

In [None]:
# Print each distinct p_number followed by the distinct districts of its rows.
station_numbers = bremen['p_number'].unique()
for p_number in station_numbers:
    print(p_number)
    print(bremen.loc[bremen.p_number == p_number, 'district'].unique())

### Conclusion:

p_number: station_id, p_uid: nextBike districts?

# Plot some data points

Check whether it is possible to display only a single cluster marker when there are too many points.

In [None]:
from folium.plugins import MarkerCluster, FastMarkerCluster

In [None]:
from tqdm import tqdm

In [None]:
def plotDataPoints(latitudes, longitudes, color, fill_color, zoom):
    """Draw coordinates as clustered circles on a folium map centred on Bremen.

    Parameters
    ----------
    latitudes, longitudes
        Sized iterables of equal length (lists, arrays, pandas Series, ...).
        Values are paired by position, so a Series with a non-contiguous index
        (e.g. after rows were filtered out) is handled as well.
    color
        Outline colour handed to folium.Circle.
    fill_color
        Fill colour handed to folium.Circle.
    zoom
        Initial zoom level of the map (zoom_start).

    Returns
    -------
    The folium.Map holding one MarkerCluster with a circle per coordinate.

    Raises
    ------
    ValueError
        If latitudes and longitudes differ in length.
    """
    # Fail early: continuing with unequal lengths would pair coordinates wrongly
    # or break in the middle of the loop.
    if len(latitudes) != len(longitudes):
        raise ValueError('latitudes & longitudes must be the same size!')

    # Create map centered at Bremen's center
    m = folium.Map(location=[53.122962,8.7515937], zoom_start=zoom)

    marker_cluster = MarkerCluster().add_to(m)

    info = 'No information available'

    # zip pairs the values positionally; indexing with latitudes[i] is label-based
    # on a pandas Series and raises KeyError once the index has gaps.
    for latitude, longitude in tqdm(zip(latitudes, longitudes), total=len(latitudes)):

        folium.Circle(
            location=[latitude, longitude],
            popup=folium.Popup(info, max_width = 450),
            radius=15,
            color=color,
            fill=True,
            fill_color=fill_color
        ).add_to(marker_cluster)

    return m

In [None]:
plotDataPoints(bremen['p_lat'][:1500], bremen['p_lng'][:1500], color='royalblue', fill_color='indianred', zoom=11)
#.save('bremen_clustered.html')

In [None]:
# Add a further GeoJSON layer to the map_bremen object created earlier in the notebook.
# NOTE(review): the file is read from a personal Downloads folder via an absolute path,
# so this cell only runs on that machine - confirm the file's origin and move it next
# to the other project data.
folium.GeoJson(F"/Users/timockenga/Downloads/map.geojson").add_to(map_bremen)

# Last expression of the cell: render the updated map.
map_bremen

https://www.wk-bike.de/de/bremen/standorte/

https://gbfs.nextbike.net/maps/gbfs/v1/nextbike_wk/de/station_information.json