In [10]:
import pandas as pd
import os

# Import provided data

In [18]:
raw_data = pd.read_csv('/Users/timockenga/Programming/PD_Project/data/bremen.csv')
raw_data.head()

Unnamed: 0.1,Unnamed: 0,p_spot,p_place_type,datetime,b_number,trip,p_uid,p_bikes,p_lat,b_bike_type,p_name,p_number,p_lng,p_bike
0,0,True,0,2019-01-20 02:06:00,11281,first,4774295,5,50.808852,15,Biegenstraße/Cineplex,5155.0,8.773134,False
1,1,True,0,2019-01-20 14:16:00,11281,last,4774295,4,50.808852,15,Biegenstraße/Cineplex,5155.0,8.773134,False
2,2,True,0,2019-01-20 00:00:00,11169,first,4774543,5,50.795224,15,Südbahnhof,5173.0,8.763266,False
3,3,True,0,2019-01-20 01:55:00,11169,start,4774543,5,50.795224,15,Südbahnhof,5173.0,8.763266,False
4,4,True,0,2019-01-20 02:06:00,11169,end,4774368,4,50.804522,15,Frankfurter Straße/Psychologie,5159.0,8.770358,False


In [19]:
raw_data.drop(columns=['Unnamed: 0'], inplace=True)

## Check location data

It is not only Bremen!

In [4]:
print(raw_data.p_lat.min())
print(raw_data.p_lat.max())
print(raw_data.p_lng.min())
print(raw_data.p_lng.max())

-504.153076
8084778.533
-92.561901044101
20.973033333333


## Check sum of null values

In [5]:
raw_data.isna().sum()

p_spot            0
p_place_type      0
datetime          0
b_number          0
trip              0
p_uid             0
p_bikes           0
p_lat             0
b_bike_type       0
p_name            0
p_number        181
p_lng             0
p_bike            0
dtype: int64

# Get geodata of Bremen

https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/bremen.geojson

In [6]:
import folium

In [58]:
map_bremen = folium.Map(location=[53.122962,8.7515937], zoom_start=11)

folium.Choropleth(geo_data=F"/Users/timockenga/Programming/PD_Project/data/bremen.geojson").add_to(map_bremen)

map_bremen

In [8]:
#map_bremen.save('map_bremen.html')

https://automating-gis-processes.github.io/CSC18/lessons/L4/point-in-polygon.html

In [8]:
from shapely.geometry import Point, Polygon

In [9]:
from shapely.ops import cascaded_union

In [10]:
import json

## Save geodata in dictionary

In [11]:
district_polygons = {}

with open('/Users/timockenga/Programming/PD_Project/data/bremen.geojson') as f:
    data = json.load(f)

for feature in data['features']:
    
    district_name = (feature['properties']['name'])
    polygon = Polygon(feature['geometry']['coordinates'][0][0])
    
    district_polygons.update( {district_name : polygon})

In [12]:
print(len(district_polygons))
district_polygons

88


{'Blockland': <shapely.geometry.polygon.Polygon at 0x11ce420f0>,
 'Ohlenhof': <shapely.geometry.polygon.Polygon at 0x11bb086a0>,
 'Utbremen': <shapely.geometry.polygon.Polygon at 0x11bb086d8>,
 'Neustadt': <shapely.geometry.polygon.Polygon at 0x11bb08710>,
 'Neue Vahr Nord': <shapely.geometry.polygon.Polygon at 0x11bb08748>,
 'Fähr-Lobbendorf': <shapely.geometry.polygon.Polygon at 0x11bb08780>,
 'Grohn': <shapely.geometry.polygon.Polygon at 0x11bb087b8>,
 'Rönnebeck': <shapely.geometry.polygon.Polygon at 0x11bb087f0>,
 'Lindenhof': <shapely.geometry.polygon.Polygon at 0x11bb08828>,
 'Gröpelingen': <shapely.geometry.polygon.Polygon at 0x11bb08860>,
 'Oslebshausen': <shapely.geometry.polygon.Polygon at 0x11bb08898>,
 'Weidedamm': <shapely.geometry.polygon.Polygon at 0x11bb088d0>,
 'Ostertor': <shapely.geometry.polygon.Polygon at 0x11bb08908>,
 'Ellener Feld': <shapely.geometry.polygon.Polygon at 0x11bb08940>,
 'Schwachhausen': <shapely.geometry.polygon.Polygon at 0x11bb08978>,
 'Rablingh

## Functions get district of point or false if not in Bremen - not necessary at this point

In [13]:
def getDistrict(latitude, longitude, polygons):

    point = Point(longitude,latitude)
    
    for name, polygon in polygons.items():
        
        if point.within(polygon):
            return name
        
    return False

In [17]:
getDistrict(53.043962,8.9515937,district_polygons)

'Osterholz'

### Not necessary at all because getDistrict also determines if point is inside Bremens boundaries

In [19]:
def pointInBremen2(latitude, longitude, polygons):

    point = Point(longitude,latitude)
    
    bremen = cascaded_union(polygons.values())
    
    if point.within(bremen):
        return True
        
    return False

In [20]:
pointInBremen2(53.021037,8.491593,district_polygons)

False

## Get boundaries (min & max values for latitude & longitude) of Bremen

In [18]:
cascaded_union(district_polygons.values()).bounds

(8.481593, 53.011037, 8.990582, 53.228967)

## Filter raw data exclusively for data points in Bremens boundaries

In [20]:
bremen = raw_data[(raw_data['p_lat'] < 53.228967) &
         (raw_data['p_lat'] > 53.011037) &
         (raw_data['p_lng'] < 8.990582)  &
         (raw_data['p_lng'] > 8.481593)]

print(len(bremen))

1309078


In [22]:
bremen

Unnamed: 0,p_spot,p_place_type,datetime,b_number,trip,p_uid,p_bikes,p_lat,b_bike_type,p_name,p_number,p_lng,p_bike
2241283,False,12,2019-01-20 00:00:00,20668,first,12097754,1,53.081698,71,BIKE 20668,0.0,8.812411,True
2241284,False,12,2019-01-20 01:33:00,20668,last,12097754,1,53.081767,71,BIKE 20668,0.0,8.812408,True
2241285,False,12,2019-01-20 00:00:00,20649,first,12096803,1,53.081533,71,BIKE 20649,0.0,8.790704,True
2241286,True,0,2019-01-20 23:59:00,20649,last,8322444,1,53.082464,71,WeserTower,2971.0,8.789556,False
2241287,False,12,2019-01-20 00:00:00,20964,first,12007715,1,53.100197,71,BIKE 20964,0.0,8.817565,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3585586,False,12,2019-12-31 09:17:00,20905,start,26494106,1,53.088858,71,BIKE 20905,0.0,8.810082,True
3585587,False,12,2019-12-31 09:24:00,20905,end,26501789,1,53.080680,71,BIKE 20905,0.0,8.815383,True
3585588,False,12,2019-12-31 13:55:00,20905,start,26507153,1,53.080884,71,BIKE 20905,0.0,8.815197,True
3585589,False,12,2019-12-31 14:01:00,20905,end,26507979,1,53.082618,71,BIKE 20905,0.0,8.805761,True


In [15]:
bremen = bremen.reset_index().drop(columns=['index'])

# Add district to each row/booking

from tqdm import tqdm

i = 0
for index, row in tqdm(bremen[bremen['district'] == 'Nicht bestimmt'].iterrows()):
    bremen['district'].iloc[index] = getDistrict(row['p_lat'], row['p_lng'], district_polygons)
    if i > 100000:
        bremen.to_csv('bremen_data.csv')
        i = 0
    i+=1
bremen.to_csv('bremen_data.csv', index=False)

## Import data set with district

In [42]:
bremen = pd.read_csv('preprocessed_data.csv')

In [43]:
bremen.sample(2)

Unnamed: 0,p_spot,p_place_type,datetime,b_number,trip,p_uid,p_bikes,p_lat,b_bike_type,p_name,p_number,p_lng,p_bike,district
703761,True,0,2019-02-14 09:40:00,20745,end,10278696,4,53.085032,71,GEWOBA | Wilh.-Liebknecht 1,2987.0,8.879126,False,Neue Vahr Nord
1266840,False,12,2019-12-05 18:31:00,20889,end,25679416,1,53.07292,71,BIKE 20889,0.0,8.82691,True,Steintor


In [44]:
len(bremen)

1309078

# Order data frame usefully

In [45]:
bremen.keys()

Index(['p_spot', 'p_place_type', 'datetime', 'b_number', 'trip', 'p_uid',
       'p_bikes', 'p_lat', 'b_bike_type', 'p_name', 'p_number', 'p_lng',
       'p_bike', 'district'],
      dtype='object')

In [46]:
bremen = bremen[['datetime', 'b_number', 'p_spot', 'p_place_type',
                 'trip', 'p_uid', 'p_bikes', 'b_bike_type', 'p_name',
                 'p_number', 'p_bike', 'p_lat', 'p_lng', 'district']]

#bremen.to_csv('bremen_data.csv', index=False)

# Exploration

## Total number of bookings

In [24]:
len(bremen)

1309078

## Total number of null values

In [23]:
bremen.isna().sum()

p_spot           0
p_place_type     0
datetime         0
b_number         0
trip             0
p_uid            0
p_bikes          0
p_lat            0
b_bike_type      0
p_name           0
p_number        88
p_lng            0
p_bike           0
dtype: int64

## Total number of bikes - 443

In [49]:
print(len(bremen.b_number.unique()))
print(bremen.b_number.unique()[:10])

443
[20668 20649 20964 20866 20799 20676 20855 20707 20815 20891]


### For a specific bike check trips - seem to be many duplicates!

In [50]:
bremen[bremen.b_number == 20668].sort_values('datetime')[:20]

Unnamed: 0,datetime,b_number,p_spot,p_place_type,trip,p_uid,p_bikes,b_bike_type,p_name,p_number,p_bike,p_lat,p_lng,district
0,2019-01-20 00:00:00,20668,False,12,first,12097754,1,71,BIKE 20668,0.0,True,53.081698,8.812411,Bahnhofsvorstadt
654539,2019-01-20 00:00:00,20668,False,12,first,12097754,1,71,BIKE 20668,0.0,True,53.081698,8.812411,Bahnhofsvorstadt
1,2019-01-20 01:33:00,20668,False,12,last,12097754,1,71,BIKE 20668,0.0,True,53.081767,8.812408,Bahnhofsvorstadt
654540,2019-01-20 01:33:00,20668,False,12,last,12097754,1,71,BIKE 20668,0.0,True,53.081767,8.812408,Bahnhofsvorstadt
1583,2019-01-22 09:15:00,20668,True,0,first,7873316,5,71,Hauptbahnhof / Übersee Museum,2946.0,False,53.083167,8.811472,Bahnhofsvorstadt
656122,2019-01-22 09:15:00,20668,True,0,first,7873316,5,71,Hauptbahnhof / Übersee Museum,2946.0,False,53.083167,8.811472,Bahnhofsvorstadt
1584,2019-01-22 09:42:00,20668,True,0,start,7873316,5,71,Hauptbahnhof / Übersee Museum,2946.0,False,53.083167,8.811472,Bahnhofsvorstadt
656123,2019-01-22 09:42:00,20668,True,0,start,7873316,5,71,Hauptbahnhof / Übersee Museum,2946.0,False,53.083167,8.811472,Bahnhofsvorstadt
1585,2019-01-22 09:49:00,20668,False,12,end,12155136,1,71,BIKE 20668,0.0,True,53.080356,8.795718,Altstadt
656124,2019-01-22 09:49:00,20668,False,12,end,12155136,1,71,BIKE 20668,0.0,True,53.080356,8.795718,Altstadt


## Check for duplicates

In [51]:
bremen.duplicated(subset=['datetime', 'b_number','trip']).sum()

654539

In [52]:
bremen.duplicated(subset=None).sum()

615607

In [53]:
654539 - 615607

38932

# TODO check datetime & b_number duplicates against subset=None

### Half of data are duplicates

In [54]:
654539*2

1309078

# Drop duplicates

In [25]:
bremen = bremen[bremen.duplicated(subset=['datetime', 'b_number'], keep='first') == False]

In [26]:
len(bremen)

654539

In [28]:
bremen.isna().sum()

p_spot           0
p_place_type     0
datetime         0
b_number         0
trip             0
p_uid            0
p_bikes          0
p_lat            0
b_bike_type      0
p_name           0
p_number        44
p_lng            0
p_bike           0
dtype: int64

## Specifications of trip - start/end point - first and last  do not refer to duplicates

In [61]:
bremen.trip.unique()

array(['first', 'last', 'start', 'end'], dtype=object)

In [62]:
len(bremen[(bremen.trip != 'first') & (bremen.trip != 'last')])

462455

In [65]:
bremen[bremen.b_number == 20668].sort_values('datetime')[:20]

In [73]:
bremen['datetime'] = pd.to_datetime(bremen['datetime'])

In [83]:
bremen[(bremen.b_number == 20668) & ((bremen.trip == 'first') & (bremen.datetime.dt.hour != 0))].sort_values('datetime')

Unnamed: 0,datetime,b_number,p_spot,p_place_type,trip,p_uid,p_bikes,b_bike_type,p_name,p_number,p_bike,p_lat,p_lng,district
1583,2019-01-22 09:15:00,20668,True,0,first,7873316,5,71,Hauptbahnhof / Übersee Museum,2946.0,False,53.083167,8.811472,Bahnhofsvorstadt
4960,2019-01-25 07:46:00,20668,True,0,first,7873007,4,71,Domsheide,2933.0,False,53.074592,8.810182,Altstadt
22427,2019-02-11 13:24:00,20668,True,0,first,7873893,2,71,Woltmershauser Str.,2966.0,False,53.078528,8.779167,Woltmershausen
66599,2019-02-28 14:19:00,20668,False,12,first,13051288,1,71,BIKE 20668,0.0,True,53.081902,8.810073,Bahnhofsvorstadt
67889,2019-03-01 16:45:00,20668,False,12,first,13075883,1,71,BIKE 20668,0.0,True,53.070702,8.830958,Steintor
101708,2019-03-19 09:28:00,20668,False,12,first,13428348,1,71,BIKE 20668,0.0,True,53.078944,8.848838,Gete
109350,2019-03-25 10:56:00,20668,True,0,first,8638942,4,71,Föhrenstraße,2972.0,False,53.064904,8.879091,Hastedt
122190,2019-04-01 06:33:00,20668,False,12,first,13917223,1,71,Hamburger Straße 248,0.0,True,53.068182,8.849903,Peterswerder
192569,2019-04-30 08:29:00,20668,True,0,first,7873316,5,71,Hauptbahnhof / Übersee Museum,2946.0,False,53.083167,8.811472,Bahnhofsvorstadt
196247,2019-05-01 03:51:00,20668,True,0,first,7873316,5,71,Hauptbahnhof / Übersee Museum,2946.0,False,53.083167,8.811472,Bahnhofsvorstadt


## Specifications of p_bikes - # of available bikes at position/station

In [86]:
bremen.p_bikes.unique()

array([ 1,  2,  3,  5,  4,  0,  7,  6,  8,  9, 11, 10, 12, 15, 14, 13, 16,
       17, 31, 30, 32, 28, 20, 25, 26, 18, 23, 21, 27, 22, 29, 19])

In [89]:
bremen[bremen.p_spot == False].p_bikes.max()

2

## Specifications of b_bike_type - two different types of bikes?

In [25]:
bremen.b_bike_type.unique()

array([71, 29])

## Specifications of p_bike - if True not in official area - opposite of p_spot

In [24]:
bremen.p_bike.unique()


array([ True, False])

## Specifications of p_spot - is inside official station area - if True there is a p_name - opposite of p_bike

In [30]:
bremen.p_spot.unique()

array([False,  True])

## Total number of p_uid - ids of official locations - related to p_name

In [36]:
len(bremen.p_uid.unique())

157164

In [92]:
len(bremen[bremen.p_spot == True].p_uid.unique())

76

In [93]:
for p_uid in bremen[bremen.p_spot == True].p_uid.unique():
    print(p_uid)
    print(bremen[bremen.p_uid == p_uid].district.unique())

8322444
['Überseestadt' 'Woltmershausen']
7873706
['Woltmershausen']
10278716
['Neue Vahr Südost']
7872739
['Blumenthal']
2351602
['Altstadt']
7873007
['Altstadt']
8638942
['Hastedt']
7873840
['Lehe']
7873210
['Neustadt']
7873146
['Altstadt']
7873421
['Grohn']
11119973
['Überseestadt']
8806546
['Bürgerpark']
10278649
['Neue Vahr Südwest']
7872765
['Farge']
10084718
['Lehe']
8615035
['False']
7872789
['Mahndorf']
8322400
['Überseestadt']
7873118
['Gartenstadt Süd']
10278671
['Neue Vahr Nord']
7873316
['Bahnhofsvorstadt']
7873483
['Neustadt']
7873825
['Lehe']
8172362
['Altstadt']
8740854
['Kattenturm']
7873893
['Woltmershausen']
7873338
['Vegesack']
8028225
['Neuenland']
7873255
['False']
7873189
['Neu-Schwachhausen']
7873297
['Hastedt']
8903474
['False']
7873805
['Alte Neustadt']
7873781
['Gete']
8322336
['Altstadt']
10278696
['Neue Vahr Nord']
7873131
['Kattenturm']
7872920
['Sebaldsbrück']
7873874
['Bürgerpark']
7873026
['Weidedamm']
7873660
['Bahnhofsvorstadt']
7873436
['Habenhausen'

# TODO check with station data set

## Specifications of p_name

In [96]:
bremen.p_name.nunique()

5975

In [108]:
bremen[bremen.p_name.str[:4] != 'BIKE'].p_name.unique()

array(['WeserTower', 'Pusdorfer Marktplatz', 'GEWOBA | Ludwig-Beck 2a',
       ..., ' Faulenstraße 65', '53.082872, 8.812058',
       'recording_91000056'], dtype=object)

# TODO fix names

## Total number of district - there are officially 88

In [27]:
len(bremen.district.unique())

87

## Total number of p_number - stationIds numbers

In [28]:
print(len(bremen.p_number.unique()))
print(bremen.p_number.unique())

80
[    0.  2971.  2959.  2988.  2927.  2910.  2933.  2972.  2912.  2941.
  2938.  2949.  2913.  2976.  2985.  2928. 29860.  2953.  2929.  2970.
  2936.  2986.  2946.  2952.  2963.  2967.  2975.  2966.  2947.  2958.
  2943.  2940.  2945.  2978.  2962.  2961.  2969.  2987.  2937.  2930.
  2965.  2934.  2957.  2950.  2939.  2942.  2911.  2951.  2955.  2925.
  2990.  2932.  2931.  2926.  2948.  2944.  2992.  2956.  2954.  2991.
  2935.  2914.  2997.  2977.  2964.  2974.  2960.  2973.  2968.  2989.
  2915.    nan  2916.  2917.  2919.  2918.  2920.  2923.  2922.  2921.]


### For each p_number check all related districts

In [29]:
for p_number in bremen.p_number.unique():
    print(p_number)
    print(bremen[bremen.p_number == p_number].district.unique())

0.0
['Bahnhofsvorstadt' 'Altstadt' 'Weidedamm' 'Lehe' 'Gartenstadt Süd'
 'Neustadt' 'Peterswerder' 'Gete' 'Südervorstadt' 'Fesenfeld' 'Steintor'
 'Neue Vahr Südwest' 'Neue Vahr Nord' 'Bürgerpark' 'Findorff-Bürgerweide'
 'Sebaldsbrück' 'Buntentor' 'Hohentor' 'Alte Neustadt' 'Kattenturm'
 'Barkhof' 'Riensberg' 'Radio Bremen' 'Überseestadt' 'Ostertor'
 'Schwachhausen' 'Hastedt' 'Gartenstadt Vahr' 'Hohentorshafen'
 'Regensburger Straße' 'Huckelriede' 'Steffensweg' 'Neu-Schwachhausen'
 'Neue Vahr Südost' 'Horn' 'Oberneuland' 'Woltmershausen' 'Hulsberg'
 'Hemelingen' 'False' 'Neuenland' 'Utbremen' 'Vegesack' 'Blumenthal'
 'Grohn' 'Westend' 'Hohweg' 'Farge' 'Walle' 'Lehesterdeich' 'Habenhausen'
 'Schönebeck' 'In den Hufen' 'Industriehäfen' 'Grolland' 'Osterfeuerberg'
 'Borgfeld' 'St. Magnus' 'Rablinghausen' 'Ellenerbrok-Schevemoor'
 'Blockland' 'Blockdiek' 'Burg-Grambke' 'Kirchhuchting' 'Lesum'
 'Osterholz' 'Arsten' 'Sodenmatt' 'Tenever' 'Gröpelingen' 'Lindenhof'
 'Kattenesch' 'Arbergen' 'Ell

### Conclusion:

p_number: station_id, p_uid: nextBike districts?

# Plot some data points

Check if possible that if too many points only one cluster point displayed

In [109]:
from folium.plugins import MarkerCluster, FastMarkerCluster

In [110]:
from tqdm import tqdm

In [111]:
def plotDataPoints(latitudes, longitudes, color, fill_color, zoom):
    
    # Create map centered at Bremens center
    m = folium.Map(location=[53.122962,8.7515937], zoom_start=zoom)
    
    marker_cluster = MarkerCluster().add_to(m)
    #marker_cluster = FastMarkerCluster(data=list(zip(latitudes, longitudes))).add_to(m)
        
    if len(latitudes) != len(longitudes):
        print('latitudes & longitudes must be the same size!')
        
    info = 'No information available'
    
    for i in tqdm(range(len(latitudes))):
        
        folium.Circle(
            location=[latitudes[i],longitudes[i]],
            popup=folium.Popup(info, max_width = 450),
            radius=15,
            color=color,
            fill=True,
            fill_color=fill_color
        ).add_to(marker_cluster)
                  
    return m

In [112]:
plotDataPoints(bremen['p_lat'][:1500], bremen['p_lng'][:1500], color='royalblue', fill_color='indianred', zoom=11)
#.save('bremen_clustered.html')

100%|██████████| 1500/1500 [00:00<00:00, 4890.25it/s]


In [59]:
folium.GeoJson(F"/Users/timockenga/Downloads/map.geojson").add_to(map_bremen)

map_bremen

https://www.wk-bike.de/de/bremen/standorte/

https://gbfs.nextbike.net/maps/gbfs/v1/nextbike_wk/de/station_information.json