In [3]:
import json
import ast
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#import matplotlib as mpl
#import folium
#from folium.plugins import MarkerCluster
import datetime

In [60]:
def get_data(n=20000):
    '''
    Gets data from a csv and cleans it
    
    '''
    
    
    df = pd.read_csv('bird_data-nov19.csv')
    df.drop(['code', 'captive', 'battery_level', 'location_group'], axis=1, inplace=True)
    
    #For testing so that everything will run quickly
    df = df.head(n)
    
    #Remove repeated data
    df['time_group_seconds'] =(pd.to_datetime(df['time_group']) - datetime.datetime(1970,1,1)).dt.total_seconds()
    df = drop_repeated_data(df)

    #Add columns for lat, long, count, and grid_location
    
    df = add_lat_long(df)
    df['count'] = 1
    df['grid_location'] = 0

    
    #Applies a location based on a grid over Oakland to each scooter
    df = add_grid_location(df)
    og_df = df
    og_df = add_day_of_week(og_df)
    og_df = add_rounded_time(og_df)
    
    #Reforms dataframe to calculate count of scooters in each grid location every 15 min
    df = grid_count(df)
    
    #Adds a column with the rounded time for time of the day in seconds, and day of the week, and date
    df = add_rounded_time(df)    
    df = add_day_of_week(df)
    
    
    
    return df, og_df

In [5]:
def add_lat_long(df):
    
    df["location"] = df.location.str.replace("'", "\"").map( lambda x: json.loads(x) )
    
    df["latitude"] = df["location"].map( lambda x:x["latitude"] )
    df["longitude"] = df["location"].map( lambda x:x["longitude"] )
    
    df.drop(['location'], axis=1, inplace=True)

    df['latitude'] = df['latitude'].round(5)
    df['longitude'] = df['longitude'].round(5)
    
    return df

In [6]:
def drop_repeated_data(df):
    '''
    Removes repeated data based on id and rounded_time - this should cut the data down by more than half
    '''
    
    df.drop_duplicates(subset=['id','time_group_seconds'], keep='first', inplace=True)
    return df

In [8]:
def add_grid_location(df, n=42):
    
    '''
    Assigns each data point to a location on the grid according to its lat/long
    '''
    
    top_right = [df['latitude'].max(), df['longitude'].max()]
    top_left = [df['latitude'].min(), df['longitude'].min()]
    
    grid = get_geojson_grid(top_right, top_left, n)
    
    for i, box in enumerate(grid):
        upper_right = box["properties"]["upper_right"]
        lower_left = box["properties"]["lower_left"]
    
        mask = (
            (df.latitude <= upper_right[1]) & (df.latitude >= lower_left[1]) &
            (df.longitude <= upper_right[0]) & (df.longitude >= lower_left[0])
           )
    
        column_name = 'grid_location'
        df.loc[mask, column_name] = i
    
    return df

In [9]:
def get_geojson_grid(upper_right, lower_left, n=6):
    """Returns a grid of geojson rectangles, and computes the exposure in each section of the grid based on the vessel data.

    Parameters
    ----------
    upper_right: array_like
        The upper right hand corner of "grid of grids" (the default is the upper right hand [lat, lon] of the USA).

    lower_left: array_like
        The lower left hand corner of "grid of grids"  (the default is the lower left hand [lat, lon] of the USA).

    n: integer
        The number of rows/columns in the (n,n) grid.

    Returns
    -------

    list
        List of "geojson style" dictionary objects   
    """

    all_boxes = []

    lat_steps = np.linspace(lower_left[0], upper_right[0], n+1)
    lon_steps = np.linspace(lower_left[1], upper_right[1], n+1)

    lat_stride = lat_steps[1] - lat_steps[0]
    lon_stride = lon_steps[1] - lon_steps[0]

    for lat in lat_steps[:-1]:
        for lon in lon_steps[:-1]:
            # Define dimensions of box in grid
            upper_left = [lon, lat + lat_stride]
            upper_right = [lon + lon_stride, lat + lat_stride]
            lower_right = [lon + lon_stride, lat]
            lower_left = [lon, lat]

            # Define json coordinates for polygon
            coordinates = [
                upper_left,
                upper_right,
                lower_right,
                lower_left,
                upper_left
            ]

            geo_json = {"type": "FeatureCollection",
                        "properties":{
                            "lower_left": lower_left,
                            "upper_right": upper_right
                        },
                        "features":[]}

            grid_feature = {
                "type":"Feature",
                "geometry":{
                    "type":"Polygon",
                    "coordinates": [coordinates],
                }
            }

            geo_json["features"].append(grid_feature)

            all_boxes.append(geo_json)

    return all_boxes

In [45]:
def grid_count(df, n=42):
    '''
    For a given time, day of the week create df of number of scooters in each grid location
    '''
    
    #COMBINE UNIQUE DATE AND UNIQUE ROUNDED TIME INTO 1 LOOP INSTEAD OF 2
    
    new_df = pd.DataFrame()
    for unique_rounded_time in df['time_group_seconds'].unique():
        for unique_grid_location in range(n**2):
            if ((df['time_group_seconds'] == unique_rounded_time) & (df['grid_location'] == unique_grid_location)).any():
                continue
            else:
                #print (unique_grid_location)
                new_df = new_df.append({'time_group_seconds':unique_rounded_time, 'grid_location':unique_grid_location}, ignore_index=True)
    new_df['grid_location'] = new_df['grid_location'].astype(int)                
    
    df = df.groupby(['time_group_seconds', 'grid_location']).size().reset_index(name='counts')
    df3 = pd.concat([df,new_df], sort=False)
    df3.fillna(value=0, inplace=True)
    df3.sort_values(by=['grid_location', 'time_group_seconds'], inplace=True)
    #for a given date and rounded time - check to see if there is a grid location, if not set count to 0.
    
    #df = df.groupby(['grid_location']).agg(['count'])
    return df3

In [29]:
def add_rounded_time(df, interval=15):
    '''
    Adds a column with the rounded time to the interval specified.
    
    '''
    df['time_of_day'] = pd.to_datetime(df['time_group_seconds'], unit='s').dt.round('15min')  

    df_time = pd.to_datetime(df['time_of_day'])

    df['time_of_day'] = df_time.dt.hour*3600+df_time.dt.minute*60 + df_time.dt.second
    
    
    return df

In [35]:
def add_day_of_week(df):
    df['date'] = pd.to_datetime(df['time_group_seconds'], unit='s').dt.round("D")
    df['day_of_week'] = df['date'].dt.day_name()
    
    return df

In [61]:
df, og_df = get_data()

In [90]:
df['time_group_seconds'].unique()

array([1.54204878e+09, 1.54204924e+09, 1.54204969e+09])

In [30]:
df = add_rounded_time(df)

In [33]:
df.drop('rounded_time', axis=1, inplace=True)

In [36]:
df = add_day_of_week(df)

In [73]:
def add_id_list(df, og_df):

    g = og_df.groupby(['grid_location', 'date', 'time_of_day'])['id'].apply(list).reset_index(name='id_list')

    merger_df = pd.merge(df, g, on=['grid_location', 'date', 'time_of_day'], how='outer')
    
    isnull = merger_df.id_list.isnull()

    merger_df.loc[isnull, 'id_list'] = [ [[]] * isnull.sum() ]
    
    return merger_df

In [None]:
def add_idle_and_turnover(df):
    group1 = merger_df.iloc[:-1]
    group1.reset_index(inplace=True)
    group2 = merger_df.iloc[1:]
    group2.reset_index(inplace=True)




In [74]:
new_df = add_id_list(df, og_df)

In [77]:
new_df[new_df['counts']>0]

Unnamed: 0,time_group_seconds,grid_location,counts,time_of_day,date,day_of_week,id_list
96,1.542049e+09,32,5.0,68400,2018-11-13,Tuesday,"[fa40ae12-4959-4c48-9c00-bab241897a1f, c5aed23..."
97,1.542049e+09,32,5.0,68400,2018-11-13,Tuesday,"[fa40ae12-4959-4c48-9c00-bab241897a1f, c5aed23..."
114,1.542049e+09,38,1.0,68400,2018-11-13,Tuesday,"[77e065d4-451d-4443-b456-3d00e052e74c, 77e065d..."
115,1.542049e+09,38,1.0,68400,2018-11-13,Tuesday,"[77e065d4-451d-4443-b456-3d00e052e74c, 77e065d..."
231,1.542049e+09,77,2.0,68400,2018-11-13,Tuesday,"[5fe61c5f-473e-4cf4-a57a-9b9ee13ca3ee, ca8dafc..."
232,1.542049e+09,77,2.0,68400,2018-11-13,Tuesday,"[5fe61c5f-473e-4cf4-a57a-9b9ee13ca3ee, ca8dafc..."
237,1.542049e+09,79,1.0,68400,2018-11-13,Tuesday,"[b118936d-e87a-4601-927a-833455396cc7, b118936..."
238,1.542049e+09,79,1.0,68400,2018-11-13,Tuesday,"[b118936d-e87a-4601-927a-833455396cc7, b118936..."
333,1.542049e+09,111,1.0,68400,2018-11-13,Tuesday,"[f104f203-94c8-4425-a569-56ce39ace569, f104f20..."
334,1.542049e+09,111,1.0,68400,2018-11-13,Tuesday,"[f104f203-94c8-4425-a569-56ce39ace569, f104f20..."
