In [122]:
import json
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import folium
from folium.plugins import MarkerCluster
import datetime

In [149]:
def get_data():

    df = pd.read_csv('bird_data.csv')
    df.drop(['code', 'captive'], axis=1, inplace=True)

    temp_df = df

    new_df = add_lat_long(temp_df)
    new_df['count'] = 1
    new_df['grid_location'] = 0
    new_df.drop(['location'], axis=1, inplace=True)
    return new_df

In [124]:
def add_lat_long(df):
    loc_array = df['location']
    
    loc_list= []
    for i in loc_array:
        loc_list.append(ast.literal_eval(i))
    
    df = pd.concat([df, pd.DataFrame(loc_list)], axis=1)
    df['latitude'] = df['latitude'].round(5)
    df['longitude'] = df['longitude'].round(5)
    return df

In [125]:
def get_geojson_grid(upper_right, lower_left, n=6):
    """Returns a grid of geojson rectangles, and computes the exposure in each section of the grid based on the vessel data.

    Parameters
    ----------
    upper_right: array_like
        The upper right hand corner of "grid of grids" (the default is the upper right hand [lat, lon] of the USA).

    lower_left: array_like
        The lower left hand corner of "grid of grids"  (the default is the lower left hand [lat, lon] of the USA).

    n: integer
        The number of rows/columns in the (n,n) grid.

    Returns
    -------

    list
        List of "geojson style" dictionary objects   
    """

    all_boxes = []

    lat_steps = np.linspace(lower_left[0], upper_right[0], n+1)
    lon_steps = np.linspace(lower_left[1], upper_right[1], n+1)

    lat_stride = lat_steps[1] - lat_steps[0]
    lon_stride = lon_steps[1] - lon_steps[0]

    for lat in lat_steps[:-1]:
        for lon in lon_steps[:-1]:
            # Define dimensions of box in grid
            upper_left = [lon, lat + lat_stride]
            upper_right = [lon + lon_stride, lat + lat_stride]
            lower_right = [lon + lon_stride, lat]
            lower_left = [lon, lat]

            # Define json coordinates for polygon
            coordinates = [
                upper_left,
                upper_right,
                lower_right,
                lower_left,
                upper_left
            ]

            geo_json = {"type": "FeatureCollection",
                        "properties":{
                            "lower_left": lower_left,
                            "upper_right": upper_right
                        },
                        "features":[]}

            grid_feature = {
                "type":"Feature",
                "geometry":{
                    "type":"Polygon",
                    "coordinates": [coordinates],
                }
            }

            geo_json["features"].append(grid_feature)

            all_boxes.append(geo_json)

    return all_boxes

In [126]:
def add_grid_location(df, n=42):
    
    '''
    
    Assigns each data point to a location on the grid according to its lat/long
    
    '''
    
    
    top_right = [df['latitude'].max(), df['longitude'].max()]
    top_left = [df['latitude'].min(), df['longitude'].min()]
    
    grid = get_geojson_grid(top_right, top_left, n=42)
    
    for i, box in enumerate(grid):
        upper_right = box["properties"]["upper_right"]
        lower_left = box["properties"]["lower_left"]
    
        mask = (
            (new_df.latitude <= upper_right[1]) & (new_df.latitude >= lower_left[1]) &
            (new_df.longitude <= upper_right[0]) & (new_df.longitude >= lower_left[0])
           )
    
        column_name = 'grid_location'
        df.loc[mask, column_name] = i
    
    return df

In [127]:
def add_time_chunk(df):
    df['time_chunk'] = 0
    for i, time in enumerate(df['time'].unique()):
        df['time_chunk'][df['time'] == time] = i
    return df

In [128]:
def add_day_of_week(df):
    df['my_dates'] = pd.to_datetime(df['time']).dt.round("D")
    df['day_of_week'] = df['my_dates'].dt.day_name()
    
    return df

In [129]:
def add_rounded_time(df, interval=15):
    '''
    Adds a column with the rounded time to the interval specified.
    
    '''
    
    df['rounded_time'] = pd.to_datetime(new_df['time']).dt.round("Min").apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour,15*round((float(dt.minute) + float(dt.second)/60) / interval)))
    df['rounded_time'] = pd.Series([val.time() for val in df['rounded_time']])
    
    return df

In [130]:
def add_wait_time(df):
    '''
    Adds a column that tells how long a scooter has been waiting in a location
    
    '''
    final_df = pd.DataFrame()
    for i in range(len(df['id'].unique())):
        
        # set this df to all the data points with the same id
        temp_df = df[df['id'] == df['id'].unique()[i]]
        
        
        for j in range(len(temp_df['latitude'].unique())):
            
            # set this df to iterate through all of the unique lats from the temp data set
            same_lat_long_df = temp_df[temp_df['latitude'] == temp_df['latitude'].iloc[j]]
        
            # create new column 'wait_time' that is the difference in time between the first and last datapoints
            same_lat_long_df['wait_time'] = pd.to_datetime(same_lat_long_df['time']).iloc[-1] - pd.to_datetime(same_lat_long_df['time']).iloc[0]
            
            #append the new column to the output df
            final_df = final_df.append(same_lat_long_df)
    
    return final_df


In [133]:
def drop_repeated_data(df):
    '''
    Removes repeated data based on id and rounded_time - this should cut the data down by more than half
    '''
    
    df.drop_duplicates(subset=['id','rounded_time'], keep='first', inplace=True)
    return df

In [142]:
df = get_data()
df = add_day_of_week(df)
df = add_grid_location(df)
df = add_rounded_time(df)
df = add_wait_time(df)
df = drop_repeated_data(df)


  if self.run_code(code, result):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,battery_level,id,time,latitude,longitude,count,grid_location,my_dates,day_of_week,rounded_time,wait_time
0,44,cd6ff74b-cba6-49fe-b115-cec6066fe919,2018-11-09 00:55:30.991958,37.84524,-122.29350,1,1346,2018-11-09,Friday,00:45:00,00:02:36.637355
1,48,e20d8a6c-2453-494a-a083-00bad445b189,2018-11-09 00:55:30.991958,37.84600,-122.29569,1,1345,2018-11-09,Friday,00:45:00,00:02:36.637355
2,48,c8f9bed6-48bd-405d-a290-84bd0148a700,2018-11-09 00:55:30.991958,37.84543,-122.28824,1,1347,2018-11-09,Friday,00:45:00,00:00:33.817978
3,38,b7ac0ffe-77dd-40dc-82f1-cb59e0f6fb6b,2018-11-09 00:55:30.991958,37.84618,-122.28023,1,1392,2018-11-09,Friday,00:45:00,00:02:40.802915
4,81,7e9d6f7f-90ba-4205-b040-1c5025512467,2018-11-09 00:55:30.991958,37.82930,-122.28083,1,1140,2018-11-09,Friday,00:45:00,00:02:49.136885
5,95,7894be32-1a47-445c-9ea8-80c5ea91d2be,2018-11-09 00:55:30.991958,37.83111,-122.27547,1,1141,2018-11-09,Friday,00:45:00,00:02:49.136885
6,39,fd812e62-f151-4c45-975b-614453957ba5,2018-11-09 00:55:30.991958,37.83117,-122.27541,1,1141,2018-11-09,Friday,00:45:00,00:02:49.136885
7,78,e409b900-e794-4ab9-9266-2a3912fec60b,2018-11-09 00:55:30.991958,37.82357,-122.28368,1,1055,2018-11-09,Friday,00:45:00,00:00:54.663976
8,59,1ffebfc0-9d1f-4bae-ad45-81d05a97ebab,2018-11-09 00:55:30.991958,37.83410,-122.27302,1,1184,2018-11-09,Friday,00:45:00,00:02:24.125107
9,68,87f744f1-cad9-47b1-8474-e78516c0fb58,2018-11-09 00:55:30.991958,37.84111,-122.27181,1,1310,2018-11-09,Friday,00:45:00,00:02:49.136885


In [235]:
def grid_count(df, n=42):
    '''
    For a given time, day of the week create df of number of scooters in each grid location
    '''
    
    new_df = pd.DataFrame()
    for unique_date in df['my_dates'].unique():
        for unique_rounded_time in df['rounded_time'].unique():
            for unique_grid_location in range(n**2):
                if ((df['my_dates'] == unique_date) & (df['rounded_time'] == unique_rounded_time) & (df['grid_location'] == unique_grid_location)).any():
                    continue
                else:
                    #print (unique_grid_location)
                    new_df = new_df.append({'rounded_time':unique_rounded_time, 'grid_location':unique_grid_location, 'my_dates':unique_date}, ignore_index=True)
    new_df['grid_location'] = new_df['grid_location'].astype(int)                
    
    df3 = pd.concat([df,new_df])
    df3 = df3.fillna(0)
    df3 = df3.sort_values(by='grid_location')
    #for a given date and rounded time - check to see if there is a grid location, if not set count to 0.
    
    #df3 = df3.groupby(['rounded_time', 'grid_location', 'day_of_week', 'my_dates']).size().reset_index(name='counts')
    #df = df.groupby(['grid_location']).agg(['count'])
    return df3

In [245]:
new_df = grid_count(df)
new_df
#df3 = pd.concat([df,new_df])
#df3.drop_duplicates(subset=['grid_location', 'col3'], inplace=True, keep='last')
new_df = new_df.groupby(['rounded_time', 'grid_location', 'my_dates'], as_index=False)[['count']].sum()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [248]:
#new_df = new_df.groupby(['rounded_time', 'grid_location', 'my_dates'])['count'].sum()
new_df['count'].sum()

634.0

In [197]:
type(df['grid_location'].iloc[0])

numpy.int64

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
def MVP(df):
    
    

    y = df[]
    X = 
    