In [1]:
import json
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import folium
from folium.plugins import MarkerCluster
import datetime

In [2]:
def get_data():
    """Load a 100-row sample of ``bird_data.csv`` ready for the grid steps.

    The ``code`` and ``captive`` columns are discarded, the first 100 rows
    are kept, and ``add_lat_long`` expands the ``location`` column into
    ``latitude``/``longitude``. A constant ``count`` of 1 and a placeholder
    ``grid_location`` of 0 are added, and the raw ``location`` column is
    removed.

    Returns
    -------
    pandas.DataFrame
        The prepared sample.
    """
    raw = pd.read_csv('bird_data.csv').drop(['code', 'captive'], axis=1)

    # Only the first 100 rows are used.
    located = add_lat_long(raw.head(100))

    prepared = located.assign(count=1, grid_location=0)
    return prepared.drop(['location'], axis=1)

In [3]:
def add_lat_long(df):
    """Expand the ``location`` column into ``latitude`` and ``longitude``.

    Each entry of ``df['location']`` is parsed with ``ast.literal_eval``;
    the parsed records must provide ``latitude`` and ``longitude`` fields
    (presumably dict literals - verify against the CSV). Both coordinates
    are rounded to 5 decimal places.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``location`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the parsed ones.
    """
    parsed = [ast.literal_eval(raw_location) for raw_location in df['location']]

    # Build the parsed frame on df's own index: concat(axis=1) aligns on
    # index labels, so a fresh RangeIndex would misalign (and NaN-pad) any
    # input whose index is not 0..n-1, e.g. a filtered or sliced frame.
    coords = pd.DataFrame(parsed, index=df.index)

    out = pd.concat([df, coords], axis=1)
    out['latitude'] = out['latitude'].round(5)
    out['longitude'] = out['longitude'].round(5)
    return out

In [4]:
def get_geojson_grid(upper_right, lower_left, n=6):
    """Split a bounding box into an (n, n) grid of GeoJSON rectangles.

    Parameters
    ----------
    upper_right: array_like
        ``[lat, lon]`` of the upper right hand corner of the whole area.

    lower_left: array_like
        ``[lat, lon]`` of the lower left hand corner of the whole area.

    n: integer
        The number of rows/columns in the (n,n) grid.

    Returns
    -------

    list
        One "FeatureCollection" dictionary per cell, ordered with latitude
        as the outer loop and longitude as the inner loop. Each dictionary
        stores the cell's corners under ``properties`` as ``[lon, lat]``
        pairs (``lower_left`` and ``upper_right``) and holds a single
        Polygon feature tracing the cell outline.
    """
    lat_edges = np.linspace(lower_left[0], upper_right[0], n + 1)
    lon_edges = np.linspace(lower_left[1], upper_right[1], n + 1)

    cell_height = lat_edges[1] - lat_edges[0]
    cell_width = lon_edges[1] - lon_edges[0]

    boxes = []
    for south in lat_edges[:-1]:
        north = south + cell_height
        for west in lon_edges[:-1]:
            east = west + cell_width

            # Corners are stored as [lon, lat].
            north_west = [west, north]
            north_east = [east, north]
            south_east = [east, south]
            south_west = [west, south]

            # Closed ring: the first corner is repeated at the end.
            ring = [north_west, north_east, south_east, south_west, north_west]

            boxes.append({
                "type": "FeatureCollection",
                "properties": {
                    "lower_left": south_west,
                    "upper_right": north_east,
                },
                "features": [{
                    "type": "Feature",
                    "geometry": {
                        "type": "Polygon",
                        "coordinates": [ring],
                    },
                }],
            })

    return boxes

In [5]:
def add_grid_location(df, n=42):
    '''
    Assigns each data point to a location on the grid according to its lat/long.

    The grid spans the bounding box of the data (min/max of ``latitude`` and
    ``longitude``) and is built by ``get_geojson_grid``. The index of the
    containing box is written to ``df['grid_location']`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``latitude`` and ``longitude`` columns; modified in place.
    n : int
        Number of rows/columns of the grid. Previously this argument was
        ignored and 42 was always used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``grid_location`` filled in.
    '''
    upper_right_corner = [df['latitude'].max(), df['longitude'].max()]
    lower_left_corner = [df['latitude'].min(), df['longitude'].min()]

    grid = get_geojson_grid(upper_right_corner, lower_left_corner, n=n)

    for i, box in enumerate(grid):
        # Box corners are stored as [lon, lat].
        upper_right = box["properties"]["upper_right"]
        lower_left = box["properties"]["lower_left"]

        # Bounds are inclusive on every side, so a point lying exactly on a
        # shared edge matches several boxes and keeps the highest index.
        mask = (
            (df.latitude <= upper_right[1]) & (df.latitude >= lower_left[1]) &
            (df.longitude <= upper_right[0]) & (df.longitude >= lower_left[0])
        )

        df.loc[mask, 'grid_location'] = i

    return df

In [6]:
def add_time_chunk(df):
    '''
    Adds a ``time_chunk`` column numbering the distinct ``time`` values.

    Chunks are numbered 0, 1, 2, ... in order of first appearance of each
    value in ``df['time']``. ``df`` is modified in place and returned.
    '''
    df['time_chunk'] = 0
    for chunk_id, time_value in enumerate(df['time'].unique()):
        # Single .loc assignment instead of chained indexing
        # (df['time_chunk'][mask] = ...), which triggers SettingWithCopyWarning
        # and does not write through under pandas copy-on-write.
        df.loc[df['time'] == time_value, 'time_chunk'] = chunk_id
    return df

In [7]:
def add_day_of_week(df):
    df['date'] = pd.to_datetime(df['time']).dt.round("D")
    df['day_of_week'] = df['date'].dt.day_name()
    
    return df

In [8]:
def add_rounded_time(df, interval=15):
    '''
    Adds a column with the rounded time to the interval specified.
    
    '''
    df['rounded_time'] = pd.to_datetime(df['time']).dt.round('15min')  
    #df['rounded_time'] = pd.to_datetime(df['time']).dt.round("Min").apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour,15*round((float(dt.minute) + float(dt.second)/60) / interval)))
    #df['rounded_time'] = pd.Series([val.time() for val in df['rounded_time']])
    
    df_time = pd.to_datetime(df["rounded_time"])

    df['rounded_time'] = df_time.dt.hour*60+df_time.dt.minute*60 + df_time.dt.second
    
    
    return df

In [9]:
def add_wait_time(df):
    '''
    Adds a column that tells how long a scooter has been waiting in a location.

    Rows are grouped by ``id``, ``latitude`` and ``longitude``; every row of
    a group receives the same ``wait_time``: the group's last timestamp
    minus its first, in row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``id``, ``latitude``, ``longitude`` and ``time`` columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (same rows, same index, same order) with an added
        timedelta ``wait_time`` column.
    '''
    result = df.copy()
    timestamps = pd.to_datetime(result['time'])

    # One vectorised groupby replaces the nested loops. Those loops indexed
    # latitude by position over the count of unique values, so they repeated
    # some rows and skipped others, never compared longitude, and relied on
    # DataFrame.append, which pandas 2.0 removed.
    by_spot = timestamps.groupby(
        [result['id'], result['latitude'], result['longitude']]
    )
    result['wait_time'] = by_spot.transform('last') - by_spot.transform('first')

    return result


In [10]:
def drop_repeated_data(df):
    '''
    Keeps only the first row for every (id, rounded_time) pair.

    The frame is de-duplicated in place and the same object is returned.
    '''
    dedupe_keys = ['id', 'rounded_time']
    df.drop_duplicates(subset=dedupe_keys, keep='first', inplace=True)
    return df

In [11]:
# Load the prepared sample and preview its first rows.
df = get_data()
df.head()

  if self.run_code(code, result):


Unnamed: 0,battery_level,id,time,latitude,longitude,count,grid_location
0,44,cd6ff74b-cba6-49fe-b115-cec6066fe919,2018-11-09 00:55:30.991958,37.84524,-122.2935,1,0
1,48,e20d8a6c-2453-494a-a083-00bad445b189,2018-11-09 00:55:30.991958,37.846,-122.29569,1,0
2,48,c8f9bed6-48bd-405d-a290-84bd0148a700,2018-11-09 00:55:30.991958,37.84543,-122.28824,1,0
3,38,b7ac0ffe-77dd-40dc-82f1-cb59e0f6fb6b,2018-11-09 00:55:30.991958,37.84618,-122.28023,1,0
4,81,7e9d6f7f-90ba-4205-b040-1c5025512467,2018-11-09 00:55:30.991958,37.8293,-122.28083,1,0


In [12]:
# Add the `date` and `day_of_week` columns.
df = add_day_of_week(df)
df.head()

Unnamed: 0,battery_level,id,time,latitude,longitude,count,grid_location,date,day_of_week
0,44,cd6ff74b-cba6-49fe-b115-cec6066fe919,2018-11-09 00:55:30.991958,37.84524,-122.2935,1,0,2018-11-09,Friday
1,48,e20d8a6c-2453-494a-a083-00bad445b189,2018-11-09 00:55:30.991958,37.846,-122.29569,1,0,2018-11-09,Friday
2,48,c8f9bed6-48bd-405d-a290-84bd0148a700,2018-11-09 00:55:30.991958,37.84543,-122.28824,1,0,2018-11-09,Friday
3,38,b7ac0ffe-77dd-40dc-82f1-cb59e0f6fb6b,2018-11-09 00:55:30.991958,37.84618,-122.28023,1,0,2018-11-09,Friday
4,81,7e9d6f7f-90ba-4205-b040-1c5025512467,2018-11-09 00:55:30.991958,37.8293,-122.28083,1,0,2018-11-09,Friday


In [13]:
# Fill `grid_location` using the default grid size of add_grid_location.
df = add_grid_location(df)
df.head()

Unnamed: 0,battery_level,id,time,latitude,longitude,count,grid_location,date,day_of_week
0,44,cd6ff74b-cba6-49fe-b115-cec6066fe919,2018-11-09 00:55:30.991958,37.84524,-122.2935,1,1266,2018-11-09,Friday
1,48,e20d8a6c-2453-494a-a083-00bad445b189,2018-11-09 00:55:30.991958,37.846,-122.29569,1,1306,2018-11-09,Friday
2,48,c8f9bed6-48bd-405d-a290-84bd0148a700,2018-11-09 00:55:30.991958,37.84543,-122.28824,1,1312,2018-11-09,Friday
3,38,b7ac0ffe-77dd-40dc-82f1-cb59e0f6fb6b,2018-11-09 00:55:30.991958,37.84618,-122.28023,1,1319,2018-11-09,Friday
4,81,7e9d6f7f-90ba-4205-b040-1c5025512467,2018-11-09 00:55:30.991958,37.8293,-122.28083,1,773,2018-11-09,Friday


In [14]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,latitude,longitude,count,grid_location
count,100.0,100.0,100.0,100.0
mean,37.821392,-122.27278,1.0,546.13
std,0.012314,0.01174,0.0,392.646258
min,37.80416,-122.30041,1.0,9.0
25%,37.811917,-122.280088,1.0,233.0
50%,37.817515,-122.27163,1.0,421.0
75%,37.83042,-122.263477,1.0,833.25
max,37.85988,-122.25227,1.0,1751.0


In [15]:
# Add the numeric `rounded_time` column, then re-check the summary statistics.
df = add_rounded_time(df)

df.describe()

Unnamed: 0,latitude,longitude,count,grid_location,rounded_time
count,100.0,100.0,100.0,100.0,100.0
mean,37.821392,-122.27278,1.0,546.13,60.0
std,0.012314,0.01174,0.0,392.646258,0.0
min,37.80416,-122.30041,1.0,9.0,60.0
25%,37.811917,-122.280088,1.0,233.0,60.0
50%,37.817515,-122.27163,1.0,421.0,60.0
75%,37.83042,-122.263477,1.0,833.25,60.0
max,37.85988,-122.25227,1.0,1751.0,60.0


In [16]:
# Disabled experiment, kept for reference:
#df['rounded_time'] = df['rounded_time'].dt.seconds()
# Preview the frame including the new `rounded_time` column.
df.head()

Unnamed: 0,battery_level,id,time,latitude,longitude,count,grid_location,date,day_of_week,rounded_time
0,44,cd6ff74b-cba6-49fe-b115-cec6066fe919,2018-11-09 00:55:30.991958,37.84524,-122.2935,1,1266,2018-11-09,Friday,60
1,48,e20d8a6c-2453-494a-a083-00bad445b189,2018-11-09 00:55:30.991958,37.846,-122.29569,1,1306,2018-11-09,Friday,60
2,48,c8f9bed6-48bd-405d-a290-84bd0148a700,2018-11-09 00:55:30.991958,37.84543,-122.28824,1,1312,2018-11-09,Friday,60
3,38,b7ac0ffe-77dd-40dc-82f1-cb59e0f6fb6b,2018-11-09 00:55:30.991958,37.84618,-122.28023,1,1319,2018-11-09,Friday,60
4,81,7e9d6f7f-90ba-4205-b040-1c5025512467,2018-11-09 00:55:30.991958,37.8293,-122.28083,1,773,2018-11-09,Friday,60


In [17]:
# Add the `wait_time` column.
df = add_wait_time(df)

In [18]:
# Keep one row per (id, rounded_time) pair.
df = drop_repeated_data(df)

In [19]:
def grid_count(df, n=42):
    '''
    Pads the data so that every grid cell appears for every date and time.

    For each combination of a ``date`` and a ``rounded_time`` found in
    ``df`` and each grid location in ``range(n**2)``, a filler row holding
    only those three values is added when ``df`` has no row for that
    combination. Missing values (including ``count`` on filler rows) are
    then replaced by 0 and the result is sorted by ``grid_location``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``date``, ``rounded_time`` and ``grid_location`` columns.
        It is not modified.
    n : int
        Grid size; locations ``0 .. n**2 - 1`` are considered.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the filler rows, NaN-filled with 0 and sorted by
        ``grid_location``.
    '''
    # Combinations already present, for O(1) membership tests. Iterating the
    # Series (rather than the arrays returned by .unique()) yields the same
    # scalar types on both sides of the lookup.
    present = set(zip(df['date'], df['rounded_time'], df['grid_location']))

    # Collect filler rows in a list and build the frame once; appending to a
    # DataFrame inside the loops is quadratic and DataFrame.append no longer
    # exists in pandas 2.0.
    filler_rows = [
        {'rounded_time': rounded_time, 'grid_location': location, 'date': date}
        for date in df['date'].drop_duplicates()
        for rounded_time in df['rounded_time'].drop_duplicates()
        for location in range(n ** 2)
        if (date, rounded_time, location) not in present
    ]

    if filler_rows:
        new_df = pd.DataFrame(
            filler_rows, columns=['rounded_time', 'grid_location', 'date']
        )
        new_df['grid_location'] = new_df['grid_location'].astype(int)
        df3 = pd.concat([df, new_df])
    else:
        # Nothing is missing; the previous code raised KeyError here.
        df3 = df.copy()

    df3 = df3.fillna(0)
    df3 = df3.sort_values(by='grid_location')
    return df3

In [20]:
# Preview the frame after the wait-time and de-duplication steps.
df.head()

Unnamed: 0,battery_level,id,time,latitude,longitude,count,grid_location,date,day_of_week,rounded_time,wait_time
0,44,cd6ff74b-cba6-49fe-b115-cec6066fe919,2018-11-09 00:55:30.991958,37.84524,-122.2935,1,1266,2018-11-09,Friday,60,0 days
1,48,e20d8a6c-2453-494a-a083-00bad445b189,2018-11-09 00:55:30.991958,37.846,-122.29569,1,1306,2018-11-09,Friday,60,0 days
2,48,c8f9bed6-48bd-405d-a290-84bd0148a700,2018-11-09 00:55:30.991958,37.84543,-122.28824,1,1312,2018-11-09,Friday,60,0 days
3,38,b7ac0ffe-77dd-40dc-82f1-cb59e0f6fb6b,2018-11-09 00:55:30.991958,37.84618,-122.28023,1,1319,2018-11-09,Friday,60,0 days
4,81,7e9d6f7f-90ba-4205-b040-1c5025512467,2018-11-09 00:55:30.991958,37.8293,-122.28083,1,773,2018-11-09,Friday,60,0 days


In [21]:
# Pad the data so every grid cell is present for each date / rounded time.
new_df = grid_count(df)
# NOTE(review): a bare expression that is not the last line of the cell
# displays nothing.
new_df
# Disabled experiments, kept for reference:
#df3 = pd.concat([df,new_df])
#df3.drop_duplicates(subset=['grid_location', 'col3'], inplace=True, keep='last')
# Total `count` per (rounded_time, grid_location, date).
new_df = new_df.groupby(['rounded_time', 'grid_location', 'date'], as_index=False)[['count']].sum()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [22]:
#new_df = new_df.groupby(['rounded_time', 'grid_location', 'date'])['count'].sum()
new_df.copy()
merge_df = pd.merge(new_df,new_df,how='left', left_on='grid_location', right_on='grid_location')
double_merge = merge_df[merge_df['rounded_time_x']>merge_df['rounded_time_y']]
double_merge[double_merge['grid_location']==0]

Unnamed: 0,rounded_time_x,grid_location,date_x,count_x,rounded_time_y,date_y,count_y


In [23]:
# Total of `count` over all rows of new_df.
new_df['count'].sum()

100.0

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score

In [58]:
# Features: every column of the aggregated grid data except `date`.
xdf = new_df.drop(['date'], axis=1)


neigh = KNeighborsClassifier(n_neighbors=3)

y = xdf['count']
X = xdf.drop('count', axis=1)

# shuffle=False keeps row order: the trailing rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

neigh.fit(X_train, y_train)

y_pred = neigh.predict(X_test).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

# Mean accuracy on the held-out rows.
print(neigh.score(X_test, y_test))

# recall_score expects (y_true, y_pred); the arguments were swapped, which
# reported precision instead of recall.
print(recall_score(y_test, y_pred))


0.9909297052154195
0.0


  'recall', 'true', average, warn_for)


In [31]:
def MMVP(df):
    '''
    even more mvp than mvp

    Fits a LinearRegression that predicts ``count`` from every other column
    of ``df`` using an unshuffled train/test split, prints the shapes of the
    prediction and target arrays, and returns the model's score on the test
    split.
    '''
    target = df['count']
    features = df.drop('count', axis=1)

    features_train, features_test, target_train, target_test = train_test_split(
        features, target, shuffle=False
    )

    model = LinearRegression()
    model.fit(features_train, target_train)

    # Column vectors, so the two printed shapes can be compared directly.
    predicted = model.predict(features_test).reshape(-1, 1)
    actual = np.array(target_test).reshape(-1, 1)

    print (np.shape(predicted), np.shape(actual))

    return model.score(features_test, actual)


In [32]:
# Model input: the aggregated grid data without the `date` column.
xdf = new_df.drop(['date'], axis=1)

In [33]:
# Fit and score the baseline linear regression.
MMVP(xdf)

(441, 1) (441, 1)


-0.0047306181384951795

In [None]:
def MVP(df):
    '''
    simplest linreg model

    Converts ``date`` to a number, then refits a LinearRegression predicting
    ``count`` on each of the 10 expanding-window splits produced by
    ``TimeSeriesSplit``. The model from the final split is evaluated on that
    split's test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column, a ``count`` column and otherwise
        numeric feature columns. It is not modified.

    Returns
    -------
    float
        Root mean squared error on the test rows of the final split (also
        printed).
    '''
    # assign() returns a new frame, so the caller's `date` column keeps its dtype.
    numeric_df = df.assign(date=pd.to_numeric(df['date']))

    # Plain arrays: the split indices are positions, and indexing a Series
    # with them would look them up as labels instead.
    y = np.asarray(numeric_df['count'])
    X = np.asarray(numeric_df.drop('count', axis=1))

    tscv = TimeSeriesSplit(n_splits=10)

    linreg = LinearRegression()

    for train_index, test_index in tscv.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        linreg.fit(X_train, y_train)

    # Only the last split is scored, as before.
    y_pred = linreg.predict(X_test)

    # RMSE computed with numpy; `metrics` was never imported in this notebook.
    rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
    print('RMSE:', rmse)
    return rmse

In [None]:
# Run the time-series-split linear regression on the aggregated grid data.
MVP(new_df)

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Target is `count`; every other column of df is a feature.
y = df['count']
X = df.loc[:, df.columns != 'count']

tscv = TimeSeriesSplit(n_splits=5)
print(tscv)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # The split yields positions, so index with .iloc[...]. The earlier
    # X.iloc([...]) called the indexer instead of subscripting it, and
    # y[...] looked the positions up as labels.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


In [None]:
from __future__ import print_function
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Lasso cross-validation example on the first 150 rows of the diabetes dataset.
# NOTE(review): this reassigns the notebook-level names X and y.
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]

lasso = Lasso(random_state=0)
# 30 candidate alphas, log-spaced between 1e-4 and 10**-0.5.
alphas = np.logspace(-4, -0.5, 30)

tuned_parameters = [{'alpha': alphas}]
n_folds = 5

# Grid search over alpha; refit=False, so no final model is fitted.
clf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)
clf.fit(X, y)
scores = clf.cv_results_['mean_test_score']
scores_std = clf.cv_results_['std_test_score']
plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, scores)

# Standard error of the scores.
# NOTE(review): std_error is computed but never plotted in this cell.
std_error = scores_std / np.sqrt(n_folds)


# #############################################################################
# Bonus: how much can you trust the selection of alpha?

# To answer this question we use the LassoCV object that sets its alpha
# parameter automatically from the data by internal cross-validation (i.e. it
# performs cross-validation on the training data it receives).
# We use external cross-validation to see how much the automatically obtained
# alphas differ across different cross-validation folds.
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=0)
k_fold = KFold(3)

print("Answer to the bonus question:",
      "how much can you trust the selection of alpha?")
print()
print("Alpha parameters maximising the generalization score on different")
print("subsets of the data:")
# Refit LassoCV on each outer training fold and report the chosen alpha
# together with the score on the matching held-out fold.
for k, (train, test) in enumerate(k_fold.split(X, y)):
    lasso_cv.fit(X[train], y[train])
    print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".
          format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))
print()
print("Answer: Not very much since we obtained different alphas for different")
print("subsets of the data and moreover, the scores for these alphas differ")
print("quite substantially.")

plt.show()

In [None]:
def performTimeSeriesCV(X_train, y_train, number_folds, algorithm, parameters):
    """
    Forward-chaining cross validation for time-ordered data.

    X_train and y_train (the test set is excluded from the Cross Validation)
    are cut into ``number_folds`` consecutive folds of equal size; rows left
    over after the integer division are not used. The model is then trained
    and tested as follows:
    - Train on fold 1, test on 2
    - Train on fold 1-2, test on 3
    - Train on fold 1-2-3, test on 4
    ....

    Parameters
    ----------
    X_train, y_train : sliceable sequences (``X_train`` must expose ``.shape``)
        Time-ordered features and targets.
    number_folds : int
        Number of folds, at least 2.
    algorithm, parameters
        Passed through unchanged to ``performClassification``.

    Returns
    -------
    float
        Mean of the ``number_folds - 1`` test accuracies.

    Raises
    ------
    ValueError
        If ``number_folds`` is smaller than 2 (no fold would be left to test).

    Notes
    -----
    ``performClassification(X_tr, y_tr, X_te, y_te, algorithm, parameters)``
    must be defined elsewhere and return the accuracy of one train/test run.
    """
    if number_folds < 2:
        raise ValueError('number_folds must be at least 2')

    print('Parameters --------------------------------> ', parameters)
    print('Size train set: ', X_train.shape)

    # k is the size of each fold: the number of rows divided by the number
    # of folds, floored.
    k = X_train.shape[0] // number_folds
    print('Size of each fold: ', k)

    # With n folds only n-1 are tested, because the first one is always
    # needed for training.
    accuracies = np.zeros(number_folds - 1)

    # loop from the first 2 folds to the total number of folds
    for i in range(2, number_folds + 1):
        print('')

        # example with i = 4 (first 4 folds):
        #      Splitting the first       4        chunks at          3      /        4
        print('Splitting the first ' + str(i) + ' chunks at ' + str(i-1) + '/' + str(i))

        # X and y grow by one fold per iteration: the first i folds.
        X = X_train[:(k*i)]
        y = y_train[:(k*i)]
        print('Size of train + test: ', X.shape)  # the size is k*i rows

        # Train on the first i-1 folds and test on fold i. Integer arithmetic
        # keeps the boundary exact (no float floor of a ratio).
        index = k * (i - 1)

        # folds used to train the model
        X_trainFolds = X[:index]
        y_trainFolds = y[:index]

        # fold used to test the model; it starts at `index` so that no row
        # between the training and test folds is skipped
        X_testFold = X[index:]
        y_testFold = y[index:]

        # i starts from 2, so the matching slot in accuracies is i-2.
        accuracies[i-2] = performClassification(X_trainFolds, y_trainFolds, X_testFold, y_testFold, algorithm, parameters)

        # example with i = 4:
        #      Accuracy on fold         4     :    0.85423
        print('Accuracy on fold ' + str(i) + ': ', accuracies[i-2])

    # the function returns the mean of the accuracy on the n-1 folds
    return accuracies.mean()

In [None]:
# Toy example: 12 samples with 2 features each, targets 1..12.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4],[1, 2], [3, 4],[3, 4],[1, 2],     [3, 4],[3, 4],[1, 2], [3, 4] ])
y = np.array([1, 2, 3, 4, 5, 6,7,8,9,10,11,12])
# Number of splits derived from the sample count: int((12 - 3) / 3) == 3.
tscv = TimeSeriesSplit(n_splits=int((len(y)-3)/3))
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

    # Select the rows of each split by position.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]