In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

%matplotlib inline
sns.set_style('darkgrid')
sns.set_palette('viridis')

In [2]:
spray = pd.read_csv('../data/spray.csv')

In [3]:
spray.head()

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157
3,2011-08-29,6:57:28 PM,42.390637,-88.089158
4,2011-08-29,6:57:38 PM,42.39041,-88.088858


In [4]:
spray.isnull().sum()

Date           0
Time         584
Latitude       0
Longitude      0
dtype: int64

In [5]:
spray.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14835 entries, 0 to 14834
Data columns (total 4 columns):
Date         14835 non-null object
Time         14251 non-null object
Latitude     14835 non-null float64
Longitude    14835 non-null float64
dtypes: float64(2), object(2)
memory usage: 463.7+ KB


In [6]:
spray[spray['Time'].isnull()]

Unnamed: 0,Date,Time,Latitude,Longitude
1030,2011-09-07,,41.987092,-87.794286
1031,2011-09-07,,41.987620,-87.794382
1032,2011-09-07,,41.988004,-87.794574
1033,2011-09-07,,41.988292,-87.795486
1034,2011-09-07,,41.988100,-87.796014
1035,2011-09-07,,41.986372,-87.794862
1036,2011-09-07,,41.986228,-87.795582
1037,2011-09-07,,41.984836,-87.793998
1038,2011-09-07,,41.984836,-87.794670
1039,2011-09-07,,41.984884,-87.795198


In [7]:
spray.iloc[11499, :]

Date          2013-08-22
Time         10:53:16 PM
Latitude          41.724
Longitude       -87.6222
Name: 11499, dtype: object

In [8]:
spray[spray['Longitude']== -87.615892]

Unnamed: 0,Date,Time,Latitude,Longitude


In [9]:
order_groups = spray.groupby(['Latitude', 'Longitude'], as_index=False).count().sort_values('Date', ascending=False)

In [10]:
order_groups

Unnamed: 0,Latitude,Longitude,Date,Time
11853,41.986460,-87.794225,541,541
11499,41.983917,-87.793088,2,2
0,41.713925,-87.615892,1,1
9533,41.959113,-87.719752,1,1
9522,41.959028,-87.728890,1,1
9523,41.959052,-87.725095,1,1
9524,41.959055,-87.724518,1,1
9525,41.959055,-87.711887,1,1
9526,41.959070,-87.715542,1,1
9527,41.959080,-87.737005,1,1


In [11]:
order_groups.loc[11499, :]['Latitude']

41.9839166666667

In [12]:
lat_long_count = spray.drop('Time', axis=1).groupby(['Latitude', 'Longitude']).agg(['count'])

lat_long_count.columns

lat_long_count = pd.Series(lat_long_count.values.reshape(1,-1)[0], lat_long_count.index)

lat_long_count.sort_values(ascending=False).head()

Latitude   Longitude 
41.986460  -87.794225    541
41.983917  -87.793088      2
41.894413  -87.710262      1
41.894380  -87.772148      1
41.894343  -87.760688      1
dtype: int64

Through manual inspection, we have determined a spray entry which has been duplicated 541 times.  We are going to remove the duplicates from this list.

In [13]:
mask = (spray['Latitude'] == 41.986460) & (spray['Longitude'] == -87.794225)
spray[mask].shape

(541, 4)

In [14]:
spray[mask].index[1:]

Int64Index([ 490,  491,  492,  493,  494,  495,  496,  497,  498,  499,
            ...
            1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029],
           dtype='int64', length=540)

In [15]:
spray.drop(spray[mask].index[1:], inplace=True)

In [16]:
lat_long_count = spray.drop('Time', axis=1).groupby(['Latitude', 'Longitude']).agg(['count'])

lat_long_count = pd.Series(lat_long_count.values.reshape(1,-1)[0], lat_long_count.index)

lat_long_count.sort_values(ascending=False).head()

Latitude   Longitude 
41.983917  -87.793088    2
42.395983  -88.095757    1
41.894157  -87.754473    1
41.894380  -87.772148    1
41.894343  -87.760688    1
dtype: int64

In [17]:
mask = (spray['Longitude'] == -87.7930883333333) & (spray['Latitude'] ==  41.9839166666667)
sum(mask)

2

In [18]:
spray[mask].index[1:]

Int64Index([485], dtype='int64')

In [19]:
spray.drop(spray[mask].index[1:], inplace=True)

In [21]:
lat_long_count = spray.drop('Time', axis=1).groupby(['Latitude', 'Longitude']).agg(['count'])

lat_long_count = pd.Series(lat_long_count.values.reshape(1,-1)[0], lat_long_count.index)

lat_long_count.sort_values(ascending=False).head()

Latitude   Longitude 
42.395983  -88.095757    1
41.894160  -87.767937    1
41.894402  -87.704128    1
41.894380  -87.772148    1
41.894343  -87.760688    1
dtype: int64

In [22]:
spray.drop('Time', axis=1, inplace=True)

In [23]:
spray.to_csv('Spray_Cleaned.csv')