# Data Wrangling & Data Cleaning (Notebook 1_Week 3 Deliverable)

### Import libraries

In [1]:
# import the library
%matplotlib inline

import pandas as pd #pandas library, data structures and data analysis tools for python
import numpy as np #numpy library, multi-dimensional container of generic data, and scientific use
import matplotlib.pyplot as plt #matplotlib for graphs, Python 2D plotting library

# convert scientific notation to foat "decimals"
pd.set_option('display.float_format', lambda x: '%.2f' % x)

### Import the Dataset

In [2]:
# Source for dataset: http://donnees.ville.montreal.qc.ca/dataset/declarations-exterminations-punaises-de-lit/resource/6173de60-c2da-4d63-bc75-0607cb8dcb74
df_bedbugs = pd.read_csv('declarations-exterminations-punaises-de-lit.csv')

## Summarizing data for inspection

#### Types

In [3]:
#Ref.: https://www.geeksforgeeks.org/python-pandas-series-astype-to-convert-data-type-of-series/
print(df_bedbugs.dtypes)

NO_DECLARATION        int64
DATE_DECLARATION     object
DATE_INSP_VISPRE     object
NBR_EXTERMIN        float64
DATE_DEBUTTRAIT      object
DATE_FINTRAIT        object
No_QR                object
NOM_QR               object
NOM_ARROND           object
COORD_X             float64
COORD_Y             float64
LONGITUDE           float64
LATITUDE            float64
dtype: object


#### Converting data types

In [4]:
#Ref [1]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html

# Convert 'DATE_INSP_VISPRE', 'DATE_DEBUTTRAIT', and 'DATE_FINTRAIT'
df_bedbugs['DATE_INSP_VISPRE'] = pd.to_datetime(df_bedbugs['DATE_INSP_VISPRE'])
df_bedbugs['DATE_DEBUTTRAIT'] = pd.to_datetime(df_bedbugs['DATE_DEBUTTRAIT'])
df_bedbugs['DATE_FINTRAIT'] = pd.to_datetime(df_bedbugs['DATE_FINTRAIT'])

# print
print(df_bedbugs['DATE_DECLARATION'].head())

0    2012-10-28T16:36:04
1    2011-09-16T09:45:58
2    2011-11-08T14:01:04
3    2011-08-10T09:53:47
4    2011-10-26T10:11:32
Name: DATE_DECLARATION, dtype: object


In [5]:
# Convert 'DATE_DECLARATION to %Y-%m-%d %H:%M:%S
df_bedbugs['DATE_DECLARATION'] = pd.to_datetime(df_bedbugs['DATE_DECLARATION'])

# Ref. https://stackoverflow.com/questions/51310072/how-to-change-format-of-data-to-ymd-in-pandas
# Convert to string
df_bedbugs['DATE_DECLARATION'] = df_bedbugs['DATE_DECLARATION'].dt.strftime('%Y-%m-%d')

# Convert 'DATE_DECLARATION to %Y-%m-%d
df_bedbugs['DATE_DECLARATION'] = pd.to_datetime(df_bedbugs['DATE_DECLARATION'])

print(df_bedbugs['DATE_DECLARATION'].head())

0   2012-10-28
1   2011-09-16
2   2011-11-08
3   2011-08-10
4   2011-10-26
Name: DATE_DECLARATION, dtype: datetime64[ns]


In [6]:
# Print converted data types
print(df_bedbugs.dtypes)

NO_DECLARATION               int64
DATE_DECLARATION    datetime64[ns]
DATE_INSP_VISPRE    datetime64[ns]
NBR_EXTERMIN               float64
DATE_DEBUTTRAIT     datetime64[ns]
DATE_FINTRAIT       datetime64[ns]
No_QR                       object
NOM_QR                      object
NOM_ARROND                  object
COORD_X                    float64
COORD_Y                    float64
LONGITUDE                  float64
LATITUDE                   float64
dtype: object


### Columns, Head, and Describe Dataset

In [7]:
# view the dataframe index
df_bedbugs.index

RangeIndex(start=0, stop=33365, step=1)

In [8]:
# view the dataframe shape
df_bedbugs.shape

(33365, 13)

In [9]:
len(df_bedbugs)

33365

In [10]:
print('Bed bug extermination declarations')
print('')
print('==================COLUMNS==================')
print(df_bedbugs.columns)
print('')
print('==================HEAD==================')
print(df_bedbugs.head())
print('')
print('==================DESCRIBE==================')
print(df_bedbugs.describe())
print('')
print('==================COUNT==================')
print(df_bedbugs.count())

Bed bug extermination declarations

Index(['NO_DECLARATION', 'DATE_DECLARATION', 'DATE_INSP_VISPRE',
       'NBR_EXTERMIN', 'DATE_DEBUTTRAIT', 'DATE_FINTRAIT', 'No_QR', 'NOM_QR',
       'NOM_ARROND', 'COORD_X', 'COORD_Y', 'LONGITUDE', 'LATITUDE'],
      dtype='object')

   NO_DECLARATION DATE_DECLARATION DATE_INSP_VISPRE  NBR_EXTERMIN  \
0            4254       2012-10-28       2012-09-21          1.00   
1             830       2011-09-16       2011-07-13          1.00   
2            1380       2011-11-08       2011-11-02          1.00   
3             455       2011-08-10       2011-08-09          1.00   
4            1243       2011-10-26       2011-09-16          1.00   

  DATE_DEBUTTRAIT DATE_FINTRAIT No_QR         NOM_QR  \
0      2012-09-21    2012-09-21    24     Beaurivage   
1      2011-07-27    2011-08-17    50    Saint-Henri   
2      2011-11-07    2011-11-21    30   Sainte-Marie   
3      2011-08-09    2011-08-09    44  Upper Lachine   
4      2011-10-05    2011-10-05   

#### Fixing column name

In [11]:
df_bedbugs.columns = ['NO_DECLARATION', 'DATE_DECLARATION', 'DATE_PRIOR_INSP', 'EXT_FREQ','DATE_FIRST_EXT','DATE_LAST_EXT',
                      'HOOD_NUM','HOOD_NAME','BORO_NAME','MTM8_X','MTM8_Y','LONGITUDE','LATITUDE']
print('==================COLUMNS==================')
print(df_bedbugs.columns)
print('')

Index(['NO_DECLARATION', 'DATE_DECLARATION', 'DATE_PRIOR_INSP', 'EXT_FREQ',
       'DATE_FIRST_EXT', 'DATE_LAST_EXT', 'HOOD_NUM', 'HOOD_NAME', 'BORO_NAME',
       'MTM8_X', 'MTM8_Y', 'LONGITUDE', 'LATITUDE'],
      dtype='object')



#### Adding new columns for intersections

In [12]:
#create a new column with longitude and latitude for an intersection

# Ref.: https://stackoverflow.com/questions/19377969/combine-two-columns-of-text-in-dataframe-in-pandas-python/36911306
df_bedbugs['LONG_LAT'] = df_bedbugs['LONGITUDE'].map(str)+"_"+df_bedbugs['LATITUDE'].map(str)
df_bedbugs['LONG_LAT'].head()

0            -73.513411_45.588426
1            -73.585437_45.472569
2            -73.557668_45.521253
3            -73.611941_45.468327
4    -73.56756899999999_45.550652
Name: LONG_LAT, dtype: object

In [13]:
# Concatenate MTM8_X and MTM8_Y
#https://stackoverflow.com/questions/42520266/concatenating-two-floats-into-one-column-in-pandas
# df['MTMXY'] = [', '.join(str(x) for x in y) for y in map(tuple, df[['MTM8_X', 'MTM8_Y']].values)]

df_bedbugs['MTM_X_Y'] = df_bedbugs['MTM8_X'].map(str)+"_"+df_bedbugs['MTM8_Y'].map(str)
df_bedbugs['MTM_X_Y'].head()

0    303753.6_5049835.7
1    298119.8_5036963.7
2    300294.9_5042372.1
3    296046.9_5036494.9
4    299524.2_5045639.9
Name: MTM_X_Y, dtype: object

In [14]:
df_bedbugs.shape

(33365, 15)

#### Adding new columns: declaration month, and declaration row unit

In [15]:
# Create a column extracting month
df_bedbugs['DEC_MONTH'] = pd.DatetimeIndex(df_bedbugs['DATE_DECLARATION']).month

In [16]:
# Create Declared Incidents column for regression purposes
df_bedbugs['DEC_ISSUE'] = 1

#### Difference between Declaration and Inspection Dates

In [17]:
# Ref.: https://docs.scipy.org/doc/numpy/reference/arrays.datetime.html

# Create difference between Declaration and Inspection
df_bedbugs['DATE_DIFF'] = round( (df_bedbugs['DATE_DECLARATION'] - df_bedbugs['DATE_PRIOR_INSP'] ) 
                                / np.timedelta64(1,'D') )

#df['DATE_DIFF'].fillna(0, inplace=True)

In [18]:
print(df_bedbugs['DATE_DIFF'].describe())

count   33357.00
mean       37.33
std        78.70
min      -348.00
25%         6.00
50%        15.00
75%        39.00
max      3737.00
Name: DATE_DIFF, dtype: float64


#### Declarations at the same intersection (longitude and latitude)

In [19]:
print(str(len(df_bedbugs.groupby(['LONG_LAT'])))
      +' longitude and latitude group-locations')

4873 longitude and latitude group-locations


In [43]:
#verification
((df_bedbugs.groupby(['LONG_LAT'])
       .agg({'NO_DECLARATION':'count'}))
       .sort_values(by = ['NO_DECLARATION'],ascending=False)
       .rename(columns={'NO_DECLARATION':'DECL_QT'})
       .count())

DECL_QT    4873
dtype: int64

In [20]:
# Count of location : 1 = unique, and not equal to 1 is reoccuring
df_bedbugs.groupby(by = 'LONG_LAT')['NO_DECLARATION'].count().head()

LONG_LAT
-73.484312_45.694138    1
-73.485599_45.698515    1
-73.487241_45.695382    6
-73.487936_45.649078    3
-73.489228_45.693533    1
Name: NO_DECLARATION, dtype: int64

In [34]:
print('==================TOP_10_INTERSECTIONS_BASED_ON_DECLARATIONS==================')
((df_bedbugs.groupby(['LONG_LAT','HOOD_NAME','BORO_NAME'])
       .agg({'NO_DECLARATION':'count'}))
       .sort_values(by = ['NO_DECLARATION'],ascending=False)
       .rename(columns={'NO_DECLARATION':'DECL_QT'})
       .head())



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,DECL_QT
LONG_LAT,HOOD_NAME,BORO_NAME,Unnamed: 3_level_1
-73.571239_45.584338,Grande-Prairie,Saint-Léonard,264
-73.659233_45.569024,Sault-au-Récollet,Ahuntsic-Cartierville,181
-73.620588_45.551456,Crémazie,Villeray–Saint-Michel–Parc-Extension,151
-73.582592_45.49409,Montagne,Ville-Marie,148
-73.68714399999999_45.518173,Grenet,Saint-Laurent,139


In [30]:
# Ref. : hhttps://stackoverflow.com/questions/12765833/counting-the-number-of-true-booleans-in-a-python-list

# Number of reoccuring locations
print(str((df_bedbugs.groupby(by = 'LONG_LAT')['NO_DECLARATION'].count()>1).sum() )
    +' longitude and latitude reoccuring group-locations')

3395 longitude and latitude reoccuring group-locations


In [31]:
# Number of reoccuring locations, where DATE_FIRST_EXT and DATE_LAST_EXT at times co-inside
(df_bedbugs.groupby(by = 'LONG_LAT').count()>1).sum()

NO_DECLARATION      3395
DATE_DECLARATION    3395
DATE_PRIOR_INSP     3395
EXT_FREQ            3243
DATE_FIRST_EXT      3243
DATE_LAST_EXT       3243
HOOD_NUM            3395
HOOD_NAME           3395
BORO_NAME           3395
MTM8_X              3395
MTM8_Y              3395
LONGITUDE           3395
LATITUDE            3395
MTM_X_Y             3395
DEC_MONTH           3395
DEC_ISSUE           3395
DATE_DIFF           3395
dtype: int64

In [32]:
print('Out of '+str(len(df_bedbugs))
      + ' declarations, there are '+str(len(df_bedbugs.groupby(['LONG_LAT'])))+' intersections where '
      + str((df_bedbugs.groupby(by = 'LONG_LAT')['NO_DECLARATION'].count()>1).sum() )
      + ' intersections will more than one declaration.')
print('')
print('Also, there are case for intersections with more than one declaration, where the dates of the first and last extermination are repeated.')

Out of 33365 declarations, there are 4873 intersections where 3395 intersections will more than one declaration.

Also, there are case for intersections with more than one declaration, where the dates of the first and last extermination are repeated.


#### Intersections shared by boroughs and neighborhoods

In [51]:
# coordinates -73.587965_45.564218 show an intersection with near multi-unit building.

((df_null.groupby(['LONG_LAT','HOOD_NAME','BORO_NAME'])
       .agg({'EXT_FREQ': sum, 'NO_DECLARATION':'count'}))
       .sort_values(by = ['EXT_FREQ','NO_DECLARATION'],ascending=False)
       .rename(columns={'EXT_FREQ':'EXT_QT','NO_DECLARATION':'DECL_QT'})
       .head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EXT_QT,DECL_QT
LONG_LAT,HOOD_NAME,BORO_NAME,Unnamed: 3_level_1,Unnamed: 4_level_1
-73.587965_45.564218,François-Perrault,Villeray–Saint-Michel–Parc-Extension,0.0,64
-73.575695_45.510427,Milton-Parc,Le Plateau-Mont-Royal,0.0,32
-73.581705_45.493575,Montagne,Ville-Marie,0.0,25
-73.643845_45.466798,Loyola,Côte-des-Neiges–Notre-Dame-de-Grâce,0.0,24
-73.58131_45.490684,Montagne,Ville-Marie,0.0,21


In [52]:
((df_null[(df_null['LONG_LAT']=='-73.587965_45.564218')]
       .groupby(['LONG_LAT','HOOD_NAME','BORO_NAME'])
       .agg({'EXT_FREQ': sum, 'NO_DECLARATION':'count'}))
       .sort_values(by = ['EXT_FREQ','NO_DECLARATION'],ascending=False)
       .rename(columns={'EXT_FREQ':'EXT_QT','NO_DECLARATION':'DECL_QT'})
       .head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EXT_QT,DECL_QT
LONG_LAT,HOOD_NAME,BORO_NAME,Unnamed: 3_level_1,Unnamed: 4_level_1
-73.587965_45.564218,François-Perrault,Villeray–Saint-Michel–Parc-Extension,0.0,64
-73.587965_45.564218,Étienne Desmarteaux,Rosemont–La Petite-Patrie,0.0,1


In [57]:
((df_notnull[(df_notnull['LONG_LAT']=='-73.587965_45.564218')]
       .groupby(['LONG_LAT','HOOD_NAME','BORO_NAME'])
       .agg({'EXT_FREQ': sum, 'NO_DECLARATION':'count'}))
       .sort_values(by = ['EXT_FREQ','NO_DECLARATION'],ascending=False)
       .rename(columns={'EXT_FREQ':'EXT_QT','NO_DECLARATION':'DECL_QT'})
       .head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EXT_QT,DECL_QT
LONG_LAT,HOOD_NAME,BORO_NAME,Unnamed: 3_level_1,Unnamed: 4_level_1
-73.587965_45.564218,François-Perrault,Villeray–Saint-Michel–Parc-Extension,23.0,17
-73.587965_45.564218,Étienne Desmarteaux,Rosemont–La Petite-Patrie,12.0,9


-73.587965_45.564218 is a location intersection that falls on the border between two boroughs.

###### Review of intersections 

In [43]:
#verification
((df_bedbugs.groupby(['LONG_LAT'])
       .agg({'NO_DECLARATION':'count'}))
       .sort_values(by = ['NO_DECLARATION'],ascending=False)
       .rename(columns={'NO_DECLARATION':'DECL_QT'})
       .count())

DECL_QT    4873
dtype: int64

In [60]:
print('==================COUNT_OF_INTERSECTIONS_INCL_BOROUGH==================')
((df_bedbugs.groupby(['LONG_LAT','BORO_NAME'])
       .agg({'NO_DECLARATION':'count'}))
       .sort_values(by = ['NO_DECLARATION'],ascending=False)
       .rename(columns={'NO_DECLARATION':'DECL_QT'})
       .count())



DECL_QT    4921
dtype: int64

In [61]:
print('==================COUNT_OF_INTERSECTIONS_INCL_NEIGHBORHOOD==================')
((df_bedbugs.groupby(['LONG_LAT','HOOD_NAME'])
       .agg({'NO_DECLARATION':'count'}))
       .sort_values(by = ['NO_DECLARATION'],ascending=False)
       .rename(columns={'NO_DECLARATION':'DECL_QT'})
       .count())



DECL_QT    5052
dtype: int64

There is an overlap between boroughs for 48 intersections (=4921 - 4873).
There is an overlap between neighborhoods for 179 intersections (=5052 - 4873).

#### Group of declarations and considering number of events

In [None]:
# For declarations sharing intersections, the number of summed reports where more than 4 events occured is 1755
(df_bedbugs.groupby(by = 'LONG_LAT')['EXT_FREQ'].count()>4).sum()

In [None]:
# Verification
df_bedbugs.groupby(by = 'LONG_LAT')['EXT_FREQ'].count().head()

In [None]:
# Intersection with cases of more than one report
df_bedbugs[(df_bedbugs['LONGITUDE']==-73.490471) & (df_bedbugs['LATITUDE']==45.651309000000005)]

### Dealing with missing values
How to deal with the missing values? Should we remove the rows or fill the gap with a value?

#### Review structure of not null dataframe

In [54]:
df_notnull = df_bedbugs[pd.notnull(df_bedbugs['EXT_FREQ'])]
df_notnull.describe()

Unnamed: 0,NO_DECLARATION,EXT_FREQ,MTM8_X,MTM8_Y,LONGITUDE,LATITUDE,DEC_MONTH,DEC_ISSUE,DATE_DIFF
count,31241.0,31241.0,31241.0,31241.0,31241.0,31241.0,31241.0,31241.0,31241.0
mean,17061.36,1.5,297102.04,5044105.27,-73.6,45.54,7.13,1.0,36.79
std,9822.28,0.91,3671.99,4772.25,0.05,0.04,3.34,0.0,78.21
min,104.0,1.0,274266.1,5030733.0,-73.89,45.42,1.0,1.0,-348.0
25%,8563.0,1.0,294890.3,5041950.6,-73.63,45.52,4.0,1.0,6.0
50%,17028.0,1.0,297706.6,5044179.1,-73.59,45.54,8.0,1.0,15.0
75%,25555.0,2.0,299678.3,5046440.5,-73.57,45.56,10.0,1.0,39.0
max,34130.0,4.0,305921.5,5062070.2,-73.49,45.7,12.0,1.0,3737.0


In [55]:
# Review lead time between the declaration dates and prior inspection date, for null values
(df_notnull['DATE_DECLARATION'] - df_notnull['DATE_PRIOR_INSP']).describe()

count                      31241
mean     36 days 19:03:01.261803
std      78 days 05:02:23.472064
min          -348 days +00:00:00
25%              6 days 00:00:00
50%             15 days 00:00:00
75%             39 days 00:00:00
max           3737 days 00:00:00
dtype: object

#### Identify count of missing values

In [56]:
print('==================TYPES==================')
print(df_bedbugs.dtypes)
print('')
print('==================NULL_VALUES==================')
# Check for null values
print(df_bedbugs.isnull().sum())

NO_DECLARATION               int64
DATE_DECLARATION    datetime64[ns]
DATE_PRIOR_INSP     datetime64[ns]
EXT_FREQ                   float64
DATE_FIRST_EXT      datetime64[ns]
DATE_LAST_EXT       datetime64[ns]
HOOD_NUM                    object
HOOD_NAME                   object
BORO_NAME                   object
MTM8_X                     float64
MTM8_Y                     float64
LONGITUDE                  float64
LATITUDE                   float64
LONG_LAT                    object
MTM_X_Y                     object
DEC_MONTH                    int64
DEC_ISSUE                    int64
DATE_DIFF                  float64
dtype: object

NO_DECLARATION         0
DATE_DECLARATION       0
DATE_PRIOR_INSP        8
EXT_FREQ            2124
DATE_FIRST_EXT      2124
DATE_LAST_EXT       2124
HOOD_NUM               0
HOOD_NAME              0
BORO_NAME              0
MTM8_X                 0
MTM8_Y                 0
LONGITUDE              0
LATITUDE               0
LONG_LAT               0
MTM_X

#### Analyse Null dataframe 

In [47]:
# Populate blank values in EXT_FREQ 
df_null = df_bedbugs[pd.isnull(df_bedbugs['EXT_FREQ'])]
df_null.describe()

Unnamed: 0,NO_DECLARATION,EXT_FREQ,MTM8_X,MTM8_Y,LONGITUDE,LATITUDE,DEC_MONTH,DEC_ISSUE,DATE_DIFF
count,2124.0,0.0,2124.0,2124.0,2124.0,2124.0,2124.0,2124.0,2116.0
mean,17974.31,,296367.48,5043847.46,-73.61,45.53,7.26,1.0,45.19
std,9741.99,,3484.39,4513.88,0.04,0.04,3.12,0.0,85.3
min,356.0,,275320.3,5030909.0,-73.88,45.42,1.0,1.0,-27.0
25%,9083.5,,294450.4,5041870.0,-73.63,45.52,5.0,1.0,3.0
50%,18132.0,,295991.4,5043601.2,-73.61,45.53,7.0,1.0,17.0
75%,26906.25,,298803.0,5046221.33,-73.58,45.56,10.0,1.0,45.0
max,34114.0,,306021.8,5061583.8,-73.48,45.69,12.0,1.0,1165.0


In [48]:
df_null

Unnamed: 0,NO_DECLARATION,DATE_DECLARATION,DATE_PRIOR_INSP,EXT_FREQ,DATE_FIRST_EXT,DATE_LAST_EXT,HOOD_NUM,HOOD_NAME,BORO_NAME,MTM8_X,MTM8_Y,LONGITUDE,LATITUDE,LONG_LAT,MTM_X_Y,DEC_MONTH,DEC_ISSUE,DATE_DIFF
5,4331,2012-11-08,2012-10-10,,NaT,NaT,20,Marie-Victorin,Rosemont–La Petite-Patrie,298622.20,5047041.40,-73.58,45.56,-73.579138_45.563256,298622.2_5047041.4,11,1,29.00
25,3117,2012-07-09,2012-06-01,,NaT,NaT,70,Vieux-Lachine - Saint-Pierre,Lachine,291759.20,5033576.50,-73.67,45.44,-73.666698_45.441998999999996,291759.2_5033576.5,7,1,38.00
39,2202,2012-02-19,2011-11-02,,NaT,NaT,39,Montagne,Ville-Marie,298505.60,5039407.60,-73.58,45.49,-73.580535_45.494563,298505.6_5039407.6,2,1,109.00
41,1896,2011-12-22,2011-12-22,,NaT,NaT,11,Parc-Extension,Villeray–Saint-Michel–Parc-Extension,294566.70,5043001.20,-73.63,45.53,-73.631007_45.526853,294566.7_5043001.2,12,1,0.00
45,3367,2012-07-27,2012-07-16,,NaT,NaT,26,Longue-Pointe,Mercier–Hochelaga-Maisonneuve,301819.90,5048615.20,-73.54,45.58,-73.538186_45.577438,301819.9_5048615.2,7,1,11.00
111,7054,2013-06-12,2013-04-12,,NaT,NaT,14,Saint-Édouard,Rosemont–La Petite-Patrie,296124.70,5043220.90,-73.61,45.53,-73.611065_45.528851,296124.7_5043220.9,6,1,61.00
137,7412,2013-06-26,2013-06-17,,NaT,NaT,45,Loyola,Côte-des-Neiges–Notre-Dame-de-Grâce,293551.90,5036329.00,-73.64,45.47,-73.643845_45.466798,293551.9_5036329.0,6,1,9.00
167,10943,2014-02-03,2013-11-21,,NaT,NaT,11,Parc-Extension,Villeray–Saint-Michel–Parc-Extension,294815.80,5042767.20,-73.63,45.52,-73.627812_45.524751,294815.8_5042767.2,2,1,74.00
184,10069,2013-11-28,2013-10-15,,NaT,NaT,30,Sainte-Marie,Ville-Marie,300179.30,5043509.90,-73.56,45.53,-73.559159_45.531490999999995,300179.3_5043509.9,11,1,44.00
189,11591,2014-03-22,2014-03-12,,NaT,NaT,20,Marie-Victorin,Rosemont–La Petite-Patrie,298529.90,5048250.50,-73.58,45.57,-73.580336_45.574135,298529.9_5048250.5,3,1,10.00


In [49]:
# Review lead time between the declaration dates and prior inspection date, for null values
(df_null['DATE_DECLARATION'] - df_null['DATE_PRIOR_INSP']).describe()

count                       2116
mean     45 days 04:31:31.871455
std      85 days 07:08:45.898195
min           -27 days +00:00:00
25%              3 days 00:00:00
50%             17 days 00:00:00
75%             45 days 00:00:00
max           1165 days 00:00:00
dtype: object

In [50]:
((df_null.groupby(['BORO_NAME'])
       .agg({'EXT_FREQ': sum, 'NO_DECLARATION':'count'}))
       .sort_values(by = ['EXT_FREQ','NO_DECLARATION'],ascending=False)
       .rename(columns={'EXT_FREQ':'EXT_QT','NO_DECLARATION':'DECL_QT'})
       .head(5))

Unnamed: 0_level_0,EXT_QT,DECL_QT
BORO_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1
Villeray–Saint-Michel–Parc-Extension,0.0,606
Rosemont–La Petite-Patrie,0.0,226
Le Plateau-Mont-Royal,0.0,177
Ville-Marie,0.0,173
Mercier–Hochelaga-Maisonneuve,0.0,170


#### Populate blank values in EXT_FREQ

In [None]:
# Assumption that a least one extermination occurs per every declaration.

# Populate blank values in EXT_FREQ 
df_bedbugs['EXT_FREQ'].fillna(1, inplace=True)

#### Populate blanks with DATE_DECLARATION date

In [None]:
# Populate blanks with DATE_DECLARATION date
df_bedbugs['DATE_FIRST_EXT'] = df_bedbugs['DATE_FIRST_EXT'].fillna(df_bedbugs['DATE_DECLARATION'].dt.date)



In [None]:
print('==================NULL_VALUES==================')
# Check for null values
print(df_bedbugs.isnull().sum())

# Dealing with outliers

### Removing outliers

Removal of outliers is not an applicable step in the process. It is confirmed whether multiple extermination reports are split over several declarations, which causes a high variability in the dates and extermination reports per declaration. The maximumum amount of reports per declaration is set to four as per the scope dictionary. Until the relationship between the dates is understood, the use of a binomial distribution and subsequent removal of outliers cannot be enforced at this time.

### Merging Data Sets

In [None]:
df_bedbugs_base = df_bedbugs.copy()

### Saving the processed file

In [None]:
df_bedbugs_base.to_csv('declarations-exterminations-punaises-de-lit-1.csv', index=False)

In [None]:
print('==================TYPES==================')
print(df_bedbugs_base.dtypes)
print('')
print('==================COLUMNS==================')
print(df_bedbugs.columns)
print('')