# Data Wrangling & Data Cleaning (Notebook 1_Week 3 Deliverable)

### Import libraries

In [1]:
# import the library
%matplotlib inline

import pandas as pd #pandas library, data structures and data analysis tools for python
import numpy as np #numpy library, multi-dimensional container of generic data, and scientific use
import matplotlib.pyplot as plt #matplotlib for graphs, Python 2D plotting library

# convert scientific notation to foat "decimals"
pd.set_option('display.float_format', lambda x: '%.2f' % x)

### Load the data

In [2]:
df_bedbugs = pd.read_csv('declarations-exterminations-punaises-de-lit.csv')

## Summarizing data for inspection

#### Types

In [3]:
#Ref.: https://www.geeksforgeeks.org/python-pandas-series-astype-to-convert-data-type-of-series/
print(df_bedbugs.dtypes)

NO_DECLARATION        int64
DATE_DECLARATION     object
DATE_INSP_VISPRE     object
NBR_EXTERMIN        float64
DATE_DEBUTTRAIT      object
DATE_FINTRAIT        object
No_QR                object
NOM_QR               object
NOM_ARROND           object
COORD_X             float64
COORD_Y             float64
LONGITUDE           float64
LATITUDE            float64
dtype: object


#### Converting data types

In [4]:
#Ref [1]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html

# Convert 'DATE_INSP_VISPRE', 'DATE_DEBUTTRAIT', and 'DATE_FINTRAIT'
df_bedbugs['DATE_INSP_VISPRE'] = pd.to_datetime(df_bedbugs['DATE_INSP_VISPRE'])
df_bedbugs['DATE_DEBUTTRAIT'] = pd.to_datetime(df_bedbugs['DATE_DEBUTTRAIT'])
df_bedbugs['DATE_FINTRAIT'] = pd.to_datetime(df_bedbugs['DATE_FINTRAIT'])

# print
print(df_bedbugs['DATE_DECLARATION'].head())

0    2012-10-28T16:36:04
1    2011-09-16T09:45:58
2    2011-11-08T14:01:04
3    2011-08-10T09:53:47
4    2011-10-26T10:11:32
Name: DATE_DECLARATION, dtype: object


In [5]:
# Convert 'DATE_DECLARATION to %Y-%m-%d %H:%M:%S
df_bedbugs['DATE_DECLARATION'] = pd.to_datetime(df_bedbugs['DATE_DECLARATION'])

# Ref. https://stackoverflow.com/questions/51310072/how-to-change-format-of-data-to-ymd-in-pandas
# Convert to string
df_bedbugs['DATE_DECLARATION'] = df_bedbugs['DATE_DECLARATION'].dt.strftime('%Y-%m-%d')

# Convert 'DATE_DECLARATION to %Y-%m-%d
df_bedbugs['DATE_DECLARATION'] = pd.to_datetime(df_bedbugs['DATE_DECLARATION'])

print(df_bedbugs['DATE_DECLARATION'].head())

0   2012-10-28
1   2011-09-16
2   2011-11-08
3   2011-08-10
4   2011-10-26
Name: DATE_DECLARATION, dtype: datetime64[ns]


In [6]:
# Print converted data types
print(df_bedbugs.dtypes)

NO_DECLARATION               int64
DATE_DECLARATION    datetime64[ns]
DATE_INSP_VISPRE    datetime64[ns]
NBR_EXTERMIN               float64
DATE_DEBUTTRAIT     datetime64[ns]
DATE_FINTRAIT       datetime64[ns]
No_QR                       object
NOM_QR                      object
NOM_ARROND                  object
COORD_X                    float64
COORD_Y                    float64
LONGITUDE                  float64
LATITUDE                   float64
dtype: object


### Columns, Head, and Describe Dataset

In [7]:
# view the dataframe index
df_bedbugs.index

RangeIndex(start=0, stop=33365, step=1)

In [8]:
# view the dataframe shape
df_bedbugs.shape

(33365, 13)

In [9]:
len(df_bedbugs)

33365

In [10]:
print('Bed bug extermination declarations')
print('')
print('==================COLUMNS==================')
print(df_bedbugs.columns)
print('')
print('==================HEAD==================')
print(df_bedbugs.head())
print('')
print('==================DESCRIBE==================')
print(df_bedbugs.describe())
print('')
print('==================COUNT==================')
print(df_bedbugs.count())

Bed bug extermination declarations

Index(['NO_DECLARATION', 'DATE_DECLARATION', 'DATE_INSP_VISPRE',
       'NBR_EXTERMIN', 'DATE_DEBUTTRAIT', 'DATE_FINTRAIT', 'No_QR', 'NOM_QR',
       'NOM_ARROND', 'COORD_X', 'COORD_Y', 'LONGITUDE', 'LATITUDE'],
      dtype='object')

   NO_DECLARATION DATE_DECLARATION DATE_INSP_VISPRE  NBR_EXTERMIN  \
0            4254       2012-10-28       2012-09-21          1.00   
1             830       2011-09-16       2011-07-13          1.00   
2            1380       2011-11-08       2011-11-02          1.00   
3             455       2011-08-10       2011-08-09          1.00   
4            1243       2011-10-26       2011-09-16          1.00   

  DATE_DEBUTTRAIT DATE_FINTRAIT No_QR         NOM_QR  \
0      2012-09-21    2012-09-21    24     Beaurivage   
1      2011-07-27    2011-08-17    50    Saint-Henri   
2      2011-11-07    2011-11-21    30   Sainte-Marie   
3      2011-08-09    2011-08-09    44  Upper Lachine   
4      2011-10-05    2011-10-05   

### Rename columns

In [11]:
df_bedbugs.columns = ['NO_DECLARATION', 'DATE_DECLARATION', 'DATE_PRIOR_INSP', 'EXT_FREQ','DATE_FIRST_EXT','DATE_LAST_EXT',
                      'HOOD_NUM','HOOD_NAME','BORO_NAME','MTM8_X','MTM8_Y','LONGITUDE','LATITUDE']
print('==================COLUMNS==================')
print(df_bedbugs.columns)
print('')

Index(['NO_DECLARATION', 'DATE_DECLARATION', 'DATE_PRIOR_INSP', 'EXT_FREQ',
       'DATE_FIRST_EXT', 'DATE_LAST_EXT', 'HOOD_NUM', 'HOOD_NAME', 'BORO_NAME',
       'MTM8_X', 'MTM8_Y', 'LONGITUDE', 'LATITUDE'],
      dtype='object')



In [12]:
#create a new column with longitude and latitude for an intersection

# Ref.: https://stackoverflow.com/questions/19377969/combine-two-columns-of-text-in-dataframe-in-pandas-python/36911306
df_bedbugs['LONG_LAT'] = df_bedbugs['LONGITUDE'].map(str)+"_"+df_bedbugs['LATITUDE'].map(str)
df_bedbugs['LONG_LAT'].head()

0            -73.513411_45.588426
1            -73.585437_45.472569
2            -73.557668_45.521253
3            -73.611941_45.468327
4    -73.56756899999999_45.550652
Name: LONG_LAT, dtype: object

In [13]:
# Concatenate MTM8_X and MTM8_Y
#https://stackoverflow.com/questions/42520266/concatenating-two-floats-into-one-column-in-pandas
# df['MTMXY'] = [', '.join(str(x) for x in y) for y in map(tuple, df[['MTM8_X', 'MTM8_Y']].values)]

df_bedbugs['MTM_X_Y'] = df_bedbugs['MTM8_X'].map(str)+"_"+df_bedbugs['MTM8_Y'].map(str)
df_bedbugs['MTM_X_Y'].head()

0    303753.6_5049835.7
1    298119.8_5036963.7
2    300294.9_5042372.1
3    296046.9_5036494.9
4    299524.2_5045639.9
Name: MTM_X_Y, dtype: object

In [14]:
df_bedbugs.shape

(33365, 15)

#### Declarations at the same intersection (longitude and latitude)

In [15]:
print(str(len(df_bedbugs.groupby(['LONG_LAT'])))
      +' longitude and latitude group-locations')

4873 longitude and latitude group-locations


In [16]:
# Count of location : 1 = unique, and not equal to 1 is reoccuring
df_bedbugs.groupby(by = 'LONG_LAT')['NO_DECLARATION'].count().head()

LONG_LAT
-73.484312_45.694138    1
-73.485599_45.698515    1
-73.487241_45.695382    6
-73.487936_45.649078    3
-73.489228_45.693533    1
Name: NO_DECLARATION, dtype: int64

In [17]:
df_bedbugs.groupby(by = 'LONG_LAT')['NO_DECLARATION'].count().tail()

LONG_LAT
-73.881748_45.459389            1
-73.88297299999999_45.453919    1
-73.887183_45.453657            1
-73.889477_45.458117            1
-73.890405_45.456179            2
Name: NO_DECLARATION, dtype: int64

In [18]:
# Verification of reoccuring
df_bedbugs[(df_bedbugs['LONGITUDE']==-73.487241) & (df_bedbugs['LATITUDE']==45.695382)]

Unnamed: 0,NO_DECLARATION,DATE_DECLARATION,DATE_PRIOR_INSP,EXT_FREQ,DATE_FIRST_EXT,DATE_LAST_EXT,HOOD_NUM,HOOD_NAME,BORO_NAME,MTM8_X,MTM8_Y,LONGITUDE,LATITUDE,LONG_LAT,MTM_X_Y
2729,15213,2014-11-28,2014-11-04,1.0,2014-11-06,2014-11-06,54,Pointe-aux-Trembles,Rivière-des-Prairies–Pointe-aux-Trembles,305793.7,5061722.0,-73.49,45.7,-73.487241_45.695382,305793.7_5061722.0
8716,15772,2015-01-08,2014-12-29,1.0,2015-01-05,2015-01-05,54,Pointe-aux-Trembles,Rivière-des-Prairies–Pointe-aux-Trembles,305793.7,5061722.0,-73.49,45.7,-73.487241_45.695382,305793.7_5061722.0
8852,32152,2018-08-02,2018-06-08,1.0,2018-06-08,2018-06-08,54,Pointe-aux-Trembles,Rivière-des-Prairies–Pointe-aux-Trembles,305793.7,5061722.0,-73.49,45.7,-73.487241_45.695382,305793.7_5061722.0
15665,15393,2014-12-09,2014-11-17,1.0,2014-11-19,2014-11-19,54,Pointe-aux-Trembles,Rivière-des-Prairies–Pointe-aux-Trembles,305793.7,5061722.0,-73.49,45.7,-73.487241_45.695382,305793.7_5061722.0
18368,33208,2018-10-23,2018-10-19,1.0,2018-10-19,2018-10-19,54,Pointe-aux-Trembles,Rivière-des-Prairies–Pointe-aux-Trembles,305793.7,5061722.0,-73.49,45.7,-73.487241_45.695382,305793.7_5061722.0
26054,15693,2015-01-06,2014-12-04,1.0,2014-12-08,2014-12-08,54,Pointe-aux-Trembles,Rivière-des-Prairies–Pointe-aux-Trembles,305793.7,5061722.0,-73.49,45.7,-73.487241_45.695382,305793.7_5061722.0


In [19]:
# Verify group for any null values
(df_bedbugs.groupby(by = 'LONG_LAT').count()).isnull().sum()

NO_DECLARATION      0
DATE_DECLARATION    0
DATE_PRIOR_INSP     0
EXT_FREQ            0
DATE_FIRST_EXT      0
DATE_LAST_EXT       0
HOOD_NUM            0
HOOD_NAME           0
BORO_NAME           0
MTM8_X              0
MTM8_Y              0
LONGITUDE           0
LATITUDE            0
MTM_X_Y             0
dtype: int64

In [20]:
# Ref. : hhttps://stackoverflow.com/questions/12765833/counting-the-number-of-true-booleans-in-a-python-list

# Number of reoccuring locations
print(str((df_bedbugs.groupby(by = 'LONG_LAT')['NO_DECLARATION'].count()>1).sum() )
    +' longitude and latitude reoccuring group-locations')

3395 longitude and latitude reoccuring group-locations


In [21]:
# Number of reoccuring locations, where DATE_FIRST_EXT and DATE_LAST_EXT at times co-inside
(df_bedbugs.groupby(by = 'LONG_LAT').count()>1).sum()

NO_DECLARATION      3395
DATE_DECLARATION    3395
DATE_PRIOR_INSP     3395
EXT_FREQ            3243
DATE_FIRST_EXT      3243
DATE_LAST_EXT       3243
HOOD_NUM            3395
HOOD_NAME           3395
BORO_NAME           3395
MTM8_X              3395
MTM8_Y              3395
LONGITUDE           3395
LATITUDE            3395
MTM_X_Y             3395
dtype: int64

#### Group of declarations

In [22]:
df_bedbugs(['LONGITUDE','LATITUDE']).size()>4

TypeError: 'DataFrame' object is not callable

### Dealing with missing values
How to deal with the missing values? Should we remove the rows or fill the gap with a value?

In [23]:
# Number of missing values by columns
print('==================COLUMNS_WITH_MISSING_VALUES==================')
print(df_bedbugs.isnull().sum())
print('')

NO_DECLARATION         0
DATE_DECLARATION       0
DATE_PRIOR_INSP        8
EXT_FREQ            2124
DATE_FIRST_EXT      2124
DATE_LAST_EXT       2124
HOOD_NUM               0
HOOD_NAME              0
BORO_NAME              0
MTM8_X                 0
MTM8_Y                 0
LONGITUDE              0
LATITUDE               0
LONG_LAT               0
MTM_X_Y                0
dtype: int64



### Removing outliers
Some algorithms are very sensitive to outliers. Considering the number of bedrooms, should we remove houses with an extreme number of bedrooms? How many bedrooms are too many? (Suggestion: as a rule of thumb, three standard deviations from the mean is a good measure to identify outliers).

# Dealing with outliers
houses_to_remove = []


### Merging Data Sets

In [24]:
df_bedbugs_base = df_bedbugs.copy()

### Saving the processed file

In [25]:
df_bedbugs_base.to_csv('declarations-exterminations-punaises-de-lit-1.csv', index=False)

In [27]:
print(df_bedbugs_base.dtypes)

NO_DECLARATION               int64
DATE_DECLARATION    datetime64[ns]
DATE_PRIOR_INSP     datetime64[ns]
EXT_FREQ                   float64
DATE_FIRST_EXT      datetime64[ns]
DATE_LAST_EXT       datetime64[ns]
HOOD_NUM                    object
HOOD_NAME                   object
BORO_NAME                   object
MTM8_X                     float64
MTM8_Y                     float64
LONGITUDE                  float64
LATITUDE                   float64
LONG_LAT                    object
MTM_X_Y                     object
dtype: object
