### Imports

In [None]:
import pandas as pd
import numpy as np
import re

### Import data

In [None]:
# Create a function to import the identified data files:

def import_data(dict_of_file_paths, **read_csv_kwargs):
    """Read a collection of CSV files into pandas DataFrames.

    Parameters
    ----------
    dict_of_file_paths : dict
        Maps a short dataset name (e.g. ``'train'``) to the path of a CSV file.
    **read_csv_kwargs
        Optional keyword arguments forwarded unchanged to ``pd.read_csv`` for
        every file (for example ``na_values=['-', 'M']``). When none are
        given, each file is read with pandas' default settings.

    Returns
    -------
    dict
        One entry per input entry, in the same order, keyed ``'<name>_df'``
        and holding the DataFrame read from the corresponding path.

    Notes
    -----
    Errors raised by ``pd.read_csv`` (such as an unreadable path) are not
    caught here and propagate to the caller.
    """
    # Call pd.read_csv directly rather than through a nested one-line wrapper,
    # and suffix every key with '_df' so the result is self-describing.
    return {
        f'{key}_df': pd.read_csv(file_path, **read_csv_kwargs)
        for key, file_path in dict_of_file_paths.items()
    }

In [None]:
# Map each dataset name to the location of its CSV file under ../assets/.
# This mapping is the argument handed to import_data.
file_paths = {
    dataset: f'../assets/{dataset}.csv'
    for dataset in ('train', 'spray', 'test', 'weather')
}

In [None]:
# Load every CSV listed in file_paths. import_data returns a dict of
# DataFrames whose keys are the file_paths keys with '_df' appended.
dataframes = import_data(file_paths)

In [None]:
# Display the keys of dataframes to confirm which dataframes were loaded
dataframes.keys()

dict_keys(['train_df', 'spray_df', 'test_df', 'weather_df'])

### Checking each dataframe's info() and isnull() values

In [None]:
# For every loaded dataframe, print its schema summary followed by the
# number of null values in each column.
for name, frame in dataframes.items():
    # Header line, then one blank line before the info() report.
    print(f'{name}: ---------------------------------------------------\n')
    frame.info()
    # Divider padded with a blank line above and below.
    print('\n++++++++++\n')
    print(f'Check of null values for {name}:')
    print(frame.isnull().sum())
    print()

train_df: ---------------------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Date                    10506 non-null  object 
 1   Address                 10506 non-null  object 
 2   Species                 10506 non-null  object 
 3   Block                   10506 non-null  int64  
 4   Street                  10506 non-null  object 
 5   Trap                    10506 non-null  object 
 6   AddressNumberAndStreet  10506 non-null  object 
 7   Latitude                10506 non-null  float64
 8   Longitude               10506 non-null  float64
 9   AddressAccuracy         10506 non-null  int64  
 10  NumMosquitos            10506 non-null  int64  
 11  WnvPresent              10506 non-null  int64  
dtypes: float64(2), int64(4), object(6)
memory usage: 985.1+ KB

++++++++++

Check of 

In [None]:
# So far, the only nulls reported are the 584 in the Time column of spray_df.
# However, weather.csv uses '-' and 'M' as placeholders for missing values,
# so re-read it with those strings parsed as NaN and replace the
# 'weather_df' entry loaded earlier.
dataframes['weather_df'] = pd.read_csv('../assets/weather.csv', na_values=['-', 'M'])

In [None]:
# Count the null values per column of weather_df
dataframes['weather_df'].isnull().sum()

Station           0
Date              0
Tmax              0
Tmin              0
Tavg             11
Depart         1472
DewPoint          0
WetBulb           4
Heat             11
Cool             11
Sunrise        1472
Sunset         1472
CodeSum           0
Depth          1472
Water1         2944
SnowFall       1472
PrecipTotal       2
StnPressure       4
SeaLevel          9
ResultSpeed       0
ResultDir         0
AvgSpeed          3
dtype: int64

### Based on this initial import and these checks:  
* train_df: no null values  
* test_df: no null values  
* spray_df:  
    * Time: 584 null
* weather_df (after treating '-' and 'M' as missing values):
    * Tavg, Heat, Cool: 11 null
    * Depart, Sunrise, Sunset, Depth, SnowFall: 1472 null
    * Water1: 2944 null
    * WetBulb, StnPressure: 4 null
    * PrecipTotal: 2 null
    * SeaLevel: 9 null
    * AvgSpeed: 3 null


---
### EDA with train_df

In [None]:
# Bind a shorter name to the training dataframe. This is the same object as
# dataframes['train_df'], not a copy. Then display it.
train_df = dataframes['train_df']
train_df

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10501,2013-09-26,"5100 West 72nd Street, Chicago, IL 60638, USA",CULEX PIPIENS/RESTUANS,51,W 72ND ST,T035,"5100 W 72ND ST, Chicago, IL",41.763733,-87.742302,8,6,1
10502,2013-09-26,"5800 North Ridge Avenue, Chicago, IL 60660, USA",CULEX PIPIENS/RESTUANS,58,N RIDGE AVE,T231,"5800 N RIDGE AVE, Chicago, IL",41.987280,-87.666066,8,5,0
10503,2013-09-26,"1700 North Ashland Avenue, Chicago, IL 60622, USA",CULEX PIPIENS/RESTUANS,17,N ASHLAND AVE,T232,"1700 N ASHLAND AVE, Chicago, IL",41.912563,-87.668055,9,1,0
10504,2013-09-26,"7100 North Harlem Avenue, Chicago, IL 60631, USA",CULEX PIPIENS/RESTUANS,71,N HARLEM AVE,T233,"7100 N HARLEM AVE, Chicago, IL",42.009876,-87.807277,9,5,0
