In [1]:
import os
import pandas as pd

In [13]:

def parse_data_filename(filename):
    filename = os.path.basename(filename)
    no_extension = filename[:-4]
    measurement, start_date, end_date = no_extension.split('_')
    return measurement

def load_files(max_files=2):
    raw_data_dir = os.path.join('..', 'raw_data')
    data_files = os.listdir(raw_data_dir)
    
    df_dict = {}
    for i, path_name in enumerate(data_files):
        # default to loading a single file
        if i > max_files - 1:
            break
        full_path = os.path.join(raw_data_dir, path_name)
        measurement = parse_data_filename(full_path)
        key = '{}&{}'.format(measurement, full_path)
        df = pd.read_csv(
                full_path, 
                dtype={'date_local': str, 
                       'latitude': float,
                       'longitude':float,
                       'arithmetic_mean': float,
                       'units_of_measure': str,
                       'sample_duration': str 
                      })
        
        # don't intake any of the 0-length csv files. 
        if len(df) == 0:
            continue
        df_dict[key] = df
            
    return df_dict
    
    
def make_summary_table(df_dict, filename='EPA_data_descriptions.csv'):
    list_of_files = []
    for key in df_dict:
        measurement, file_path = key.split('&')
        df = df_dict[key]
        append_dict = {'measurement': measurement, 
             'filename': file_path, 
             'units': df.head(1).iloc[0]['units_of_measure'], 
             'duration': df.head(1).iloc[0]['sample_duration']        
            }
        #print(append_dict)
        list_of_files.append(append_dict)
        
    summary_df = pd.DataFrame(list_of_files)
    summary_df.to_csv(filename)
    print('Saved EPA data summary csv at {}'.format(filename))
    
    print(summary_df)
        
    
df_dict = load_files(max_files=20)

make_summary_table(df_dict)

Saved EPA data summary csv at EPA_data_descriptions.csv
      measurement                                         filename  \
0     Temperature    ../raw_data/Temperature_20170101_20171231.csv   
1        Pressure       ../raw_data/Pressure_20200101_20201231.csv   
2           PM2.5          ../raw_data/PM2.5_20170101_20171231.csv   
3             NO2            ../raw_data/NO2_20200101_20201231.csv   
4              CO             ../raw_data/CO_20180101_20181231.csv   
5        Pressure       ../raw_data/Pressure_20180101_20181231.csv   
6             NO2            ../raw_data/NO2_20180101_20181231.csv   
7              CO             ../raw_data/CO_20200101_20201231.csv   
8            PM10           ../raw_data/PM10_20160101_20161231.csv   
9        Humidity       ../raw_data/Humidity_20170101_20171231.csv   
10      Windspeed      ../raw_data/Windspeed_20160101_20161231.csv   
11            SO2            ../raw_data/SO2_20190101_20191231.csv   
12  Winddirection  ../raw_data/Win

In [45]:
def examine_lat_lon(summary_file='EPA_data_descriptions.csv'):
    summary_df = pd.read_csv(summary_file)
    total_date_lat_lon = pd.DataFrame([], columns=['date', 'lat', 'lon'])
    date_lat_lon_list = []
    for row in summary_df.iterrows():
        filename = row[1]['filename']
        df = pd.read_csv(filename)
        print(df[['date_local', 'latitude', 'longitude']].nunique())
        #print(df.head(1).iloc[0][['date_local', 'latitude', 'longitude']])
        date_lat_lon_list.append(df[['date_local', 'latitude', 'longitude']])
        

    total_date_lat_lon = pd.concat(date_lat_lon_list)
    
    return total_date_lat_lon

total_date_lat_lon = examine_lat_lon()

date_local    365
latitude       57
longitude      57
dtype: int64
date_local    131
latitude       52
longitude      52
dtype: int64
date_local    365
latitude       69
longitude      69
dtype: int64
date_local    182
latitude       94
longitude      94
dtype: int64
date_local    365
latitude       69
longitude      69
dtype: int64
date_local    365
latitude       55
longitude      55
dtype: int64
date_local    365
latitude      103
longitude     103
dtype: int64
date_local    183
latitude       59
longitude      59
dtype: int64
date_local    366
latitude       79
longitude      79
dtype: int64
date_local    365
latitude       99
longitude      99
dtype: int64
date_local    366
latitude      158
longitude     158
dtype: int64
date_local    365
latitude       28
longitude      28
dtype: int64
date_local    365
latitude      164
longitude     164
dtype: int64
date_local    365
latitude      161
longitude     161
dtype: int64
date_local    366
latitude      105
longitude     105
dtype: i

In [44]:
total_date_lat_lon.nunique()

date_local    1644
latitude       255
longitude      255
dtype: int64

In [46]:

def describe_nulls(df, show_nulls_table=False):    
    print("Total rows: \n\t{:,}".format(len(df)))
    
    rows_with_any_NA = len(df[df.isnull().any(axis=1)])
    print("Rows; percent of rows that have *at least 1* missing value: \n\t{:,}; {}%".format(
        rows_with_any_NA, 100 * round(rows_with_any_NA / len(df), 2)))
    
    
    if show_nulls_table:
        nunique = df.nunique() # just compute this once.
    
        display(pd.DataFrame({
            'dtype': df.dtypes,
            'Number of nulls': df.isnull().sum(),
            '% null': 100 * round(df.isnull().sum() / len(df), 4),
            'Distinct Values': nunique,
            '% Distinct Values': 100 * round(nunique / len(df), 4),
        }))

    return



In [86]:
a = {}
a[(1,2)] = 3
print(a)

{(1, 2): 3}
