In [28]:
import os
import pandas as pd

In [95]:

def parse_data_filename(filename):
    no_extension = filename[:-4]
    measurement, start_date, end_date = no_extension.split('_')
    return measurement

def load_files(max_files=2):
    raw_data_dir = os.path.join(os.path.dirname(os.getcwd()), 'raw_data')
    data_files = os.listdir(raw_data_dir)
    
    df_dict = {}
    for i, path_name in enumerate(data_files):
        # default to loading a single file
        if i > max_files - 1:
            break
        full_path = os.path.join(raw_data_dir, s)
        measurement = parse_data_filename(s)
        key = '{}&{}'.format(measurement, path_name)
        df = pd.read_csv(
                full_path, 
                dtype={'date_local': str, 
                       'latitude': float,
                       'longitude':float,
                       'arithmetic_mean': float,
                       'units_of_measure': str,
                       'sample_duration': str 
                      })
        
        # don't intake any of the 0-length csv files. 
        if len(df) == 0:
            continue
        df_dict[key] = df
            
    return df_dict
    
    
def make_summary_table(df_dict, filename='EPA_data_descriptions.csv'):
    list_of_files = []
    for key in df_dict:
        measurement, file_path = key.split('&')
        df = df_dict[key]
        append_dict = {'measurement': measurement, 
             'filename': file_path, 
             'units': df.head(1).iloc[0]['units_of_measure'], 
             'duration': df.head(1).iloc[0]['sample_duration']        
            }
        #print(append_dict)
        list_of_files.append(append_dict)
        
    summary_df = pd.DataFrame(list_of_files)
    summary_df.to_csv(filename)
    print('Saved EPA data summary csv at {}'.format(filename))
    
    print(summary_df)
        
    
df_dict = load_files(max_files=20)

# for key in df_dict:
#     print(key)

make_summary_table(df_dict)



Saved EPA data summary csv at EPA_data_descriptions.csv
   measurement                             filename  \
0        PM2.5    Temperature_20170101_20171231.csv   
1        PM2.5       Pressure_20200101_20201231.csv   
2        PM2.5          PM2.5_20170101_20171231.csv   
3        PM2.5            NO2_20200101_20201231.csv   
4        PM2.5             CO_20180101_20181231.csv   
5        PM2.5       Pressure_20180101_20181231.csv   
6        PM2.5            NO2_20180101_20181231.csv   
7        PM2.5             CO_20200101_20201231.csv   
8        PM2.5           PM10_20160101_20161231.csv   
9        PM2.5       Humidity_20170101_20171231.csv   
10       PM2.5      Windspeed_20160101_20161231.csv   
11       PM2.5            SO2_20190101_20191231.csv   
12       PM2.5  Winddirection_20190101_20191231.csv   
13       PM2.5             O3_20160101_20161231.csv   
14       PM2.5             O3_20170101_20171231.csv   
15       PM2.5  Winddirection_20180101_20181231.csv   
16       

In [46]:

def describe_nulls(df, show_nulls_table=False):    
    print("Total rows: \n\t{:,}".format(len(df)))
    
    rows_with_any_NA = len(df[df.isnull().any(axis=1)])
    print("Rows; percent of rows that have *at least 1* missing value: \n\t{:,}; {}%".format(
        rows_with_any_NA, 100 * round(rows_with_any_NA / len(df), 2)))
    
    
    if show_nulls_table:
        nunique = df.nunique() # just compute this once.
    
        display(pd.DataFrame({
            'dtype': df.dtypes,
            'Number of nulls': df.isnull().sum(),
            '% null': 100 * round(df.isnull().sum() / len(df), 4),
            'Distinct Values': nunique,
            '% Distinct Values': 100 * round(nunique / len(df), 4),
        }))

    return



In [86]:
a = {}
a[(1,2)] = 3
print(a)

{(1, 2): 3}
