In [None]:
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/Data/552/'

Mounted at /content/drive


In [None]:
import pandas as pd
import re
import math
import numpy as np
from scipy import stats


In [None]:
## Polution levels for date and grid id
training_set = pd.read_csv(path+'train_labels.csv')
training_set.head(3)

Unnamed: 0,datetime,grid_id,value
0,2018-02-01T08:00:00Z,3S31A,11.4
1,2018-02-01T08:00:00Z,A2FBI,17.0
2,2018-02-01T08:00:00Z,DJN0F,11.1


In [None]:
training_set[training_set['grid_id'] == '3S31A']

Unnamed: 0,datetime,grid_id,value
0,2018-02-01T08:00:00Z,3S31A,11.400000
49,2018-02-03T08:00:00Z,3S31A,27.200000
79,2018-02-04T08:00:00Z,3S31A,19.844444
110,2018-02-05T08:00:00Z,3S31A,10.600000
141,2018-02-06T08:00:00Z,3S31A,20.300000
...,...,...,...
34129,2020-12-27T08:00:00Z,3S31A,5.818519
34162,2020-12-28T08:00:00Z,3S31A,3.038889
34199,2020-12-29T08:00:00Z,3S31A,8.125397
34236,2020-12-30T08:00:00Z,3S31A,10.889474


In [None]:
#meta data for grid id
metadata = pd.read_csv(path+'grid_metadata.csv')

In [None]:
metadata_taipei = metadata[metadata['location'] == 'Taipei']
metadata_delhi = metadata[metadata['location'] == 'Delhi']
metadata_la = metadata[metadata['location'] == 'Los Angeles (SoCAB)']

In [None]:
metadata_la.head(3)

Unnamed: 0,grid_id,location,tz,wkt
2,3S31A,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.9338248256995 33.79558357488509...
11,A2FBI,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.3948356552278 33.98201108613195...
18,DHO4M,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-118.3380667035533 34.16803061743935...


In [None]:
assert (len(metadata_taipei)+len(metadata_delhi)+len(metadata_la)) == len(metadata)

In [None]:
def parse_polygon_coords(location):
    '''Parse longitude and latitude from string
    Args: 
        location: location string
    Returns: 
        list of (lat,long) polygon vertices
    '''
    coordinates = re.findall('\d*\.?\d+',location)
    coordinates = [(float(coordinates[i]), float(coordinates[i+1])) for i in range(0,len(coordinates)-1,2)]
    return coordinates

In [None]:
def get_centroid(pc):
    '''
    https://en.wikipedia.org/wiki/Centroid#Of_a_polygon
    Args: 
        pc: list of tuples of polygon vertices
    Return: 
        (long,lat) of polygon center coordinates
    '''
    A = 0
    for i in range(len(pc)-1):
        a = pc[i][0]*pc[i+1][1] - pc[i+1][0]*pc[i][1] 
        A += a
    A *= 0.5
    Cx, Cy = 0,0
    for i in range(len(pc)-1):
        Cx += (pc[i][0]+pc[i+1][0])*(pc[i][0]*pc[i+1][1] - pc[i+1][0]*pc[i][1])
        Cy += (pc[i][1]+pc[i+1][1])*(pc[i][0]*pc[i+1][1] - pc[i+1][0]*pc[i][1])
    Cx /= 6*A
    Cy /= 6*A
    return (Cx, Cy)


def get_latitude_longitude(data):
    coordinates = [parse_polygon_coords(row) for row in data['wkt']]
    latitudes, longitudes = [], []
    for c in coordinates: 
        center = get_centroid(c)
        latitudes.append(center[1]), longitudes.append(center[0])
    data['Latitudes'] = latitudes
    data['Longitudes'] = longitudes
    return data

In [None]:
metadata = get_latitude_longitude(metadata)

In [None]:
metadata.to_csv(path+'metadata_updated.csv')

In [None]:
metadata

Unnamed: 0,grid_id,location,tz,wkt,Latitudes,Longitudes
0,1X116,Taipei,Asia/Taipei,"POLYGON ((121.5257644471362 24.97766123020391,...",24.998015,121.503307
1,1Z2W7,Delhi,Asia/Calcutta,"POLYGON ((77.30453178416276 28.54664454217707,...",28.566368,77.282074
2,3S31A,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.9338248256995 33.79558357488509...,33.814243,117.956283
3,6EIL6,Delhi,Asia/Calcutta,"POLYGON ((77.07995296313287 28.54664454217707,...",28.566368,77.057495
4,7334C,Delhi,Asia/Calcutta,"POLYGON ((77.12486872733885 28.54664454217707,...",28.566368,77.102411
5,78V83,Delhi,Asia/Calcutta,"POLYGON ((76.94520567051495 28.54664454217707,...",28.566368,76.922748
6,7F1D1,Delhi,Asia/Calcutta,"POLYGON ((77.12486872733885 28.58609243100243,...",28.605809,77.102411
7,8KNI6,Delhi,Asia/Calcutta,"POLYGON ((77.30453178416276 28.46770443564941,...",28.487443,77.282074
8,90BZ1,Taipei,Asia/Taipei,"POLYGON ((121.5706802113421 25.01836939334328,...",25.038717,121.548222
9,90S79,Delhi,Asia/Calcutta,"POLYGON ((77.21470025575081 28.62552552598286,...",28.645235,77.192242


In [None]:
coords

[(117.9338248256995, 33.79558357488509),
 (117.9338248256995, 33.83290166381627),
 (117.9787405899055, 33.83290166381627),
 (117.9787405899055, 33.79558357488509),
 (117.9338248256995, 33.79558357488509)]

## Weather

In [None]:
#documentation: https://www.ncei.noaa.gov/metadata/geoportal/rest/metadata/item/gov.noaa.ncdc%3AC00532/html
#sample weather data for Burbank, LA
df = pd.read_csv(path+'test.csv')


  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
df

Unnamed: 0,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,...,MV1,MW1,MW2,OC1,OD1,OE1,OE2,OE3,REM,EQD
0,72288023152,2020-01-01T00:53:00,7,34.20056,-118.3575,225.9,"BURBANK GLENDALE PASADENA AIRPORT, CA US",FM-15,KBUR,V020,...,,,,,,,,,MET09612/31/19 16:53:03 METAR KBUR 010053Z 290...,
1,72288023152,2020-01-01T01:53:00,7,34.20056,-118.3575,225.9,"BURBANK GLENDALE PASADENA AIRPORT, CA US",FM-15,KBUR,V020,...,,,,,,,,,MET09612/31/19 17:53:03 METAR KBUR 010153Z 340...,
2,72288023152,2020-01-01T02:53:00,7,34.20056,-118.3575,225.9,"BURBANK GLENDALE PASADENA AIRPORT, CA US",FM-15,KBUR,V020,...,,,,,,,,,MET10212/31/19 18:53:03 METAR KBUR 010253Z 360...,
3,72288023152,2020-01-01T03:53:00,7,34.20056,-118.3575,225.9,"BURBANK GLENDALE PASADENA AIRPORT, CA US",FM-15,KBUR,V020,...,,,,,,,,,MET09612/31/19 19:53:03 METAR KBUR 010353Z 320...,
4,72288023152,2020-01-01T04:53:00,7,34.20056,-118.3575,225.9,"BURBANK GLENDALE PASADENA AIRPORT, CA US",FM-15,KBUR,V020,...,,,,,,,,,MET09512/31/19 20:53:03 METAR KBUR 010453Z 350...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10180,72288023152,2020-12-31T19:53:00,7,34.20056,-118.3575,225.9,"BURBANK GLENDALE PASADENA AIRPORT, CA US",FM-15,KBUR,V020,...,,,,01035,,,,,MET09912/31/20 11:53:02 METAR KBUR 311953Z 320...,
10181,72288023152,2020-12-31T20:53:00,7,34.20056,-118.3575,225.9,"BURBANK GLENDALE PASADENA AIRPORT, CA US",FM-15,KBUR,V020,...,,,,01035,,,,,MET10712/31/20 12:53:02 METAR KBUR 312053Z 340...,
10182,72288023152,2020-12-31T21:53:00,7,34.20056,-118.3575,225.9,"BURBANK GLENDALE PASADENA AIRPORT, CA US",FM-15,KBUR,V020,...,,,,00885,,,,,MET10112/31/20 13:53:02 METAR KBUR 312153Z 350...,
10183,72288023152,2020-12-31T22:53:00,7,34.20056,-118.3575,225.9,"BURBANK GLENDALE PASADENA AIRPORT, CA US",FM-15,KBUR,V020,...,,,,01135,,,,,MET10112/31/20 14:53:02 METAR KBUR 312253Z 340...,


In [None]:
def clean_weather_data_columns(data, remove_cols):
    #TO-DO: Explore what the columns mean and what can be removed, 
    # and if columns with much missing data are important
    '''
    Removes irrelevant columns and columns with many missing values
    for historical weather dataset. 
    Args: 
        data: weather dataframe
        remove_cols: columns to remove
    Returns: 
        Cleaned dataframe
    '''
    data = data.drop(remove_cols, axis = 1)
    to_drop = []
    for col in data:
        if data[col].isna().sum() > 0.7 * len(col):
            data = data.drop(to_drop, axis = 1)
    return data

In [None]:
remove_cols = ['STATION', 'SOURCE', 'NAME', 'REPORT_TYPE', 'CALL_SIGN', 'QUALITY_CONTROL']
df = clean_weather_data_columns(df, remove_cols)

In [None]:
def group_dates(data):
    '''
    Weather data contains hourly samples for each day. Group samples
    into a dict of dataframe of hourly samples for each day. 
    Args: 
        data: weather dataframe
    Returns: 
        dict of weather dataframes with (key, val) = (date, dataframe) 
    '''
    date_indices = {}
    data = data.sort_values(by = ['DATE'])
    dates = data['DATE']
    current_date = dates.iloc[0][0:10]
    date_indices[current_date] = [0]
    #group data from same date into date-keyed dict with list
    # of corresponding indices in original df
    for i,row in enumerate(dates):
        date = row[0:10]
        if date != current_date:
            current_date = date
            date_indices[date] = []
        date_indices[date].append(i) 
    date_grouped_dfs = {}
    #for each date, slice the original df to date-keyed dict of dfs
    for key, value in date_indices.items():
        date_df = data.iloc[value]
        date_df = date_df.reset_index(drop = True)
        date_grouped_dfs[key] = date_df
    return date_grouped_dfs
        

In [None]:
dfa = group_dates(df)
df2 = dfa['2020-01-01']

In [None]:
df_dict = group_dates(df)

Documentation for data: 
    Documentation of NOAA data:  
    https://www.ncei.noaa.gov/metadata/geoportal/rest/metadata/item/gov.noaa.ncdc%3AC00532/html

Contains info on columns and how they should be interpreted

In [None]:
def clean_wind_data(data):
    '''
    Parses wind data string into relevant fields. Creates new columns in weather dataframe
    Removes samples with corrupted wind data, or missing wind data. 
    'Wind direction: 0-360 degs
    'Wind speed: m/s
    '''
    wind_direction, wind_speed = [], []
    acceptable_quality = (0,1,4,5,9) #see docs
    missing_dir, missing_spd = 999, 9999
    for i,values in enumerate(data['WND']):
        values = values.split(",")
        direction, direction_quality = int(values[0]), int(values[1])
        speed, speed_quality = int(values[3]), int(values[4])
        #ID erroneuous or missing.
        if (direction_quality not in acceptable_quality or direction == missing_dir):
            direction = np.nan
        if (speed_quality not in acceptable_quality or speed == missing_spd):
            speed = np.nan
        wind_direction.append(direction), wind_speed.append(speed)
    data['Wind_Speed'] = wind_speed
    data['Wind_Direction'] = wind_direction
    return data

In [None]:
def clean_ceiling_height_data(data):
    '''
    The height above ground level (AGL) of the lowest cloud or obscuring
    phenomena layer aloft with 5/8 or more summation total sky cover,
    which may be predominantly opaque, or the vertical visibility into a
    surface-based obstruction. Unlimited = 22000.
    height (m) above ground level of lowest cloud (unlimited = 22000)
    '''
    ceiling_height = []
    acceptable_quality = (0,1,4,5,9) #see docs
    missing = 99999
    for i,cig in enumerate(data['CIG']):
        cig = cig.split(",")
        height = int(cig[0])
        quality = int(cig[1])
        #ID erroneuous or missing. 
        if (quality not in acceptable_quality or height == missing):
            height = np.nan
        ceiling_height.append(height)
    data['Cloud_Height'] = ceiling_height
    return data

In [None]:
def clean_visibility_data(data):
    '''The horizontal distance at which an object can be seen and identified. (meters)
    '''
    visilibilites = []
    acceptable_quality = (0,1,4,5,9) #see docs
    missing = 999999
    for i,vis in enumerate(data['VIS']):
        vis = vis.split(',')
        visibility = int(vis[0])
        quality = int(vis[1])
        #ID erroneuous or missing. 
        if (quality not in acceptable_quality or visibility == missing):
            visibility = np.nan
        visilibilites.append(visibility)
    data['Visibility'] = visilibilites
    return data

In [None]:
def clean_temperature_data(data): 
    '''Air Temperature data in C'''
    temperatures = []
    missing = 9999
    acceptable_quality = ('0','1','4','5','9','C','I','M','P','R','U') #see docs
    for i,sample in enumerate(data['TMP']):
        sample = sample.split(',')
        temperature = int(sample[0])
        quality = sample[1]
        #ID erroneuous or missing. 
        if (quality not in acceptable_quality or temperature == missing):
            temperature = np.nan
        temperatures.append(temperature / 10)
    data['Temperature'] = temperatures
    return data


In [None]:
def clean_pressure_data(data): 
    '''The air pressure relative to Mean Sea Level (MSL).
    (Hectopascals)'''
    pressures = []
    missing = 99999
    acceptable_quality = ('0','1','4','5','9') #see docs
    for i,sample in enumerate(data['SLP']):
        sample = sample.split(',')
        pressure = int(sample[0])
        quality = sample[1]
        #ID erroneuous or missing. 
        if (quality not in acceptable_quality or pressure == missing):
            pressure = np.nan
        pressures.append(pressure)
    data['Atmospheric_Pressure'] = pressures
    return data


In [None]:
def clean_dew_point_data(data): 
    '''The temperature to which a given parcel of air must be cooled
     at constant pressure and water vapor content in order for saturation to occur. (C)'''
    dew_points = []
    missing = 9999
    acceptable_quality = ('0','1','4','5','9','C','I','M','P','R','U') #see docs
    for i,sample in enumerate(data['DEW']):
        sample = sample.split(',')
        dp = int(sample[0])
        quality = sample[1]
        #ID erroneuous or missing. 
        if (quality not in acceptable_quality or dp == missing):
            dp = np.nan
        dew_points.append(dp)
    data['Dew_Point'] = dew_points
    return data

In [None]:
def clean_precipitation_data(data, event_number): 
    '''episode of LIQUID-PRECIPITATION.
    - The quantity of time over which the LIQUID-PRECIPITATION was measured. (hours)
    - The depth of LIQUID-PRECIPITATION that is measured at the time of an observation. (mm)
    Note that there data contains AA1-AA3 fields for multiple precipitation events
    '''
    times, depths = [], []
    missing_depth, missing_time = 9999, 99
    acceptable_quality = ('0','1','4','5','9','C','I','M','P','R','U') #see docs
    event_column = f'AA{event_number}'
    for i,sample in enumerate(data[event_column]):
        if isinstance(sample, str):
            sample = sample.split(',')
            time = int(sample[0])
            depth = int(sample[1])
            quality = sample[-1]
        else: 
            time = depth = quality = np.nan
        #ID erroneuous or missing. 
        if (quality not in acceptable_quality):
            depth = time = np.nan
        if (depth == missing_depth):
            depth = np.nan
        if (time == missing_time):
            time = np.nan
        depths.append(depth),  times.append(time)
    data[f'Precipitation_Duration_{event_number}'] = times
    data[f'Precipitation_Depth_{event_number}'] = depths
    return data


In [None]:
def clean_sky_cover_data(data, event_number): 
    '''SKY-COVER-LAYER..
    - Field 1: The code that denotes the fraction of the total celestial dome covered by a SKY-COVER-LAYER.
    - Field 2: SKY-COVER-LAYER base height dimension
    - Field 3: The code that denotes the classification of the clouds that comprise a SKY-COVER-LAYER.
    Note that there data contains GA1-GA6 fields for multiple cloud layers
    GA2-GA3 are 89+% nan (no secondary cloud covered), so these are ignored 
    TO-DO: Check above statement on total dataset
    T0-DO: compare cloud cover fields and choose appropriate one(s) (GA1, GD1, GF1, )
    '''
    covers, height, clouds = [], [], []
    acceptable_quality = ('0','1','4','5','9','M') #see docs
    missing_cover = missing_cloud_type = 99
    missing_base_height = 99999
    #convert octas (or code) to coverage fraction
    conversion_values = {0:0, 1:0.1, 2:0.25, 3:0.4, 4:0.5, 5:0.6, 6:0.75, 7:0.95, 8:1.0, 9:np.nan, 10:np.nan, 99:np.nan}
    event_column = f'GA{event_number}'
    for i,sample in enumerate(data[event_column]):
        if isinstance(sample, str):
            sample = sample.split(',')
            coverage, coverage_quality = int(sample[0]), sample[1] #fraction
            coverage = conversion_values[coverage]
            base_height, base_height_quality = int(sample[2]), sample[3]
            cloud_type, cloud_type_quality = int(sample[4]), sample[5]
        else: 
            coverage = base_height = cloud_type = coverage_quality = base_height_quality = cloud_type_quality = np.nan
        if (coverage_quality not in acceptable_quality or coverage == missing_cover):
            coverage = np.nan
        if (base_height_quality not in acceptable_quality or base_height == missing_base_height):
            base_height = np.nan
        if (cloud_type_quality not in acceptable_quality or cloud_type == missing_cloud_type):
            cloud_type = np.nan
        covers.append(coverage),  height.append(base_height), clouds.append(cloud_type)
    data[f'Cloud_Coverage_{event_number}'] = covers
    data[f'Cloud_Base_Height_{event_number}'] = height
    data[f'Cloud_Type_Code_{event_number}'] = clouds
    return data


def clean_sky_condition_observation(data):
    coverage, opaque_coverage, lowest_cloud_cover, low_clouds, mid_clouds, high_clouds = [],[],[],[],[],[]
    acceptable_quality = (0, 1, 4, 5, 9)
    missing_coverage, missing_cloud_genus, missing_height = 99, 99, 99999
    conversion_values = {0:0, 1:0.1, 2:0.25, 3:0.4, 4:0.5, 5:0.6, 6:0.75, 7:0.95, 8:1.0, 9:np.nan, 10:np.nan,
                        11:0.3, 13:0.3, 14:0.4, 15:0.5, 16:0.6, 17:0.8, 18:0.9, 19:1.0, 99:np.nan}
    for i,sample in enumerate(data['GF1']):
        if isinstance(sample, str):
            sample = sample.split(',')
            total_coverage, total_coverage_quality = int(sample[0]), sample[2]
            total_coverage = conversion_values[total_coverage]
            total_opaque_coverage = int(sample[1])
            total_opaque_coverage = conversion_values[total_opaque_coverage]
            total_lowest_cloud_cover, total_lowest_cloud_cover_quality = int(sample[3]), int(sample[4])
            total_lowest_cloud_cover = conversion_values[total_lowest_cloud_cover]
            low_cloud_genus, low_cloud_genus_quality = int(sample[5]), int(sample[6])
            lowest_cloud_height, lowest_cloud_height_quality = sample[7], int(sample[8])
            mid_cloud_genus, mid_cloud_genus_quality = int(sample[9]), int(sample[10])
            high_cloud_genus, high_cloud_genus_quality = int(sample[11]), int(sample[12])
        else: 
            total_coverage = total_opaque_coverage = total_lowest_cloud_cover = low_cloud_genus = np.nan
            mid_cloud_genus = lowest_cloud_height = high_cloud_genus = np.nan
            total_coverage_quality = total_lowest_cloud_cover_quality = low_cloud_genus_quality = np.nan
            mid_cloud_genus_quality = lowest_cloud_height_quality = high_cloud_genus_quality = np.nan
        if (total_coverage_quality not in acceptable_quality or total_coverage == missing_coverage):
            total_coverage = np.nan
        if (total_lowest_cloud_cover_quality not in acceptable_quality or total_lowest_cloud_cover == missing_coverage):
            total_coverage = np.nan
        if (low_cloud_genus_quality not in acceptable_quality or low_cloud_genus == missing_coverage):
            low_cloud_genus = np.nan
        if (mid_cloud_genus_quality not in acceptable_quality or mid_cloud_genus == missing_coverage):
            mid_cloud_genus = np.nan
        if (high_cloud_genus_quality not in acceptable_quality or high_cloud_genus == missing_coverage):
            high_cloud_genus = np.nan
        coverage.append(total_coverage), opaque_coverage.append(total_opaque_coverage)
        lowest_cloud_cover.append(total_lowest_cloud_cover), low_clouds.append(low_cloud_genus)
        mid_clouds.append(mid_cloud_genus), high_clouds.append(high_cloud_genus)

    data['Total_Coverage'] = coverage
    data['Total_Opaque_Coverage'] = total_opaque_coverage
    data['Lowest_Cloud_Height'] = lowest_cloud_cover
    data['Low_Cloud_Genus'] = low_clouds
    data['Mid_Cloud_Genus'] = mid_clouds
    data['High_Cloud_Genus'] = high_clouds

    return data

In [None]:
def clean_date(data):
    data['Datetime'] = pd.to_datetime(data['DATE'])
    return data

In [None]:

def previous_and_next(some_iterable):
    prevs, items, nexts = tee(some_iterable, 3)
    prevs = chain([None], prevs)
    nexts = chain(islice(nexts, 1, None), [None])
    return izip(prevs, items, nexts)

In [None]:
a = pd.DataFrame({'a':[1,2,2,2,3,4]})
stats.mode(a)[1][0][0]

3

In [None]:
from pandas.core.internals.blocks import ensure_block_shape
def get_wind_vector(df):
    '''
    Args: 
        wind_data: dataframe with wind direction and speed cols
    Returns: 
        tuple of mean x,y wind vectors
    '''
    #data is calibrated to north as 0 deg ?
    wind_data = df[['Wind_Direction', 'Wind_Speed']]
    wind_vect_x, wind_vect_y = [], []
    for dir,speed in wind_data.itertuples(index=False):
        #degrees to vectors
        wind_vect_x.append(np.cos(dir + 90) * speed) #positive = west
        wind_vect_y.append(np.sin(dir + 90) * speed) #positive = north
    df['Wind_X'] = wind_vect_x    
    df['Wind_Y'] = wind_vect_y
    return df

def mean_remove_nan(series):
    array = np.array(series)
    array = array[~np.isnan(array)]
    mean = np.mean(array)
    return mean

def time_weighted_aggregate(field, day_minus1, day, day_plus1, center, radius, method = 'mean'):
    argcenter = np.argmin([(center - time).seconds / 3600 for time in day['Datetime']])
    day = day.sort_values(by='Datetime', ascending=True)
    #handle edge case (trim radius to start of day)
    if day_minus1 is None or day_plus1 is None:
        radius1 = argcenter
        radius2 = len(day) - argcenter
        radius = np.min([radius1, radius2])
        times = list(day['Datetime'])
        values = list([day[field]])
    else: 
        day_minus1 = day_plus1.sort_values(by='Datetime', ascending=True)
        day_plus1 = day_plus1.sort_values(by='Datetime', ascending=True)
        times = list(day_minus1['Datetime']) + list(day['Datetime']) + list(day_plus1['Datetime'])
        values = list(day_minus1[field]) + list(day[field]) + list(day_plus1[field])
        argcenter += len(day_minus1[field])
    #get weight of time difference between center and neighbours
    values = list(np.array(values).flatten())
    timedeltas = [((times[i] - center).seconds / 3600) for i in range(argcenter-radius, argcenter+radius)]
    timedelta_weights = np.array([float(dt)/sum(timedeltas) for dt in timedeltas])
    values = np.array(values[argcenter-radius : argcenter+radius])
    timedelta_weights = timedelta_weights[~np.isnan(values)]
    #remove nan
    values = values[~np.isnan(values)]
    if method == 'mode':
        aggregate = stats.mode(values)[0]
    else:
        aggregate = sum(timedelta_weights*values)
    return aggregate


def remove_nan_and_aggregate(series, method = 'mean'):
    series = series[~np.isnan(series)]
    if method == 'mode':
        aggregate = stats.mode(a)[1][0][0]
    else: 
        aggregate = series.mean()
    return aggregate




def aggregate_daily_station_data(df_dict): 
    '''Takes daily weather data dict and returns a time weighted aggregate dataframe for that weather station
    Args: 
        df_dict: dict of daily weather dataframes
    Returns: 
        Pandas dataframe of daily averages 
    '''
    avg_dict = {
        'Date': [], 
        'Wind_X': [],
        'Wind_Y': [],
        'Cloud_Height':[],
        'Temperature':[],
        'Visibility':[],
        'Atmospheric_Pressure':[],
        'Dew_Point':[],
        'Precipitation_Duration_1':[],
        'Precipitation_Depth_1':[],
        'Precipitation_Duration_2':[],
        'Precipitation_Depth_2':[],
        'Precipitation_Duration_3':[],
        'Precipitation_Depth_3':[],
        'Cloud_Coverage_1':[], 
        'Cloud_Base_Height_1':[], 
        'Cloud_Type_Code_1':[], 
        'Cloud_Coverage_2':[], 
        'Cloud_Base_Height_2':[], 
        'Cloud_Type_Code_2':[], 
        'Total_Coverage':[], 
        'Total_Opaque_Coverage':[], 
        'Lowest_Cloud_Height':[], 
        'Low_Cloud_Genus':[], 
        'Mid_Cloud_Genus':[], 
        'High_Cloud_Genus':[], 
    }

    for _, df in df_dict.items():
        df = clean_date(df)
        df = clean_wind_data(df)
        df = get_wind_vector(df)
        df = clean_ceiling_height_data(df)
        df = clean_visibility_data(df)
        df = clean_temperature_data(df)
        df = clean_pressure_data(df)
        df = clean_dew_point_data(df)
        df = clean_precipitation_data(df, event_number = 1)
        df = clean_precipitation_data(df, event_number = 2)
        df = clean_precipitation_data(df, event_number = 3)
        df = clean_sky_cover_data(df, event_number = 1)
        df = clean_sky_cover_data(df, event_number = 2)
        df = clean_sky_condition_observation(df)

    for current in df_dict:
    
        df = df_dict[current]

        avg_dict['Date'].append(current)

        wnd_x = remove_nan_and_aggregate(df['Wind_X'])
        wnd_y = remove_nan_and_aggregate(df['Wind_Y'])
        avg_dict['Wind_X'].append(wnd_x)
        avg_dict['Wind_Y'].append(wnd_y)

        height = remove_nan_and_aggregate(df['Cloud_Height'])
        avg_dict['Cloud_Height'].append(height)

        vis = remove_nan_and_aggregate(df['Visibility'])
        avg_dict['Visibility'].append(vis)

        temp = remove_nan_and_aggregate(df['Temperature'])
        avg_dict['Temperature'].append(temp)
          
        pressure = remove_nan_and_aggregate(df['Atmospheric_Pressure'])
        avg_dict['Atmospheric_Pressure'].append(pressure)

        dp = remove_nan_and_aggregate(df['Dew_Point'])
        avg_dict['Dew_Point'].append(dp)
        
        for e in range(1,4):
            dur = remove_nan_and_aggregate(df[f'Precipitation_Duration_{e}'])
            depth = remove_nan_and_aggregate(df[f'Precipitation_Depth_{e}'])
            avg_dict[f'Precipitation_Depth_{e}'].append(depth), avg_dict[f'Precipitation_Duration_{e}'].append(dur)

        for e in range(1,3):
            cover = remove_nan_and_aggregate(df[f'Cloud_Coverage_{e}'])
            base_height = remove_nan_and_aggregate(df[f'Cloud_Base_Height_{e}'])
            cloud_code = remove_nan_and_aggregate(df[f'Cloud_Type_Code_{e}'], method = 'mode')
            avg_dict[f'Cloud_Coverage_{e}'].append(cover), avg_dict[f'Cloud_Base_Height_{e}'].append(base_height)
            avg_dict[f'Cloud_Type_Code_{e}'].append(cloud_code)

        total_coverage = remove_nan_and_aggregate(df['Total_Coverage'])
        avg_dict['Total_Coverage'].append(total_coverage)
        toc = remove_nan_and_aggregate(df['Total_Opaque_Coverage'])
        avg_dict['Total_Opaque_Coverage'].append(toc)
        lch = remove_nan_and_aggregate(df['Lowest_Cloud_Height'])
        avg_dict['Lowest_Cloud_Height'].append(lch)
        lcg = remove_nan_and_aggregate(df['Low_Cloud_Genus'], method = 'mode')
        avg_dict['Low_Cloud_Genus'].append(lcg)
        mcg = remove_nan_and_aggregate(df['Mid_Cloud_Genus'], method = 'mode')
        avg_dict['Mid_Cloud_Genus'].append(mcg)  
        hcg = remove_nan_and_aggregate(df['High_Cloud_Genus'], method = 'mode')
        avg_dict['High_Cloud_Genus'].append(hcg)

    return pd.DataFrame(avg_dict)


In [None]:

aggregate_daily_station_data(df_dict)


Unnamed: 0,Date,Wind_X,Wind_Y,Cloud_Height,Temperature,Visibility,Atmospheric_Pressure,Dew_Point,Precipitation_Duration_1,Precipitation_Depth_1,...,Cloud_Type_Code_1,Cloud_Coverage_2,Cloud_Base_Height_2,Cloud_Type_Code_2,Total_Coverage,Total_Opaque_Coverage,Lowest_Cloud_Height,Low_Cloud_Genus,Mid_Cloud_Genus,High_Cloud_Genus
0,2020-01-01,-10.142736,1.165113,22000.000000,13.020000,16028.640000,10162.560000,-9.560000,1.884615,0.000000,...,3,,,3,,,,3,3,3
1,2020-01-02,5.559220,-0.274206,22000.000000,13.654167,16093.000000,10125.250000,56.125000,1.920000,0.000000,...,3,,,3,,,,3,3,3
2,2020-01-03,4.642200,-0.051585,22000.000000,13.729167,16093.000000,10198.791667,40.458333,1.920000,0.000000,...,3,,,3,,,,3,3,3
3,2020-01-04,-16.752797,-8.278106,22000.000000,13.058333,16093.000000,10251.166667,35.791667,1.920000,0.000000,...,3,,,3,,,,3,3,3
4,2020-01-05,4.719774,-4.243758,22000.000000,12.972000,16093.000000,10240.916667,21.560000,1.920000,0.000000,...,3,,,3,,,,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2020-12-27,5.682720,-5.133375,12770.526316,11.742105,9041.684211,10149.375000,79.578947,1.920000,0.000000,...,3,,,3,,,,3,3,3
362,2020-12-28,-7.536464,3.189901,1612.637931,9.994828,13082.534483,10105.833333,73.465517,1.511111,10.311111,...,3,0.916327,950.571429,3,,,,3,3,3
363,2020-12-29,-14.852677,-3.134367,16299.680000,8.487500,15835.520000,10158.272727,25.625000,1.884615,10.038462,...,3,0.816667,1615.666667,3,,,,3,3,3
364,2020-12-30,-2.426715,-1.401307,22000.000000,11.184000,16093.000000,10216.541667,-42.000000,1.920000,0.000000,...,3,,,3,,,,3,3,3


In [None]:
df_dict['2020-01-03']['WND']

0     999,9,C,0000,5
1     999,9,C,0000,5
2     360,5,N,0015,5
3     999,9,C,0000,5
4     999,9,C,0000,5
5     999,9,C,0000,5
6     999,9,C,0000,5
7     999,9,C,0000,5
8     999,9,9,9999,9
9     999,9,C,0000,5
10    999,9,C,0000,5
11    999,9,C,0000,5
12    999,9,C,0000,5
13    360,5,N,0021,5
14    999,9,C,0000,5
15    180,5,N,0021,5
16    999,9,C,0000,5
17    999,9,C,0000,5
18    250,5,N,0021,5
19    200,5,N,0015,5
20    999,9,C,0000,5
21    200,5,N,0021,5
22    999,9,V,0026,5
23    190,5,N,0026,5
24    180,5,N,0031,5
Name: WND, dtype: object

In [None]:
for i in range(1,3):
    print(i)



In [None]:
a = np.array([1,2,4])
b = np.array([0,1,3])
a*b

In [None]:
df2['Hour'] = pd.to_datetime(df2['Hour'])
df3['Hour'] = pd.to_datetime(df3['Hour'])

In [None]:
(df2['Hour'].iloc[0] - df2['Hour'].iloc[1]).seconds / 3600

In [None]:
center = pd.Timestamp('2022-03-13 02:53:00')
argcenter = np.min([(center - time) for time in df2['Hour']])



In [None]:
padded_words = [None, *words, None]
for prev, current, nxt in zip(padded_words, padded_words[1:], padded_words[2:]):
    print(prev, current, nxt)

In [None]:
padded_words