In [1]:
import requests
import pandas as pd
import json
import datetime
import numpy as np

# for position API
import http.client, urllib.parse

In [2]:
# this function gets latitude and longitude for a given city and country
# from positionstack.com
# variables could use renaming: country_input means country, query_input means city

def get_latt_long(country_input, query_input):
    """Look up coordinates with the positionstack forward-geocoding API.

    Parameters
    ----------
    country_input : str
        Sent as the ``country`` request parameter (e.g. ``'jp'``).
    query_input : str
        Sent as the ``query`` request parameter (e.g. a city such as ``'tokyo'``).

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the first entry of the response's
        ``'data'`` list.

    Raises
    ------
    KeyError
        If the decoded response has no ``'data'`` field.
    IndexError
        If ``'data'`` is an empty list (nothing matched the query).
    """
    # Local import: only needed for the optional key override below.
    import os

    # NOTE(review): a credential committed to the notebook is visible to every
    # reader. The literal is kept only as a backward-compatible fallback; set
    # POSITIONSTACK_ACCESS_KEY in the environment to override it.
    access_key = os.environ.get('POSITIONSTACK_ACCESS_KEY',
                                '1fbf7de4ce05274e9b9005659970c429')
    params = urllib.parse.urlencode({
        'access_key': access_key,
        'country': country_input,
        'query': query_input,
        # "limit" of 1 asks the service for a single match.
        'limit': 1
    })

    conn = http.client.HTTPConnection('api.positionstack.com')
    try:
        conn.request('GET', '/v1/forward?{}'.format(params))
        geo_str = conn.getresponse().read().decode('utf-8')
    finally:
        # Always release the socket, even when the request itself fails.
        conn.close()

    # Convert string to json (which is a dictionary).
    geo_json = json.loads(geo_str)
    try:
        results = geo_json['data']
    except KeyError:
        raise KeyError(
            "positionstack response has no 'data' field: {!r}".format(geo_json)
        ) from None
    if isinstance(results, list) and not results:
        raise IndexError(
            "positionstack returned no results for country={!r}, query={!r}".format(
                country_input, query_input)
        )

    first_match = results[0]
    return (first_match['latitude'], first_match['longitude'])


In [3]:
# testing: this should return (35.695126, 139.75391)
# get_latt_long('jp', 'tokyo')

In [4]:
# this function builds the url for the API call to retrieve HOURLY weather data 
# from https://archive-api.open-meteo.com/v1/era5
# for given latitude/longitude coordinates, for the given years;

def get_hourly_weather_url(lattitude, longitude, start_year, end_year):
    """Assemble the open-meteo ERA5 archive URL for an HOURLY data request.

    The requested period runs from ``<start_year>-01-01`` through
    ``<end_year>-12-31``. Coordinates and years are inserted via ``str()``.

    Returns the complete URL as a single string.
    """
    # Hourly variables requested from the API, already comma-separated.
    hourly_elements = (
        "temperature_2m,",
        "relativehumidity_2m,",
        "rain,",
        "snowfall,",
        "cloudcover,",
        "windspeed_10m,",
        "winddirection_10m",
    )
    # Time zone and unit choices appended to every request.
    settings = (
        "&timezone=America%2FLos_Angeles",
        "&temperature_unit=fahrenheit",
        "&windspeed_unit=mph",
        "&precipitation_unit=inch",
    )
    location_and_period = (
        "https://archive-api.open-meteo.com/v1/era5?latitude=",
        str(lattitude),
        "&longitude=",
        str(longitude),
        "&start_date=",
        str(start_year),
        "-01-01&end_date=",
        str(end_year),
        "-12-31&hourly=",
    )
    return "".join(location_and_period + hourly_elements + settings)

In [5]:
# this function builds the url for the API call to retrieve DAILY weather data 
# from https://archive-api.open-meteo.com/v1/era5
# for given latitude/longitude coordinates, for the given years;

def get_daily_weather_url(lattitude, longitude, start_year, end_year):
    """Assemble the open-meteo ERA5 archive URL for a DAILY data request.

    The requested period runs from ``<start_year>-01-01`` through
    ``<end_year>-12-31``. Coordinates and years are inserted via ``str()``.

    Returns the complete URL as a single string.
    """
    # Daily variables requested from the API, already comma-separated.
    daily_elements = (
        "temperature_2m_max,",
        "temperature_2m_min,",
        "rain_sum,",
        "snowfall_sum,",
        "precipitation_hours",
    )
    # Time zone and unit choices appended to every request.
    settings = (
        "&timezone=America%2FLos_Angeles",
        "&temperature_unit=fahrenheit",
        "&windspeed_unit=mph",
        "&precipitation_unit=inch",
    )
    location_and_period = (
        "https://archive-api.open-meteo.com/v1/era5?latitude=",
        str(lattitude),
        "&longitude=",
        str(longitude),
        "&start_date=",
        str(start_year),
        "-01-01&end_date=",
        str(end_year),
        "-12-31&daily=",
    )
    return "".join(location_and_period + daily_elements + settings)

In [6]:
# daily_or_hourly should be a string
#  ### needs better error-handling ###

def get_weather_url(lattitude, longitude, start_year, end_year, daily_or_hourly):
    """Return the archive-API URL for the requested resolution.

    ``daily_or_hourly`` selects the builder: ``'daily'`` delegates to
    ``get_daily_weather_url`` and ``'hourly'`` to ``get_hourly_weather_url``.
    Any other value yields the sentinel string ``'error'`` instead of a URL.
    """
    # Pick the builder first, then call it once with the shared arguments.
    if daily_or_hourly == 'daily':
        build_url = get_daily_weather_url
    elif daily_or_hourly == 'hourly':
        build_url = get_hourly_weather_url
    else:
        return 'error'
    return build_url(lattitude, longitude, start_year, end_year)

In [7]:
# # testing: 'print' makes them clickable!! so go look at the data :)

# print( get_weather_url(35.695126, 139.75391, 2010, 2019,'daily') )
# print( get_weather_url(35.695126, 139.75391, 2010, 2019,'hourly') )

In [8]:
# this function retrieves weather data for given latitude/longitude coordinates
# from https://archive-api.open-meteo.com/v1/era5
# for the given years; 
# e.g. if start_year=2010 and end_year=2020, 11 years of data are retrieved,
# starting 2010-01-01 and ending 2020-12-31, inclusive
#   ### valid years ###
# again, daily_or_hourly must be one of the two strings: 'daily' or 'hourly'

def get_weather(lattitude, longitude, start_year, end_year, daily_or_hourly):
    """Download archive weather data and return it as a DataFrame.

    Parameters
    ----------
    lattitude, longitude
        Coordinates forwarded to ``get_weather_url``.
    start_year, end_year
        First and last year of the requested period (both inclusive).
    daily_or_hourly : str
        Either ``'daily'`` or ``'hourly'``; it is also the key under which the
        JSON response holds the requested series.

    Returns
    -------
    pandas.DataFrame
        One column per returned series, with the ``'time'`` strings replaced by
        a ``'pure_date'`` column built from the first 10 characters (the date
        part) of each time string.

    Raises
    ------
    ValueError
        If ``daily_or_hourly`` is not one of the two accepted strings. This is
        raised before any network traffic happens.
    KeyError
        If the response does not contain the requested series; the message
        carries the API's ``'reason'`` field when one is present.
    """
    if daily_or_hourly not in ('daily', 'hourly'):
        # Fail early with a clear message instead of requesting the
        # 'error' sentinel that get_weather_url returns for bad input.
        raise ValueError(
            "daily_or_hourly must be 'daily' or 'hourly', got {!r}".format(
                daily_or_hourly)
        )

    url = get_weather_url(lattitude, longitude, start_year, end_year, daily_or_hourly)
    # requests applies no timeout unless asked to; without one a stalled
    # server would block the notebook indefinitely.
    response = requests.get(url, timeout=60)
    # Data comes in as one long string, so convert it to a dictionary.
    weather_json = json.loads(response.text)

    try:
        records = weather_json[daily_or_hourly]
    except KeyError:
        raise KeyError(
            "weather API response has no {!r} data (reason: {})".format(
                daily_or_hourly, weather_json.get('reason', 'not given'))
        ) from None

    # now turn dictionary into dataframe
    weather_raw = pd.DataFrame.from_records(records)
    # convert the provided ISO string 'time' into a 'pure_date' in python
    # datetime format, for aggregating and joining with daily data
    weather_raw["pure_date"] = weather_raw['time'].map(
        lambda stamp: datetime.datetime.fromisoformat(stamp[0:10]))
    # forget the string with the hour (returns a new frame, no in-place edit)
    return weather_raw.drop(columns='time')

In [9]:
# testing this should complete within a second or five, without errors
#  ### testing best practices: how do i display expected output?? 
#  ### which is a better test: test_df.info() or test_df.describe()?

# Smoke test: fetch ten years (2010-2019) of data for the Tokyo coordinates
# and show the structure of both frames.
daily_test_df = get_weather(35.695126, 139.75391, 2010, 2019, 'daily')
daily_test_df.info()

hourly_test_df = get_weather(35.695126, 139.75391, 2010, 2019, 'hourly')
hourly_test_df.info()

# Expected output, encoded as checks so a bad download fails loudly:
# 2010-2019 covers 3652 calendar days, and the hourly frame holds 24 rows per day.
assert len(daily_test_df) == 3652, len(daily_test_df)
assert len(hourly_test_df) == 24 * len(daily_test_df), len(hourly_test_df)
assert 'pure_date' in daily_test_df.columns
assert 'pure_date' in hourly_test_df.columns

# note that wind_direction in hourly is null when windspeed is 0
# we do nothing with wind direction, so don't bother fixing it

# gaierror means check your internet connection

# these dataframes are used in later testing!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3652 entries, 0 to 3651
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   precipitation_hours  3652 non-null   float64       
 1   rain_sum             3652 non-null   float64       
 2   snowfall_sum         3652 non-null   float64       
 3   temperature_2m_max   3652 non-null   float64       
 4   temperature_2m_min   3652 non-null   float64       
 5   pure_date            3652 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(5)
memory usage: 171.3 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87648 entries, 0 to 87647
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   cloudcover           87648 non-null  int64         
 1   rain                 87648 non-null  float64       
 2   relativehumidity_2m  87648 non-null 

In [10]:
# this auxiliary function will be used to aggregate hourly data into daily
# for example, getting the temperature of the 6th hottest hour answers questions like:
# Were there at least 6 hours above 80F? and Were there at least 18 hours below 32F?

def enth(x, n):
    """Return the value at position ``n`` of ``x`` once sorted ascending.

    ``x`` is sorted with ``sort_values()`` (the input itself is not reordered)
    and the element at integer position ``n`` of that sorted copy is returned,
    so ``n = 0`` gives the smallest value.
    """
    ascending = x.sort_values()
    return ascending.iloc[n]

In [11]:
# this function takes in the hourly and daily weather dataframes made by get_weather
#  ### behaviour if different time periods ###
# the returned dataframe is indexed by pure_date, so same number of rows as daily input
# the returned dataframe has all the columns of the daily input,
# plus a bunch of aggregates of data from hourly
# yes, max_wind = wind_high and temp_high is also a rename; someday, we fix this inefficiency

def agg_hourly_and_daily(hourly_df, daily_df):
    """Collapse hourly readings to one row per day and attach the daily data.

    Parameters
    ----------
    hourly_df : pandas.DataFrame
        Needs the columns ``pure_date``, ``relativehumidity_2m``,
        ``windspeed_10m``, ``cloudcover`` and ``temperature_2m``.
    daily_df : pandas.DataFrame
        Needs ``pure_date`` and ``temperature_2m_max``; every other column is
        carried through by the join.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``pure_date`` (one row per date present in ``hourly_df``),
        holding the hourly aggregates, then the columns of ``daily_df``, then
        ``temp_high`` (a copy of ``temperature_2m_max``).

    Neither input frame is modified.
    """
    # Built-in reductions are named by string: handing NumPy callables such as
    # np.mean to groupby.agg is deprecated in recent pandas releases.
    output_df = hourly_df.groupby('pure_date').agg(
        # historical statistics (adjectives): humid_avg, wind_high, cloud_avg, temp_6
        humid_avg=('relativehumidity_2m', 'mean'),
        wind_high=('windspeed_10m', 'max'),
        cloud_avg=('cloudcover', 'mean'),
        # position 18 of the ascending values: with 24 readings per day this
        # is the 6th hottest hour
        temp_6=('temperature_2m', lambda x: enth(x, 18)),
        # machine learning: avg_humidity, median_wind, max_wind, cloud_4, cloud_12, cloud_20
        # (avg_humidity and max_wind repeat humid_avg and wind_high under other names)
        avg_humidity=('relativehumidity_2m', 'mean'),
        median_wind=('windspeed_10m', 'median'),
        max_wind=('windspeed_10m', 'max'),
        cloud_4=('cloudcover', lambda x: enth(x, 3)),
        cloud_12=('cloudcover', lambda x: enth(x, 11)),
        cloud_20=('cloudcover', lambda x: enth(x, 19)),
    )
    # set_index returns a new frame, so the caller's daily_df stays untouched.
    output_df = output_df.join(daily_df.set_index('pure_date'))
    output_df['temp_high'] = output_df['temperature_2m_max']
    return output_df

In [12]:
# testing, using the hourly_test_df from previous test
#  ### again, how to display expected output? ###

clean_df = agg_hourly_and_daily(hourly_test_df, daily_test_df)
# Expected shape, checked rather than eyeballed: the result has one row per
# date, so it must line up with the daily frame fetched for the same period.
assert len(clean_df) == len(daily_test_df), (len(clean_df), len(daily_test_df))
clean_df.describe()

Unnamed: 0,humid_avg,wind_high,cloud_avg,temp_6,avg_humidity,median_wind,max_wind,cloud_4,cloud_12,cloud_20,precipitation_hours,rain_sum,snowfall_sum,temperature_2m_max,temperature_2m_min,temp_high
count,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0
mean,71.337692,13.292333,45.558621,64.94299,71.337692,7.172193,13.292333,24.085706,44.033954,64.3954,4.058598,0.153442,0.054493,68.38023,55.000821,68.38023
std,11.054197,5.407639,29.659068,14.746371,11.054197,3.363566,5.407639,29.077274,35.016543,34.064647,5.922985,0.398454,0.707603,14.402073,15.819658,14.402073
min,29.208333,4.0,0.0,33.7,29.208333,1.8,4.0,0.0,0.0,0.0,0.0,0.0,0.0,36.2,17.5,36.2
25%,64.541667,9.4,19.697917,51.2,64.541667,4.9,9.4,1.0,13.0,32.0,0.0,0.0,0.0,55.4,40.4,55.4
50%,73.25,12.1,42.166667,65.6,73.25,6.3,12.1,13.0,35.0,71.0,1.0,0.0,0.0,69.0,55.9,69.0
75%,79.625,16.0,69.625,76.9,79.625,8.4625,16.0,35.0,77.0,100.0,7.0,0.11,0.0,80.1,68.7,80.1
max,96.583333,52.3,100.0,97.4,96.583333,25.8,52.3,100.0,100.0,100.0,24.0,7.7,23.94,102.5,84.3,102.5


## Getting adjectives for historical summary

Now we 
 - filter by user-requested dates;
 - get boolean columns for adjectives, and 
 - compute rates/likelihoods for each adjective.