In [1]:
from dotenv import load_dotenv
import pandas as pd
import polars as pl
import multiextractor
from datetime import datetime
import numpy as np
import json
import io
import requests
from urllib.parse import urlparse
import os
import spacy
from test_extract_rss import create_entry_from_rss
from google.cloud import bigquery

# Load variables from a .env file (if present) into the process environment.
# This runs first so the os.environ lookups below can see them.
load_dotenv()

# English spaCy pipeline; not referenced again in the cells visible here.
nlp = spacy.load('en_core_web_md')
# OpenWeather API key, sent as the `appid` query parameter by the request cells.
key = os.environ['OPEN_WEATHER_KEY']
# Google Cloud project id; read here but not referenced again in the visible cells.
project = os.environ['GCLOUD_PROJECT']
# Copy the service-account key path into GOOGLE_APPLICATION_CREDENTIALS
# (presumably so bigquery.Client() can pick it up implicitly -- confirm).
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.environ['GCLOUD_SERVICE_KEY_PATH']

In [2]:
# Resolve a free-text city name to coordinates with the OpenWeather geocoding API.
# The endpoint is called over HTTPS so the API key is not sent in clear text, and the
# query goes through `params` so values such as "Hong Kong" are URL-encoded by requests.
geo_url = 'https://api.openweathermap.org/geo/1.0/direct'

city = 'Hong Kong'
country = 'HK'  # not sent with the request; kept as a reference value

response = requests.get(
    geo_url,
    params={'q': city, 'limit': 5, 'appid': key},
    timeout=30,  # never let the notebook hang on a stalled connection
)
# Fail here on a 4xx/5xx answer instead of handing an error payload to the parsing cell.
response.raise_for_status()

print(response.text)


[{"name":"Hong Kong Island","local_names":{"no":"Hongkongøya","ur":"جزیرہ ہانگ کانگ","cs":"Hongkong","ca":"Illa de Hong Kong","nl":"Hongkong","ja":"香港島","sv":"Hongkongön","it":"Hong Kong","bg":"Хонконг","gl":"Illa de Hong Kong","be":"Ганконг","my":"ဟောင်ကောင်ကျွန်း","ar":"جزيرة هونغ كونغ","tr":"Hong Kong Adası","el":"Χονγκ Κονγκ","he":"הונג קונג","et":"Hongkongi saar","eu":"Hong Kong uhartea","vi":"Đảo Hồng Kông","zh":"香港島","uk":"Гонконг","fa":"جزیره هنگ کنگ","tl":"Pulo ng Hong Kong","pt":"Ilha de Hong Kong","mk":"Хонгконг","ms":"Pulau Hong Kong","de":"Hong Kong Island","es":"Isla de Hong Kong","sr":"Хонгконг","id":"Pulau Hong Kong","hu":"Hongkong-sziget","en":"Hong Kong Island","lt":"Honkongas","pl":"Hongkong","oc":"Hongkong","ka":"ჰონგ-კონგი","br":"Enez Hong Kong","fr":"Île de Hong Kong","ru":"Гонконг","kn":"ಹಾಂಗ್ಕಾಂಗ್","lv":"Honkonga","ko":"홍콩섬","ta":"ஒங்கொங் தீவு","hi":"हाँगकाँग द्वीप","bn":"হংকং দ্বীপ","eo":"Honkonga Insulo","ku":"Hong Kong","ml":"ഹോങ്കോങ്","la":"Insula Hongcongen

In [3]:
# Keep only the fields used downstream from the first geocoding match.
r = json.loads(response.text)
# A successful lookup is a non-empty JSON array; anything else (no match, or an
# error object) is reported explicitly instead of surfacing as IndexError/KeyError.
if not isinstance(r, list) or not r:
    raise ValueError(f'Geocoding request returned no usable match: {r!r}')

best_match = r[0]
# `local_names` and `state` are read with .get() so a match that lacks them
# yields None instead of aborting the cell.
local_names = best_match.get('local_names') or {}
loc_data = {
    'name': best_match['name'],
    'local_names': {
        'en': local_names.get('en'),
        'zh': local_names.get('zh')
    },
    'lat': best_match['lat'],
    'lon': best_match['lon'],
    'country': best_match['country'],
    'state': best_match.get('state')
}

In [4]:
# Fetch the current conditions (metric units) for the coordinates resolved above.
current_url = 'https://api.openweathermap.org/data/2.5/weather'

lat = loc_data['lat']
lon = loc_data['lon']

response2 = requests.get(
    current_url,
    # Passing the query as `params` lets requests handle the encoding.
    params={'lat': lat, 'lon': lon, 'appid': key, 'units': 'metric'},
    timeout=30,  # never let the notebook hang on a stalled connection
)
# Stop on a 4xx/5xx answer rather than feeding an error payload to WeatherBuilder.
response2.raise_for_status()

print(response2.text)


{"coord":{"lon":114.1583,"lat":22.281},"weather":[{"id":802,"main":"Clouds","description":"scattered clouds","icon":"03d"}],"base":"stations","main":{"temp":21.18,"feels_like":20.79,"temp_min":20.42,"temp_max":23.22,"pressure":1022,"humidity":55},"visibility":10000,"wind":{"speed":3.58,"deg":310},"clouds":{"all":49},"dt":1703658084,"sys":{"type":2,"id":2035800,"country":"HK","sunrise":1703631645,"sunset":1703670430},"timezone":28800,"id":8223932,"name":"City of Victoria","cod":200}


In [5]:
class WeatherBuilder:
    _icon_url_template = 'https://openweathermap.org/img/wn/{icon}@2x.png'
    
    def __init__(self, resp_data):
        self._tz_shift = resp_data.get('timezone', None)
        self._s_data = self.__build_sys(resp_data)
        self.proc_data = self.__process_records(resp_data)
        
    def __build_record(self, record):
        return {
            'main': self.__build_main(record['main']),
            # 'coord': self.__build_coords(record.get('coords', None)),
            'visibility_metres': record['visibility'],
            'weather': self.__build_weather(record['weather']),
            'wind': self.__build_wind(record['wind']),
            'cloudiness_perc': self.__build_clouds(record['clouds']),
            'wind': self.__build_snow_rain(record.get('wind', None)),
            'rain': self.__build_snow_rain(record.get('rain', None)),
            'sys': self._s_data,
            'date': self.__build_date(record, 'dt'),
            'forecasted_date': self.__build_date(record, 'dt_txt'),
            'timezone_shift_s': self._tz_shift
        }
    
    @staticmethod
    def __build_coords(vals):
        return {
            'lat': vals['lat'] if vals is not None else None,
            'lon': vals['lon'] if vals is not None else None
        }
        
    @staticmethod
    def __build_main(vals):
        return {
          'temp_c': vals['temp'],
          'feels_like_c': vals['feels_like'],
          'temp_min_c': vals['temp_min'],
          'temp_max_c': vals['temp_max'],
          'pressure_hpa': vals['pressure'],
          'humidity_perc': vals['humidity'],
          'sea_level_hpa': vals.get('sea_level', None),
          'grnd_level_hpa': vals.get('sea_level', None)
        }
    
    @classmethod
    def __build_weather(cls, vals):
        weather_data = []
        for wd in vals:
            tmp_wd = {
                'main': wd['main'],
                'description': wd['description'],
                'icon': cls._icon_url_template.format(icon=wd['icon'])
            }
            weather_data.append(tmp_wd)
        return weather_data
    
    @staticmethod
    def __build_clouds(vals):
        return vals['all']
    
    @staticmethod
    def __build_wind(vals):
        return {
            'speed_m_s': vals['speed'],
            'deg': vals['deg'],
            'gust_m_s': vals.get('gust', None)
        }
    
    @staticmethod
    def __build_snow_rain(vals):
        return {
            '1h_mm': vals.get('1h', None) if vals is not None else None, 
            '3h_mm': vals.get('3h', None) if vals is not None else None
        }
    
    @staticmethod
    def __build_date(vals, key):
        date_val = vals.get(key, None)
        match date_val:
            case int():
                return datetime.fromtimestamp(date_val)
            case str():
                return datetime.strptime(date_val, '%Y-%m-%d %H:%M:%S')
            case _:
                return date_val
    
    @classmethod
    def __build_sys(cls, vals):
        if vals.get('sys', None) is None:
            tmp_vals = vals['city']
        else:
            tmp_vals = vals['sys']

        return {
            'country': tmp_vals.get('country', None),
            'sunrise': cls.__build_date(tmp_vals, 'sunrise'),
            'sunset': cls.__build_date(tmp_vals, 'sunset'),
            'part_of_day': tmp_vals.get('pod', None),
            'coord': cls.__build_coords(tmp_vals.get('coords', None))
        }
    
    def __process_records(self, resp):
        tmp = resp.get('list', None)
        if tmp is not None:
            return [self.__build_record(data) for data in tmp]
        else:
            return self.__build_record(resp)

In [6]:
# Decode the current-weather response once and hand the same dict to the builder
# (the body was previously parsed twice).
r = json.loads(response2.text)
# Kept for ad-hoc use in the notebook; WeatherBuilder carries its own copy of this template.
icon_url_template = 'https://openweathermap.org/img/wn/{icon}@2x.png'

# This payload has no 'list' key, so `c.proc_data` is a single flattened dict.
# The hand-written dict that used to be sketched here in comments is superseded
# by WeatherBuilder and has been removed.
c = WeatherBuilder(r)

In [7]:
# Fetch the forecast payload (metric units) for the coordinates resolved above.
five_day_3h_forecast_url = 'https://api.openweathermap.org/data/2.5/forecast'

lat = loc_data['lat']
lon = loc_data['lon']

response3 = requests.get(
    five_day_3h_forecast_url,
    # Passing the query as `params` lets requests handle the encoding.
    params={'lat': lat, 'lon': lon, 'appid': key, 'units': 'metric'},
    timeout=30,  # never let the notebook hang on a stalled connection
)
# Stop on a 4xx/5xx answer rather than feeding an error payload to WeatherBuilder.
response3.raise_for_status()

print(response3.text)


{"cod":"200","message":0,"cnt":40,"list":[{"dt":1703667600,"main":{"temp":20.64,"feels_like":20.12,"temp_min":19.58,"temp_max":20.64,"pressure":1022,"sea_level":1022,"grnd_level":1015,"humidity":52,"temp_kf":1.06},"weather":[{"id":802,"main":"Clouds","description":"scattered clouds","icon":"03d"}],"clouds":{"all":46},"wind":{"speed":2.59,"deg":109,"gust":2.66},"visibility":10000,"pop":0,"sys":{"pod":"d"},"dt_txt":"2023-12-27 09:00:00"},{"dt":1703678400,"main":{"temp":19.97,"feels_like":19.51,"temp_min":19.37,"temp_max":19.97,"pressure":1023,"sea_level":1023,"grnd_level":1015,"humidity":57,"temp_kf":0.6},"weather":[{"id":803,"main":"Clouds","description":"broken clouds","icon":"04n"}],"clouds":{"all":62},"wind":{"speed":3.42,"deg":89,"gust":3.76},"visibility":10000,"pop":0,"sys":{"pod":"n"},"dt_txt":"2023-12-27 12:00:00"},{"dt":1703689200,"main":{"temp":19.35,"feels_like":19.06,"temp_min":19.35,"temp_max":19.35,"pressure":1023,"sea_level":1023,"grnd_level":1015,"humidity":66,"temp_kf":0

In [8]:
# Decode the forecast response body. This payload wraps its records in a 'list'
# key, so WeatherBuilder.proc_data is a list of per-record dicts here.
r = json.loads(response3.text)
d = WeatherBuilder(r)

In [9]:
# Forecast records -> pandas frame (one row per record), then on to polars.
tmp = pd.DataFrame.from_records(d.proc_data)

# Wide forecast table: flatten the 'sys' and 'main' structs, set 'weather' aside
# (it gets its own table below) and split every datetime column into a date
# column and a time column.
tmp2 = (
    pl.DataFrame(tmp)
    .unnest('sys')
    .unnest('main')
    .drop('weather')
    .with_columns(
        [
            pl.col('part_of_day').cast(pl.Utf8, strict=False).alias('part_of_day'),
            pl.col('sunrise').cast(pl.Datetime, strict=False).alias('sunrise'),
        ]
        + [
            split_expr
            # (source column, output prefix): 'forecasted_date' is overwritten with
            # its date part, every other output is a new column.
            for source, prefix in (
                ('sunrise', 'sunrise'),
                ('sunset', 'sunset'),
                ('date', 'measure'),
                ('forecasted_date', 'forecasted'),
            )
            for split_expr in (
                pl.col(source).dt.date().alias(f'{prefix}_date'),
                pl.col(source).dt.time().alias(f'{prefix}_time'),
            )
        ]
    )
    .drop(['sunrise', 'sunset', 'date'])
)

# Long weather-condition table: one row per (forecast timestamp, condition entry).
tmp3 = (
    pl.DataFrame(tmp[['forecasted_date', 'weather']])
    .explode('weather')
    .unnest('weather')
)

In [10]:
# Wide current-weather table: a single-row frame built straight from the record
# dict, with the 'main' and 'sys' structs flattened and each datetime column
# split into a date column and a time column.
tmp4 = (
    pl.DataFrame([c.proc_data])
    .unnest('main')
    .unnest('sys')
    .with_columns(
        [
            split_expr
            # (source column, output prefix)
            for source, prefix in (
                ('sunrise', 'sunrise'),
                ('sunset', 'sunset'),
                ('date', 'measure'),
            )
            for split_expr in (
                pl.col(source).dt.date().alias(f'{prefix}_date'),
                pl.col(source).dt.time().alias(f'{prefix}_time'),
            )
        ]
    )
    .drop(['sunrise', 'sunset', 'date'])
)

# Long weather-condition table: one row per condition entry of the observation.
tmp5 = (
    tmp4
    .select([pl.col('forecasted_date'), pl.col('weather')])
    .explode('weather')
    .unnest('weather')
)

In [11]:
def get_bq_type(col_name, 
                data_type, 
                main_col_name,
                override_table_schema,
                override_col_list):
    bq_mode = 'NULLABLE'
    bq_type = None
               
    match data_type:
        case pl.Float64:
            bq_type = 'FLOAT64'
        case pl.Int32 | pl.Int64:
            bq_type = 'INT64'
        case pl.Utf8:
            bq_type = 'STRING'
        case pl.Boolean:
            bq_type = 'BOOL'
        case pl.Datetime:
            bq_type = 'DATETIME'
            # bq_type = 'TIMESTAMP'
            bq_mode = 'REQUIRED'
        case pl.Date:
            bq_type = 'DATE'
            bq_mode = 'REQUIRED'
        case pl.Time:
            bq_type = 'TIME'
            bq_mode = 'REQUIRED'
        case _:
            pass
            # match col_name:
                # case '1h_mm' | '3h_mm' | 'lat' | 'lon':
                #     bq_type = 'FLOAT64'
                    # if override_col_list is not None:
                    #     if main_col_name in override_col_list:
                    #         match main_col_name:
                    #             case 'rain':
                    #                 bq_type = 'INT64'
                    #             case 'feels_like_c':
                    #                 bq_type = 'FLOAT64'
                    #             case _:
                    #                 pass
                # case 'part_of_day':
                #     bq_type = 'STRING'
                # case _:
                #     raise Exception('Error processing BQ data type')
                
    if override_col_list is not None:
        if main_col_name in override_col_list:
            match col_name:
                case 'feels_like_c' :
                    bq_type = 'FLOAT64'
                case '1h_mm' | '3h_mm' | 'lat' | 'lon':
                    if override_table_schema == 'projection':
                        bq_type = 'FLOAT64'
                    elif override_table_schema == 'current':
                        bq_type = 'INT64'
                case 'part_of_day':
                    bq_type = 'STRING'
                case _:
                    raise Exception('Error processing BQ data type')
    return bq_type, bq_mode
            
def create_bq_schema(schema, override_table_schema=None, override_col_list=None):
    """Translate a polars schema into a list of ``bigquery.SchemaField`` objects.

    Args:
        schema: mapping of column name -> polars dtype (e.g. ``DataFrame.schema``).
        override_table_schema: forwarded to ``get_bq_type``.
        override_col_list: forwarded to ``get_bq_type``.

    Returns:
        One ``SchemaField`` per column, in schema order:

        * struct column -> ``RECORD`` / ``NULLABLE`` with one sub-field per struct field;
        * list-of-struct column -> ``RECORD`` / ``REPEATED`` with sub-fields;
        * list of anything else -> element type from ``get_bq_type`` / ``REPEATED``
          (this case used to fail because ``to_schema()`` was called on a non-struct);
        * any other column -> type and mode from ``get_bq_type``.

    Only one level of nesting is expanded; sub-fields keep the default mode.
    """
    _bq_schema_list = []
    for col_name, data_type in schema.items():
        is_list = isinstance(data_type, pl.List)
        # A list column describes its elements through `.inner`.
        element_type = data_type.inner if is_list else data_type

        if isinstance(element_type, pl.Struct):
            _tmp_struct_list = []
            for col_name2, data_type2 in element_type.to_schema().items():
                bq_type2, _ = get_bq_type(col_name2,
                                          data_type2,
                                          col_name,
                                          override_table_schema,
                                          override_col_list)
                # '1h_mm' -> 'mm_1h', '3h_mm' -> 'mm_3h' (presumably so the field
                # name does not start with a digit -- confirm).
                if col_name2 in ['1h_mm', '3h_mm']:
                    col_name2 = '_'.join([col_name2.split('_')[-1], col_name2.split('_')[0]])
                _tmp_struct_list.append(bigquery.SchemaField(col_name2, bq_type2))

            _field = bigquery.SchemaField(col_name,
                                          'RECORD',
                                          mode='REPEATED' if is_list else 'NULLABLE',
                                          fields=tuple(_tmp_struct_list))
        else:
            bq_type, bq_mode = get_bq_type(col_name,
                                           element_type,
                                           col_name,
                                           override_table_schema,
                                           override_col_list)
            if is_list:
                bq_mode = 'REPEATED'
            _field = bigquery.SchemaField(col_name, bq_type, mode=bq_mode)

        _bq_schema_list.append(_field)
    return _bq_schema_list

def create_load_bq_dataset(client, dataset_name):
    """Create ``dataset_name`` in the client's project, or reuse it if it exists.

    The confirmation line is printed in both cases because ``exists_ok=True``
    makes an existing dataset indistinguishable from a new one here.

    Returns:
        The dataset object returned by ``client.create_dataset``.
    """
    requested = bigquery.Dataset(f'{client.project}.{dataset_name}')
    ensured = client.create_dataset(requested, timeout=30, exists_ok=True)
    print("Created dataset {}.{}".format(client.project, ensured.dataset_id))
    return ensured
    
def create_load_bq_table(client, dataset, table_name, table_schema):
    """Create ``table_name`` inside ``dataset`` with ``table_schema``, or reuse it.

    The confirmation line is printed whether the table was new or already
    present (``exists_ok=True``).

    Returns:
        The table object returned by ``client.create_table``.
    """
    requested = bigquery.Table(
        f'{client.project}.{dataset.dataset_id}.{table_name}',
        schema=table_schema,
    )
    ensured = client.create_table(requested, timeout=30, exists_ok=True)
    print('Created table {}.{}.{}'.format(ensured.project, ensured.dataset_id, ensured.table_id))
    return ensured

def load_table_to_gcp(client, data, dataset_name, table_name, table_schema, src_format='polars'):
    """Ensure the dataset and table exist, then load ``data`` into the table.

    Args:
        client: BigQuery client used for every call.
        data: polars DataFrame to upload.
        dataset_name: target dataset (created if needed).
        table_name: target table (created with ``table_schema`` if needed).
        table_schema: list of ``bigquery.SchemaField`` used for table and load job.
        src_format: ``'polars'`` uploads an in-memory Parquet file; ``'pandas'``
            converts with ``data.to_pandas()`` and appends via the dataframe loader.

    Raises:
        ValueError: ``src_format`` is neither ``'polars'`` nor ``'pandas'``.
    """
    # Validate first: an unsupported format used to be rejected only after the
    # dataset and table had already been created.
    if src_format not in ('polars', 'pandas'):
        raise ValueError(
            f"Improper option: src_format must be 'polars' or 'pandas', got {src_format!r}"
        )

    dataset = create_load_bq_dataset(client, dataset_name)
    table = create_load_bq_table(client, dataset, table_name, table_schema)

    if src_format == 'polars':
        with io.BytesIO() as stream:
            data.write_parquet(stream)
            stream.seek(0)  # rewind so the upload reads the buffer from the start
            job = client.load_table_from_file(
                stream,
                destination=table,
                job_config=bigquery.LoadJobConfig(
                    source_format=bigquery.SourceFormat.PARQUET,
                    ignore_unknown_values=True,
                    schema=table_schema
                ),
            )
        job.result()  # block until the load job finishes
    else:
        job_config = bigquery.LoadJobConfig(schema=table_schema, write_disposition="WRITE_APPEND")
        job = client.load_table_from_dataframe(data.to_pandas(), table, job_config=job_config)
        job.result()  # block until the load job finishes

    print(f'Data loaded to {table.project}.{table.dataset_id}.{table.table_id}!')

In [12]:
# BigQuery schemas for the four frames. The wide tables (tmp2 = forecast,
# tmp4 = current) pass the columns whose types get_bq_type must force;
# the long weather-condition tables (tmp3, tmp5) rely on inferred types only.
schema_projected_tmp2 = create_bq_schema(
    tmp2.schema,
    override_table_schema='projection',
    override_col_list=['part_of_day', 'rain', 'wind', 'coord'],
)
schema_projected_tmp3 = create_bq_schema(tmp3.schema)
schema_projected_tmp4 = create_bq_schema(
    tmp4.schema,
    override_table_schema='current',
    override_col_list=['feels_like_c', 'rain', 'wind', 'coord'],
)
schema_projected_tmp5 = create_bq_schema(tmp5.schema)

In [13]:
client = bigquery.Client()

# (frame, destination table, schema) for each upload, processed in this order.
# All four tables go into the OPEN_WEATHER dataset through the pandas loader.
for _frame, _table_name, _table_schema in (
    (tmp2, 'OW_PROJECTION_WEATHER', schema_projected_tmp2),
    (tmp3, 'OW_PROJECTION_WEATHER_URL', schema_projected_tmp3),
    (tmp4, 'OW_CURRENT_WEATHER', schema_projected_tmp4),
    (tmp5, 'OW_CURRENT_WEATHER_URL', schema_projected_tmp5),
):
    load_table_to_gcp(client, _frame, 'OPEN_WEATHER', _table_name, _table_schema, src_format='pandas')

Created dataset ordinal-stone-402505.OPEN_WEATHER
Created table ordinal-stone-402505.OPEN_WEATHER.OW_PROJECTION_WEATHER
Data loaded to ordinal-stone-402505.OPEN_WEATHER.OW_PROJECTION_WEATHER!
Created dataset ordinal-stone-402505.OPEN_WEATHER
Created table ordinal-stone-402505.OPEN_WEATHER.OW_PROJECTION_WEATHER_URL
Data loaded to ordinal-stone-402505.OPEN_WEATHER.OW_PROJECTION_WEATHER_URL!
Created dataset ordinal-stone-402505.OPEN_WEATHER
Created table ordinal-stone-402505.OPEN_WEATHER.OW_CURRENT_WEATHER
Data loaded to ordinal-stone-402505.OPEN_WEATHER.OW_CURRENT_WEATHER!
Created dataset ordinal-stone-402505.OPEN_WEATHER
Created table ordinal-stone-402505.OPEN_WEATHER.OW_CURRENT_WEATHER_URL
Data loaded to ordinal-stone-402505.OPEN_WEATHER.OW_CURRENT_WEATHER_URL!
