## Getting Daily Weather Forecast from Jeju Datahub
1. Daily weather forecast from 2014/01/01 to 2024/06/30
  1. Python -> request and date
  ``https://open.jejudatahub.net/api/proxy/1aD5taat1attaa51Db1511b51ab9Da19/{your_projectKey}?{params(key=value)}``
2. Store the data into `jejuIsland.csv`
3. Preprocess the data based on `observatoryName` and create a csv file based on the name.
  1. Each csv file should have 10 years of daily weather forecast of `observatoryName`
  2. Then replace missing values with neighbor values



## Installation, setting and import

### Installation & Setting
- Google Drive directory setting

In [1]:
from google.colab import drive
import os

# Mount Google Drive so the CSV output directory below is reachable.
drive.mount('/content/drive')

# exist_ok=True creates the directory only when it is missing, so the cell can
# be re-run safely without a separate existence check.
os.makedirs('/content/drive/MyDrive/CSVs', exist_ok=True)

Mounted at /content/drive


### import
- requests
- datetime
- time
- google drive
- shutil

In [2]:
# requests
import requests
# datetime
from datetime import datetime, timedelta
# time - pause crawling for 0.5 sec
import time
# Userdata for secret key
from google.colab import userdata
# pandas
import pandas as pd
# moving files between dirs
import shutil
# numpy
import numpy as np

## Preprocessing functions
- `dates_list_generation()`
- `json_parsing_200()`
- `missing_row_500()`
- `duplicate_eraser()`
- `missing_value_handling()`

In [3]:
## dates_list_generation
def dates_list_generation(start_date, end_date):
  """Return every day from start_date to end_date as 'YYYYMMDD' strings.

  Both ends are inclusive and the step is one day. An empty list is returned
  when start_date is later than end_date.
  """
  # Number of whole days between the two ends; negative when start > end,
  # which makes the range below empty.
  whole_days = (end_date - start_date).days
  return [
      (start_date + timedelta(days=offset)).strftime('%Y%m%d')
      for offset in range(whole_days + 1)
  ]

In [4]:
## Json -> CSV
def json_parsing_200(data, file_path):
  """Append each record of a decoded API response to its observatory's CSV.

  Args:
    data: Decoded JSON response. Records are read from its 'data' key; a
      missing or empty 'data' entry means there is nothing to write.
    file_path: Directory holding one '{observatoryName}.csv' per observatory.

  Every record is written with all the fields it carries. A record without an
  'observatoryName' is skipped, because no target file can be derived from it.
  """
  records = data.get('data') if isinstance(data, dict) else None
  for record in records or []:
    ob_name = record.get('observatoryName')
    if not ob_name:
      print(f"Skipping record without observatoryName: {record}")
      continue

    new_row = pd.DataFrame([record])

    # Google Drive folder
    # JejuWeather/{observatoryName}.csv
    new_file_path = os.path.join(file_path, f'{ob_name}.csv')
    if os.path.exists(new_file_path):
      # Re-read and rewrite the file so columns stay aligned even when a
      # record carries a different set of fields than earlier ones.
      df = pd.read_csv(new_file_path)
      df = pd.concat([df, new_row], ignore_index=True)
      df.to_csv(new_file_path, index=False)
    else:
      new_row.to_csv(new_file_path, index=False)

In [60]:
def missing_row_500(baseDate, observatoryName):
  """Build a one-row DataFrame that stands in for a date without data.

  The row carries the observatory name and the date (converted to int); every
  measurement column is NaN.
  """
  measurement_columns = (
      'averageTemperature',
      'lowestTemperature',
      'lowestTemperatureTime',
      'highestTemperature',
      'highestTemperatureTime',
      'dailyRainfall',
      'maximumWindSpeed',
      'maximumWindSpeedTime',
      'averageWindSpeed',
      'maximumWindSpeedDirection',
  )
  # Identification columns first, then the NaN measurements, so the column
  # order of the resulting frame stays fixed.
  placeholder = {'observatoryName': observatoryName, 'baseDate': int(baseDate)}
  placeholder.update(dict.fromkeys(measurement_columns, np.nan))
  return pd.DataFrame([placeholder])

In [6]:
## duplicate_eraser()
def duplicate_eraser(df):
  """Return df with only the first row kept for each baseDate value."""
  # True for every row whose baseDate already appeared in an earlier row.
  repeated = df.duplicated(subset='baseDate')
  return df[~repeated]

In [7]:
## missing value handling
def missing_value_handling(df):
  """Fill missing values from neighbouring rows and return the new frame.

  Each gap is first filled with the next valid value below it (backward
  fill); anything still missing at the end of a column is then filled with
  the last valid value above it (forward fill). The input is not modified.
  """
  # bfill()/ffill() replace fillna(method=...), which is deprecated in
  # recent pandas releases.
  return df.bfill().ffill()

## Execution
1. Generating dates: 10 years
2. Sending requests & Changing format
  - response -> json
  - json -> csv
3. Saving into Google Drive
4. Remove duplicated values -> based on baseDate
5.  Data validity check
  - Data that has not been updated for a year cannot be used because it is no longer accurate
  - A CSV file whose latest record is more than 15 days old is not used
  - A CSV file covering fewer than 1000 days is not used
    - It will be moved to the `Not_Valid` directory
6. Missing Value
  - Backward fill: `df.fillna(method='bfill')`
  - Forward fill: `df.fillna(method='ffill')`

In [None]:
# Step 1, 2, 3
# Request the daily records for every date in the range, one request per
# date, and append them to the per-observatory CSV files under file_path.
start_date = datetime(2024,1,1)
end_date = datetime(2024,7,2)
dates = dates_list_generation(start_date, end_date)
file_path = '/content/drive/MyDrive/CSVs/JejuWeather'
os.makedirs(file_path, exist_ok=True)
start_time = time.time()
# The secret does not change between requests, so it is read once.
project_key = userdata.get('jejuWeatherKey')
for year_month_day in dates:
  url = f'https://open.jejudatahub.net/api/proxy/1aD5taat1attaa51Db1511b51ab9Da19/{project_key}?searchDate={year_month_day}'
  # Pause between requests to avoid hammering the API.
  time.sleep(1)
  try:
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
  except requests.RequestException as e:
    # Only the exception type is printed: the message may contain the URL,
    # and the URL embeds the project key.
    print(f"Request error for {year_month_day}: {type(e).__name__}")
    continue
  if response.status_code == 200:
    try:
      print(f"Status 200 JSON for {year_month_day}")
      response_json = response.json()
      json_parsing_200(response_json, file_path)
    except ValueError as e:
      print(f"Error decoding JSON for {year_month_day}: {e}")
      print(response.text)
  else:
    print(f"Request failed for {year_month_day} with status code: {response.status_code}")
    print(response.text)

current_time = time.time()
print(f"Elapsed time: {current_time - start_time} seconds")

In [None]:
## Step 4 Erasing duplicated values
# Rewrite every station CSV in place with one row per baseDate, printing the
# frame size before and after so the effect is visible.
directory_path = '/content/drive/MyDrive/CSVs/JejuWeather'

if (not os.path.exists(directory_path)):
  print('Directory not exist')
  exit()

# Collect the CSV files of the directory, in listing order.
csv_files = []
for entry in os.listdir(directory_path):
  if entry.endswith('.csv'):
    csv_files.append(entry)

# # Manual check on a single file:
# test_df = pd.read_csv(os.path.join(directory_path, '강정.csv'))
# print(test_df.size)
# test_df = duplicate_eraser(test_df)
# print(test_df.size)
for csv_file in csv_files:
  file_path = os.path.join(directory_path, csv_file)
  df = pd.read_csv(file_path)
  size_before = df.size
  print(csv_file)
  print(size_before)
  no_dup_df = duplicate_eraser(df)
  size_after = no_dup_df.size
  print(size_after)
  no_dup_df.to_csv(file_path, index=False)



In [15]:
# Step 5 Data validity check
# Moves every CSV that fails the checks below into the Not_Valid directory.
# directory_path is the one set in the Step 4 cell.

# A file is rejected when its latest record is older than this many days ...
MAX_DAYS_SINCE_LAST_RECORD = 15
# ... or when the span between its first and last record is shorter than this.
MIN_COVERED_DAYS = 1000

csv_files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]
not_valid_dir = '/content/drive/MyDrive/CSVs/JejuWeather/Not_Valid'
os.makedirs(not_valid_dir, exist_ok=True)
for csv_file in csv_files:
  file_path = os.path.join(directory_path, csv_file)
  df = pd.read_csv(file_path)
  base_dates = df['baseDate'].dropna()
  if base_dates.empty:
    # A file without any dated row cannot satisfy either check.
    is_valid = False
  else:
    # min/max instead of first/last row: the files are not sorted by date
    # at this step.
    first_date = datetime.strptime(str(int(base_dates.min())), "%Y%m%d")
    last_date = datetime.strptime(str(int(base_dates.max())), "%Y%m%d")
    days_passed = (datetime.now() - last_date).days
    total_days = (last_date - first_date).days
    is_valid = (days_passed <= MAX_DAYS_SINCE_LAST_RECORD
                and total_days >= MIN_COVERED_DAYS)
  if not is_valid:
    print(f"{csv_file} is not valid")
    shutil.move(file_path, not_valid_dir)

강정.csv is not valid
한림.csv is not valid


In [None]:
## Step 6 Missing Values & Rows
# Filling empty rows: every CSV gets one row per day, from its own first date
# up to the latest date found in any CSV. Days without data get a NaN row.
directory_path = '/content/drive/MyDrive/CSVs/JejuWeather'
csv_files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]

# Getting the last date among csv files.
# The starting value (7 days before today) acts as a lower bound.
last_date = int((datetime.now()-timedelta(days=7)).strftime('%Y%m%d'))
for csv_file in csv_files:
  file_path = os.path.join(directory_path, csv_file)
  df = pd.read_csv(file_path)
  if df.empty:
    continue
  # max() rather than the last row, so the result does not depend on the
  # file already being sorted.
  last_date = max(last_date, int(df['baseDate'].max()))

# The end of the range is the same for every file.
last_date_of_current = datetime.strptime(str(last_date), "%Y%m%d")

# filling missing rows
for csv_file in csv_files:
  file_path = os.path.join(directory_path, csv_file)
  df = pd.read_csv(file_path)
  print(csv_file)
  if df.empty:
    print(f"{csv_file} has no rows, skipped")
    continue
  first_date_of_current = datetime.strptime(str(int(df['baseDate'].min())), "%Y%m%d")
  # splitext keeps observatory names that themselves contain a dot intact.
  observatoryName = os.path.splitext(csv_file)[0]
  # Set lookup instead of scanning the whole frame once per day.
  existing_dates = set(df['baseDate'].tolist())
  missing_rows = [
      missing_row_500(day, observatoryName)
      for day in dates_list_generation(first_date_of_current, last_date_of_current)
      if int(day) not in existing_dates
  ]
  df = pd.concat([df] + missing_rows, ignore_index=True)
  df = duplicate_eraser(df)
  df = df.sort_values(by='baseDate')
  df.to_csv(file_path, index=False)

# checking missing rows
# Walks the same range the fill step targeted (first date of the file up to
# last_date) and counts the days that still have no row.
print("\nMissing rows check")
for csv_file in csv_files:
  file_path = os.path.join(directory_path, csv_file)
  df = pd.read_csv(file_path)
  if df.empty:
    print(f"{csv_file} has no rows")
    continue
  first_date_of_current = datetime.strptime(str(int(df['baseDate'].min())), "%Y%m%d")
  existing_dates = set(df['baseDate'].tolist())
  counter = sum(
      1
      for day in dates_list_generation(first_date_of_current, last_date_of_current)
      if int(day) not in existing_dates
  )
  print(f"{csv_file} has {counter} missing rows")

In [None]:
## Step 6 Missing Values & Rows
# Filling missing values: each CSV is rewritten in place with the gaps of its
# numeric columns filled by linear interpolation.
directory_path = '/content/drive/MyDrive/CSVs/JejuWeather'
csv_files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]

for csv_file in csv_files:
  file_path = os.path.join(directory_path, csv_file)
  df = pd.read_csv(file_path)
  # choosing linear interpolation
  print(csv_file)
  # Interpolate numeric columns only: text columns such as observatoryName
  # cannot be interpolated, and calling interpolate on object-dtype columns
  # is deprecated in recent pandas releases.
  numeric_columns = df.select_dtypes(include='number').columns
  df_filled = df.copy()
  if len(numeric_columns) > 0:
    # NOTE(review): with the default direction, NaNs before the first valid
    # value of a column stay NaN — confirm whether that is acceptable.
    df_filled[numeric_columns] = df[numeric_columns].interpolate(method='linear')
  print("Done")
  df_filled.to_csv(file_path, index=False)