In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd


In [2]:
# Read the prefecture-name table and keep only its second row (position 1);
# that row is indexed by 'V<n>' labels further down to look up each name
prefecture_names = pd.read_csv("data/Japan_prefecture_name.csv").iloc[1]

In [None]:
# Read the air-pollution CSV of every prefecture (files numbered 1..47),
# keeping only the columns of interest and keying each table by prefecture name
features = ['date','pm10','pm25','so2','no2','o38h']
df = {}
for i in range(1, 48):
    name = prefecture_names['V{}'.format(i)]
    df[name] = pd.read_csv("data/Japan_jap1219_{}.csv".format(i))[features]
# Test output
df['Yokohama'].head(3)

In [None]:
# Total number of NaN cells in each prefecture's table
nans = {name: table.isna().sum().sum() for name, table in df.items()}

# Order the (prefecture, count) pairs from most to fewest NaN values
sorted_nans = list(nans.items())
sorted_nans.sort(key=lambda entry: entry[1], reverse=True)
# Show the 10 prefectures with the most NaN values
sorted_nans[:10]

In [None]:
# Preview the first five rows of a few example prefectures
for example_name in ('Miyazaki', 'Naha', 'Aomori', 'Utsunomiya'):
    display(df[example_name].head(5))

In [None]:
# Remove the leading rows that contain NaN values, cutting every prefecture at
# the same index so that all tables start on the same row.
first_row_index = {}
for prefecture in df:
    # Index label of the first row without any NaN value
    complete_rows = df[prefecture].dropna()
    if complete_rows.empty:
        # Fail with a clear message instead of a bare IndexError from index[0]
        raise ValueError(
            "Prefecture {!r} has no row without NaN values".format(prefecture)
        )
    first_row_index[prefecture] = complete_rows.index[0]

# Latest "first complete row" label across all prefectures
max_index = max(first_row_index.values())

# Drop the rows whose index label is smaller than max_index.
# The value above is an index *label*, so slice by label with .loc rather than
# by position with .iloc. On the original integer row index both select the
# same rows, but the labels are kept after slicing, so a positional slice would
# remove another max_index rows every time this cell is re-run.
for prefecture in df:
    df[prefecture] = df[prefecture].loc[max_index:]

# Check if there are still NaN values
nans = {}
for prefecture in df:
    num_nan = df[prefecture].isna().sum().sum()
    nans[prefecture] = num_nan

# Sort by number of NaN values
sorted_nans = sorted(nans.items(), key=lambda x: x[1], reverse=True)
# Show top 10 prefectures with most NaN values
sorted_nans[:10]


In [None]:
# Show the rows of the Naha table that contain at least one NaN value
naha = df['Naha']
naha[naha.isna().any(axis=1)]

In [8]:
# Interpolate NaN values
for prefecture in df:
    table = df[prefecture]
    # Move 'date' into a DatetimeIndex, which method='time' interpolation needs.
    # assign() builds a new frame instead of writing a column into a sliced one
    # (which can raise SettingWithCopyWarning), and the guard lets this cell be
    # re-run after 'date' has already become the index.
    if 'date' in table.columns:
        table = table.assign(date=pd.to_datetime(table['date'])).set_index('date')
    # Interpolate NaN values, weighting by the time distance between rows
    df[prefecture] = table.interpolate(method='time')

# Check if there are still NaN values; record and report only the prefectures
# that still have some (nans stays empty when everything was filled)
nans = {}
for prefecture in df:
    num_nan = df[prefecture].isna().sum().sum()
    if num_nan > 0:
        nans[prefecture] = num_nan
        print(prefecture, num_nan)


In [9]:
# Save the cleaned data as pickle file
output_path = Path('output/cleaned_data.pkl')
# Create the output directory first so the save cannot fail with
# FileNotFoundError after all the processing above has already run
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('wb') as f:
    pickle.dump(df, f)