In [1]:
from urllib import request
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
import math
import ssl

# Work around SSL certificate errors raised when fetching pages over HTTPS.
# NOTE: this swaps the factory urllib uses to build its default HTTPS context
# for one that performs no certificate or hostname verification. The change is
# process-wide: every HTTPS request made through urllib after this line is
# unverified, so only run this notebook against hosts you trust.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
def get_historical(area, data_type):
    """Download the monthly JMA observation tables of one area for 1976-2020.

    For each year the monthly statistics page is fetched and the fifth
    ``<table>`` element of the page is parsed into a DataFrame (one row per
    ``<a>`` label found in the table, one column per ``<th>`` header after the
    first). Years whose page has fewer than five tables are skipped.

    Parameters
    ----------
    area : int
        Value of the ``prec_no`` query parameter (the area ID).
    data_type : int or str
        Suffix appended to ``view=p`` in the URL to select the data view.

    Returns
    -------
    df : pandas.DataFrame
        Raw cell texts indexed by ``"<year>-<label>-15"`` strings, with one
        column per observation point. A point that is absent from some years
        holds NaN for those years.
    area_name : str
        Text of the last ``<h3>`` heading of the last parsed page, cut at the
        first "地方", "都", "府" or "県" (with two hard-coded corrections).

    Raises
    ------
    ValueError
        If no year yielded a data table. (The previous implementation failed
        with an accidental ``UnboundLocalError`` in this situation.)
    """
    area_name = ""
    yearly_frames = []

    for year in range(1976, 2021):

        # build the URL from the area ID, the year and the data view
        url = (
            "https://www.data.jma.go.jp/obd/stats/etrn/view/monthly_h1.php?prec_no="
            + str(area)
            + "&block_no=00&year="
            + str(year)
            + "&month=1&day=&view=p"
            + str(data_type)
        )

        # fetch and parse the page; the context manager closes the connection
        # NOTE(review): no parser is passed, so bs4 picks whichever is installed
        # (and warns). Pinning one may change how the page is parsed - confirm
        # before changing.
        with request.urlopen(url) as response:
            soup = BeautifulSoup(response)

        # the data table is the fifth table of the page; skip years without it
        tables = soup.find_all("table")
        if len(tables) < 5:
            continue
        table = tables[4]

        # area name: the last <h3> heading wins, cut at the first suffix
        for heading in soup.find_all("h3"):
            area_name = re.split("地方|都|府|県", heading.text)[0]

        # row labels (yyyy-mm-15 for 1976-2020), taken from the table's links
        index_list = [
            str(year) + "-" + link.text + "-" + "15" for link in table.find_all("a")
        ]

        # column labels (observation points); the first header is the label column
        col_list = [header.text for header in table.find_all("th")][1:]

        # every table row holds the label cell plus one cell per point
        col_num = len(col_list) + 1

        # slice the flat list of cells into rows, dropping each label cell
        cells = table.find_all("td")
        row_list = [
            [cell.text for cell in cells[1 + i * col_num : col_num + i * col_num]]
            for i in range(len(index_list))
        ]

        local_df = pd.DataFrame(row_list, index=index_list, columns=col_list)

        # keep only the first column when observation point names collide
        local_df = local_df.loc[:, ~local_df.columns.duplicated()]

        yearly_frames.append(local_df)

    if not yearly_frames:
        raise ValueError(
            "no data table found for area {} (data type {})".format(area, data_type)
        )

    # a single outer-join concat aligns the columns of all years; points that
    # appear only in some years are filled with NaN elsewhere
    df = pd.concat(yearly_frames, axis=0)

    if area == 17:
        area_name = "ｵﾎｰﾂｸ"

    # "京都府" is cut at "都" by the split above, leaving only "京"
    if area_name == "京":
        area_name = "京都"

    return df, area_name

In [3]:
def get_df(start=11, end=92, d=2):
    """Download every area in ``[start, end)`` and stack the results.

    Each area is fetched with ``get_historical``. Areas for which that call
    raises an exception are reported and skipped. The per-area frame is
    transposed so that rows are observation points and columns are the date
    labels, and the rows are indexed by ``(area, point)``.

    Parameters
    ----------
    start, end : int
        Half-open range of area IDs to try.
    d : int or str
        Data view passed to ``get_historical`` as ``data_type``.

    Returns
    -------
    pandas.DataFrame
        One row per ``(area, point)`` pair. If no area could be downloaded an
        empty frame with the same two index levels is returned.
    """
    area_frames = []

    for n in np.arange(start, end):

        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) can still stop a long download
        try:
            l, a = get_historical(area=n, data_type=d)
            print("area number {x} at {y} was successfull in being downloaded.".format(x=n, y=a))

        except Exception:
            print("area number {} was not avairable.".format(n))
            continue

        # rows become observation points, labelled by (area, point)
        area_frames.append(
            l.T
            .rename_axis("point")
            .assign(area=a)
            .set_index("area", append=True)
            .swaplevel("area", "point")
        )

    if not area_frames:
        # nothing was downloaded: keep the index layout callers rely on
        empty_index = pd.MultiIndex.from_arrays([[], []], names=["area", "point"])
        return pd.DataFrame(index=empty_index)

    # one concat at the end instead of re-copying the result for every area
    return pd.concat(area_frames)

In [4]:
def _parse_observation(cell):
    """Convert one raw table cell to a float; missing markers become NaN."""
    if pd.isna(cell) or cell in ("///", "×", "", " "):
        return np.nan
    # keep only the text before the first ")", "]", "." or fullwidth "（"
    # NOTE(review): "." is one of the delimiters, so the fractional part is
    # dropped ("12.3" -> 12.0). Kept as in the original - confirm it is intended.
    return float(re.split(r"\)|\]|\.|\uff08", str(cell))[0])


def _normalise_point(name):
    """Strip the "*" / "." suffix of a point name and apply fixed renames."""
    renames = {
        "奥日光（日光）": "奥日光",
        "南大東（南大東島）": "南大東",
        "つくば（館野）": "つくば",
    }
    base = re.split(r"\*|\.", name)[0]
    return renames.get(base, base)


def clean_df(df):
    """Turn the raw scraped texts into numbers and tidy the point names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of raw cell texts whose index has the levels ``area`` and
        ``point`` (as produced by ``get_df``). It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, every value converted to
        ``float64`` (cells equal to "///", "×", "" or " " and missing cells
        become NaN), indexed by ``(area, point)`` with cleaned point names.

    Raises
    ------
    ValueError
        If a cell's leading text cannot be parsed as a float.
    KeyError
        If the index does not provide ``area`` and ``point`` levels.
    """
    # build the numeric block in one go so the result is float64 regardless of
    # how the installed pandas handles column-wise ``iloc`` assignment
    numeric = np.array(
        [[_parse_observation(cell) for cell in row] for row in df.to_numpy()],
        dtype=float,
    ).reshape(df.shape)
    cleaned = pd.DataFrame(numeric, index=df.index, columns=df.columns)

    # expose the index levels as columns so the point names can be edited
    cleaned = cleaned.reset_index()

    # remove "*" (and anything after it or after ".") and apply the renames
    cleaned["point"] = cleaned["point"].map(_normalise_point)

    return cleaned.set_index(["area", "point"])