In [1]:
import pandas as pd
import codecs
from pykakasi import kakasi
import mojimoji

In [None]:
def jp2en(name):
    """Romanize a Japanese name and capitalize the result.

    The text is first passed through ``mojimoji.han_to_zen`` with
    ``ascii=False`` (half-width kana are widened, ASCII is left untouched),
    then katakana and kanji are transliterated to ASCII with Hepburn
    romanization. Hiragana conversion is intentionally not enabled.

    Parameters
    ----------
    name : str
        Japanese text, e.g. a station name written in half-width katakana.

    Returns
    -------
    str
        The romanized text after ``str.capitalize`` (first character
        upper-cased, the remainder lower-cased).
    """
    # The instance must NOT be bound to the name ``kakasi``: assigning to
    # that name makes it local to this function, so the constructor call
    # itself would raise UnboundLocalError.
    romanizer = kakasi()

    widened = mojimoji.han_to_zen(name, ascii=False)

    romanizer.setMode("K", "a")  # Katakana to ascii
    romanizer.setMode("J", "a")  # Japanese (kanji) to ascii
    romanizer.setMode("r", "Hepburn")  # Use Hepburn romanization

    romanized = romanizer.getConverter().do(widened)
    return str.capitalize(romanized)

In [None]:
def read_master_file(path="ame_master.csv", encoding="Shift-JIS"):
    """Read the observation-point master CSV into a DataFrame.

    The Japanese column headers are renamed to English and the frame is
    indexed by ``("area", "point")``.

    Parameters
    ----------
    path : str, optional
        Location of the master CSV. Defaults to ``"ame_master.csv"`` in the
        current working directory.
    encoding : str, optional
        Text encoding of the file. Defaults to ``"Shift-JIS"``. Bytes that
        cannot be decoded are dropped rather than raising an error.

    Returns
    -------
    pandas.DataFrame
        Observation points indexed by area and point name. Headers that are
        not listed in the mapping keep their original name.
    """
    # Japanese header -> English column name. The spelling "adress" is kept
    # as-is because it is part of the returned schema.
    english_names = {"都府県振興局":"area",
                     "観測所番号":"observation ID",
                     "種類":"type",
                     "観測所名":"point",
                     "ｶﾀｶﾅ名":"katakana",
                     "所在地":"adress",
                     "緯度(度)":"latitude degree",
                     "緯度(分)":"latitude minute",
                     "経度(度)":"longitude degree",
                     "経度(分)":"longitude minute",
                     "海面上の高さ(ｍ)":"above sea level (m)",
                     "風速計の高さ(ｍ)":"anemometer height (m)",
                     "温度計の高さ(ｍ)":"thermometer height (m)",
                     "観測開始年月日":"observation beginning date",
                     "備考1":"remark1",
                     "備考2":"remark2"}

    # newline="" hands the raw line endings to the CSV parser unchanged.
    with open(path, "r", encoding=encoding, errors="ignore", newline="") as file:
        observation_points = pd.read_csv(file)

    # Convert column names from Japanese to English, then index by the name
    # of the area and the observation point.
    return observation_points.rename(columns=english_names).set_index(["area", "point"])

In [2]:
def merge_data(df, observation_points):
    """Attach station name and coordinate columns to ``df``.

    Only the katakana name and the latitude/longitude degree and minute
    columns of ``observation_points`` are kept. They are left-joined onto
    ``df`` by matching the index of ``df`` against the "area" and "point"
    keys of ``observation_points``.
    """
    wanted_columns = [
        "katakana",
        "latitude degree",
        "latitude minute",
        "longitude degree",
        "longitude minute",
    ]
    station_info = observation_points[wanted_columns]

    # Left join: every row of df is kept, matched or not.
    return df.merge(
        station_info,
        how="left",
        left_index=True,
        right_on=["area", "point"],
    )

In [None]:
def process_data(df):
    """Prepare the merged frame for analysis.

    Steps:

    1. Drop rows whose "katakana" value is missing, i.e. rows that received
       no information from the observation-point table.
    2. Romanize the "katakana" column with ``jp2en``.
    3. Add "latitude" and "longitude" columns in decimal degrees, computed
       from the "<axis> degree" and "<axis> minute" columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing "katakana", "latitude degree", "latitude minute",
        "longitude degree" and "longitude minute" columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``df`` is not modified.
    """
    # Work on an explicit copy so the column assignments below never write
    # into a slice of the caller's frame (avoids SettingWithCopyWarning).
    located = df.loc[df["katakana"].notna()].copy()

    # Convert the Japanese "katakana" name to roman characters.
    located["katakana"] = located["katakana"].map(jp2en)

    # The minute columns may carry a fractional part. That fraction is a
    # fraction of a minute, so the whole value is divided by 60; dividing
    # the fraction by 3600 would treat it as seconds and understate it.
    for axis in ("latitude", "longitude"):
        located[axis] = located[axis + " degree"] + located[axis + " minute"] / 60

    return located

In [None]:
def create_csv(df, path="japan_average_temp_1976-2020.csv"):
    """Write ``df`` to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export; its index is written as the leading column(s).
    path : str, optional
        Destination file. Defaults to ``"japan_average_temp_1976-2020.csv"``
        in the current working directory.
    """
    df.to_csv(path)