In [41]:
import urllib.request
from bs4 import BeautifulSoup
import re
import numpy as np
import datetime
import unicodedata
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import time
import json
import os

from tqdm import tqdm_notebook as tqdm
from ipywidgets import IntProgress

In [42]:
# Download the year-index page (search_date.html) and parse it.
# The response is read inside a with-block so the HTTP connection is closed
# as soon as the body has been fetched instead of being left open.
url = "http://agora.ex.nii.ac.jp/digital-typhoon/search_date.html"
with urllib.request.urlopen(url) as response:
    nem = response.read()
soup = BeautifulSoup(nem,"html.parser")

In [43]:
# Collect the <td> cells of the two listing CSS classes separately;
# the following cells read the year labels and links out of them.
years1, years2 = (
    soup.find_all('td', class_=css_class)
    for css_class in ('LISTING1', 'LISTING0')
)

In [44]:
# Turn each listing cell into a bare year label: drop newlines and the
# '年' (year) suffix, keeping at most 35 / 34 entries per class.
y1 = [cell.text.replace('\n','').replace('年','') for cell in years1][:35]
y2 = [cell.text.replace('\n','').replace('年','') for cell in years2][:34]

# Interleave the two lists (y1[0], y2[0], y1[1], y2[1], ...).
# zip stops at the shorter list, so an unpaired trailing y1 entry is not added.
years = [label for pair in zip(y1, y2) for label in pair]

In [45]:
# Read the href of the first <a> inside each listing cell, using the same
# 35 / 34 limits as the year labels so both lists stay index-aligned.
url1 = [cell.find('a').attrs['href'] for cell in years1[:35]]
url2 = [cell.find('a').attrs['href'] for cell in years2[:34]]

# Interleave the two lists (url1[0], url2[0], url1[1], url2[1], ...).
# zip stops at the shorter list, so an unpaired trailing url1 entry is not added.
urls = [href for pair in zip(url1, url2) for href in pair]

In [46]:
# Register the 2019 season by hand, keeping years and urls index-aligned.
# NOTE(review): presumably needed because the interleaved lists do not
# already end with 2019 -- confirm against the scraped listing.
years += ['2019']
urls += ['/digital-typhoon/year/wnp/2019.html.ja']

In [47]:
def TransDate(string):
    """Parse a 'YYYY-MM-DD HH:MM' string into a datetime.datetime.

    The date and time parts must be separated by exactly one space.
    Raises ValueError when the text does not have that shape or a field
    is not an integer.
    """
    date_part, time_part = string.split(' ')
    year, month, day = (int(token) for token in date_part.split('-'))
    hour, minute = (int(token) for token in time_part.split(':'))
    return datetime.datetime(year, month, day, hour, minute)

def TransDelta(string):
    """Parse a duration written as '<days>日<hours>時間' into a timedelta.

    '日' separates the day count from the hour count and the trailing
    '時間' (hours) marker is removed before conversion.  Raises ValueError
    when the text does not contain exactly one '日' or a count is not an
    integer.
    """
    day_text, hour_text = string.split('日')
    hour_count = int(hour_text.replace('時間',''))
    day_count = int(day_text)
    return datetime.timedelta(days=day_count, hours=hour_count)

In [49]:
def _download(address):
    """Return the raw bytes served at ``address``.

    The response object is used as a context manager so the underlying
    connection is closed as soon as the body has been read.
    """
    with urllib.request.urlopen(address) as response:
        return response.read()


# Scrape every typhoon of every selected year.  dic_array maps each year
# label to a list of per-typhoon summary dicts; each typhoon's JSON document
# is additionally saved as typhoon/<year>/<number>.json.
dic_array = {}
# urls and years are index-aligned; the [26:] slice skips the first 26 entries.
for u,y in zip(tqdm(urls[26:]),years[26:]):
    # NOTE: url, nem and soup rebind the names that held the year-index page.
    url = "http://agora.ex.nii.ac.jp"+u
    nem = _download(url)
    soup = BeautifulSoup(nem,"html.parser")

    # Rows carry either the ROW1 or the ROW0 class; interleave the two lists
    # (ROW1 first) to get a single row sequence.
    table_rows1 = soup.find_all('tr',class_='ROW1')
    table_rows2 = soup.find_all('tr',class_='ROW0')
    table_rows = []
    for r1,r2 in zip(table_rows1, table_rows2):
        table_rows.append(r1)
        table_rows.append(r2)
    # zip stops at the shorter list, so pick up the unpaired final ROW1 row.
    if(len(table_rows1) > len(table_rows2)):
        table_rows.append(table_rows1[-1])

    # makedirs(..., exist_ok=True) also creates the parent "typhoon" folder
    # and tolerates an existing year folder, so the cell can be re-run;
    # os.mkdir raised in both situations.
    os.makedirs('typhoon/'+str(y), exist_ok=True)

    dic = {y:[]}
    for row in table_rows:
        columns = row.find_all('td')
        birth = TransDate(columns[4].text)
        death = TransDate(columns[5].text)
        duration = TransDelta(columns[6].text)
        hPa = float(columns[7].text)

        # Follow the link in the second column to the typhoon's detail page.
        url_low = columns[1].find('a').get('href')
        url_low = "http://agora.ex.nii.ac.jp"+url_low
        nem1 = _download(url_low)
        soup1 = BeautifulSoup(nem1,"html.parser")
        table = soup1.find_all('table',class_='METATAB')[0]
        tr = table.find_all('tr')
        # The row indices below are fixed positions inside the METATAB table
        # -- presumably tied to the page layout at scraping time; verify
        # before re-running against the live site.
        # Patterns are raw strings: '\(' is not a valid string escape and
        # triggers an invalid-escape warning in a normal literal.
        wind = tr[12].text.replace(' (knots)\n\t', '')
        area1 = re.sub(r'\(km\)|\n| ', '', tr[14].text.split('\t')[2])
        area2 = re.sub(r'\(km\)|\n| ', '', tr[18].text.split('\t')[2])
        moving = re.sub(r'\(km\)|\n|\t| ', '', tr[22].text)
        mean_velocity = re.sub(r'\(km/h\)| ', '', tr[24].text.split('|')[0])
        ACE = tr[30].text

        d = {'birth': birth, 'death': death,
             'duration': duration, 'pressure': hPa,
             'wind_velocity': float(wind), 'area1': float(area1),
             'area2': float(area2), 'moving': float(moving),
             'mean_velocity':float(mean_velocity), 'ACE':float(ACE)}

        dic[y].append(d)

        # Download the typhoon's JSON document and store a local copy.
        num = str(columns[1].text)
        json_url = 'http://agora.ex.nii.ac.jp/digital-typhoon/geojson/wnp/'+num+'.ja.json'
        content = json.loads(_download(json_url).decode('utf8'))
        # with-block guarantees the file is flushed and closed even if
        # json.dump raises part-way through.
        with open("typhoon/"+str(y)+"/"+num+".json", "w") as f:
            json.dump(content, f)

    # Only publish a year once all of its typhoons were scraped successfully.
    dic_array.update(dic)

HBox(children=(IntProgress(value=0, max=43), HTML(value='')))




In [50]:
# Wrap the per-year dict in a 1-element object array so np.save can store it
# (object arrays are pickled; reading it back needs
# np.load(..., allow_pickle=True)).
# The isinstance guard keeps the cell idempotent: without it a second run
# would wrap the array again and break dic_array[0][<year>] lookups.
if isinstance(dic_array, dict):
    dic_array = np.array([dic_array])
# Make sure the output folder exists before writing.
os.makedirs('typhoon', exist_ok=True)
np.save('typhoon/dataset.npy',dic_array)

In [10]:
# Display the list of typhoon records stored under the '1981' key.
# Assumes dic_array is the 1-element array built with np.array([dic_array]),
# so element 0 is the year -> records dict.
dic_array[0]['1981']

[{'birth': datetime.datetime(1981, 3, 12, 12, 0),
  'death': datetime.datetime(1981, 3, 18, 6, 0),
  'duration': datetime.timedelta(5, 64800),
  'pressure': 945.0,
  'wind_velocity': 90.0,
  'area1': 230.0,
  'area2': 460.0,
  'moving': 3564.0,
  'mean_velocity': 25.8,
  'ACE': 105800.0},
 {'birth': datetime.datetime(1981, 4, 15, 18, 0),
  'death': datetime.datetime(1981, 4, 20, 12, 0),
  'duration': datetime.timedelta(4, 64800),
  'pressure': 980.0,
  'wind_velocity': 60.0,
  'area1': 90.0,
  'area2': 370.0,
  'moving': 1716.0,
  'mean_velocity': 15.1,
  'ACE': 43975.0},
 {'birth': datetime.datetime(1981, 4, 30, 12, 0),
  'death': datetime.datetime(1981, 5, 6, 6, 0),
  'duration': datetime.timedelta(5, 64800),
  'pressure': 996.0,
  'wind_velocity': 45.0,
  'area1': 0.0,
  'area2': 170.0,
  'moving': 1408.0,
  'mean_velocity': 10.2,
  'ACE': 37800.0},
 {'birth': datetime.datetime(1981, 6, 9, 0, 0),
  'death': datetime.datetime(1981, 6, 14, 18, 0),
  'duration': datetime.timedelta(5, 6