# Collect Weather Statistics

[The World Weather Information Service](https://worldweather.wmo.int/en/dataguide.html) provides an API
with forecast and climatological information for each city. The API path is `https://worldweather.wmo.int/en/json/[City ID]_en.json`, where `[City ID]` can be looked up in [this list](./full_city_list.txt). The JSON schema is described [here](./WWIS_json_schema_v2.json).

## Update Cities of Universities

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = None

# CSV holding one row per university; its `city` column is completed further down
UNIV_PATHNAME = '../univ-with-qs.csv'

df_univ = pd.read_csv(filepath_or_buffer=UNIV_PATHNAME)

# Preview the first five rows
df_univ.head(5)

Unnamed: 0,sequence,name,region,country,city,official-link,established,student-number,faculty-number,campus-size,map-iframe,loc,tot-rank,tot-score,tot-international-students-ratio,tot-international-faculty-ratio,tot-faculty-student ratio,tot-citations-per-faculty,tot-academic-reputation,tot-employer-reputation,cse-rank,cse-overall-score,cse-h-index-citations,cse-citations-per paper,cse-academic-reputation,cse-employer-reputation,hum-rank,hum-overall-score,hum-h-index-citations,hum-citations-per-paper,hum-academic-reputation,hum-employer-reputation
0,1,GE3(Global Engineering Education Exchange),Others,,,https://globale3.studioabroad.com,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2,ISEP(International Student Exchange Programs),Others,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,3,University of Twente,Europe,Netherlands,Enschede,http://www.utwente.nl/,1961.0,12544.0,3150.0,146.0,https://www.google.com/maps/embed?pb=!1m18!1m1...,"Enschede,Netherlands",=189,45.5,89.1,98.8,37.1,79.3,21.0,42.3,151-200,,,,,,151-200,,,,,
3,4,Amsterdam University of Applied Sciences,Europe,Netherlands,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,5,Amsterdam University of Applied Sciences,Europe,Netherlands,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [2]:
# Boolean masks shared by both tallies
city_missing = df_univ['city'].isna()
loc_present = df_univ['loc'].notna()

# Rows without any city
print('missing', int(city_missing.sum()))

# Rows without a city but with a `loc` value the city can be taken from
print('available', int((city_missing & loc_present).sum()))

missing 84
available 36


In [3]:
# Rows whose `city` is empty while `loc` ("City,Country") is filled in
rowIdx = df_univ['city'].isna() & df_univ['loc'].notna()

# Preview: remove the first comma and everything after it, keeping the city part
loc_values = df_univ.loc[rowIdx, 'loc']
loc_values.str.replace(',(.+)$', '', regex=True).head(3)

23    Luxembourg
24        Kaunas
25      Vilnius 
Name: loc, dtype: object

In [4]:
# Take the city from `loc` by cutting at the first comma, then fill the empty cells
city_from_loc = df_univ.loc[rowIdx, 'loc'].str.replace(',(.+)$', '', regex=True)
df_univ.loc[rowIdx, 'city'] = city_from_loc

# Strip surrounding whitespace from every city name (e.g. "Vilnius " -> "Vilnius")
df_univ['city'] = df_univ['city'].str.strip()

# Show the rows that were just filled in
df_univ.loc[rowIdx].head()

Unnamed: 0,sequence,name,region,country,city,official-link,established,student-number,faculty-number,campus-size,map-iframe,loc,tot-rank,tot-score,tot-international-students-ratio,tot-international-faculty-ratio,tot-faculty-student ratio,tot-citations-per-faculty,tot-academic-reputation,tot-employer-reputation,cse-rank,cse-overall-score,cse-h-index-citations,cse-citations-per paper,cse-academic-reputation,cse-employer-reputation,hum-rank,hum-overall-score,hum-h-index-citations,hum-citations-per-paper,hum-academic-reputation,hum-employer-reputation
23,24,University of Luxembourg,Europe,Luxembourg,Luxembourg,,,,,,,"Luxembourg,Luxembourg",,,,,,,,,251-300,,82.7,88.0,,,251-300,,82.7,88.0,,
24,25,Kaunas University of Technology,Europe,Lithuania,Kaunas,,,,,,,"Kaunas,Lithuania",801-1000,-,11.2,1.4,32.8,5.4,11.3,16.3,,,,,,,,,,,,
25,26,Vilnius Gediminas Technical University,Europe,Lithuania,Vilnius,,,,,,,"Vilnius ,Lithuania",,,,,,,,,501-550,,,,,,501-550,,,,,
27,47,Linkoping University,Europe,Sweden,Linköping,,,,,,,"Linköping,Sweden",329,32.5,26.5,53.6,29.8,57.8,20.1,27.9,201-250,,82.2,91.1,,,201-250,,82.2,91.1,,
28,48,University of Lausanne,Europe,Swiss,Lausanne,,,,,,,"Lausanne,Switzerland",176,46.6,61.7,99.8,52.4,63.9,32.8,19.0,,,,,,,,,,,,


In [9]:
# Distinct (country, city) pairs of the universities, with columns renamed so
# they stay distinguishable from the weather list's own Country/City columns
df_ucity = (
    df_univ[['country', 'city']]
    .dropna()
    .drop_duplicates(ignore_index=True)
    .rename(columns={'country': 'univ-country', 'city': 'univ-city'})
)
df_ucity.head()

Unnamed: 0,univ-country,univ-city
0,Netherlands,Enschede
1,Germany,Konstanz
2,Luxembourg,Luxembourg
3,Lithuania,Kaunas
4,Lithuania,Vilnius


## Load City List of Weather

In [10]:
# Semicolon-separated list of the cities known to the weather service
CITY_PATHNAME = './full_city_list.txt'

df_list = pd.read_csv(CITY_PATHNAME, sep=';')

# The final row of the file is an "updated time" footer, not a city
footer_label = len(df_list) - 1
df_list = df_list.drop(index=footer_label)

df_list.tail()

Unnamed: 0,Country,City,CityId
2965,Zimbabwe,Mount Darwin,956.0
2966,Zimbabwe,Mutare,954.0
2967,Zimbabwe,Plumtree,960.0
2968,Zimbabwe,Rusape,959.0
2969,Zimbabwe,Victoria Falls,962.0


In [12]:
# Attach a weather-list entry to each university city by exact city name only;
# the country is not part of the key, so same-named cities elsewhere can match
df_city = df_ucity.merge(df_list, how='left', left_on='univ-city', right_on='City')

# Rows that found a match
df_city[df_city['CityId'].notna()].head()

Unnamed: 0,univ-country,univ-city,Country,City,CityId
2,Luxembourg,Luxembourg,Luxembourg,Luxembourg,192.0
3,Lithuania,Kaunas,Lithuania,Kaunas,607.0
4,Lithuania,Vilnius,Lithuania,Vilnius,204.0
8,Spain,Madrid,Spain,Madrid,195.0
9,UK,Sheffield,Australia,Sheffield,1821.0


### Mismatched data

In [14]:
# Matched rows whose weather-list country is spelled differently from (or is
# a different country than) the university's country
has_city_id = df_city['CityId'].notna()
country_differs = df_city['univ-country'].ne(df_city['Country'])
df_city[country_differs & has_city_id]

Unnamed: 0,univ-country,univ-city,Country,City,CityId
9,UK,Sheffield,Australia,Sheffield,1821.0
10,UK,Leeds,United Kingdom of Great Britain and Northern I...,Leeds,39.0
16,Brunei,Bandar Seri Begawan,Brunei Darussalam,Bandar Seri Begawan,95.0
19,Hong Kong,Hong Kong,"Hong Kong, China",Hong Kong,1.0
23,United States of America,Portland,Australia,Portland,1720.0
27,Canada,Toronto,Australia,Toronto,1739.0


In [24]:
# Manual corrections for rows matched to a same-named city in the wrong country.
# Key: row label in df_city; value: the `City` entry in df_list to use instead.
#    9: UK Sheffield   -> Manchester (nearby)
#   23: US Portland    -> Portland, Oregon
#   27: Canada Toronto -> Toronto, Ontario
city_overrides = {
    9: 'Manchester',
    23: 'Portland, Oregon',
    27: 'Toronto, Ontario',
}

for row_label, city_name in city_overrides.items():
    # First df_list row carrying that exact city name
    replacement = df_list[df_list['City'] == city_name].values[0]
    df_city.loc[row_label, ['Country', 'City', 'CityId']] = replacement

### Missing data

In [29]:
def set_city(city_idx, list_idx):
    """Copy one weather-list row into the Country/City/CityId cells of df_city.

    city_idx -- row label in the global ``df_city`` to overwrite.
    list_idx -- row label in the global ``df_list`` to copy from.

    The values are copied positionally, so ``df_list`` is expected to hold
    exactly the three columns Country, City, CityId in that order.
    """
    target_columns = ['Country', 'City', 'CityId']
    source_row = df_list.loc[list_idx]
    df_city.loc[city_idx, target_columns] = source_row.values

In [25]:
# University cities that still have no weather-list entry
df_city.loc[df_city['CityId'].isna()]

Unnamed: 0,univ-country,univ-city,Country,City,CityId
0,Netherlands,Enschede,,,
1,Germany,Konstanz,,,
5,Sweden,Linköping,,,
6,Swiss,Lausanne,,,
7,Swiss,St. Gallen,,,
11,Italy,Genoa,,,
12,Italy,Milan,,,
20,United States of America,Raleigh,,,
21,United States of America,Tempe,,,
22,United States of America,Blacksburg,,,


In [74]:
# Hand-picked substitutes: [row label in df_city, row label in df_list].
# Note the second number is a df_list row label, not a WMO CityId.
# NOTE(review): rows 24-33 are not in the "missing" table displayed above;
# presumably they replace unsuitable automatic matches - confirm.
mapper = [
    [0, 1963], [1, 2531], [5, 2524], [6, 2529], [7, 2531],
    [11, 1691], [12, 1694], [20, 2832], [21, 2895], [22, 2838],
    [24, 2659], [25, 2722], [26, 2717], [28, 1082], [30, 2044], [33, 140]
]

for city_row, list_row in mapper:
    set_city(city_row, list_row)

## Load Data

In [80]:
# Get ID list
# Drop cities that are still unmapped first: a NaN CityId is not rejected by
# astype(int) but becomes a meaningless integer that would then be requested.
id_list = df_city['CityId'].dropna().unique().astype(int).tolist()
print(id_list)

[145, 312, 192, 607, 204, 1043, 193, 195, 31, 39, 1948, 603, 48, 47, 82, 95, 234, 2237, 1, 813, 848, 817, 810, 268, 757, 753, 264, 634, 1062, 1395, 1498, 320, 1676, 300, 402]


In [126]:
import requests
import time

# Seconds to wait for the server before giving up on a single request
REQUEST_TIMEOUT = 10

# One 'city' record of the API response per entry of id_list, in the same order
output_arr = list()

for city_id in id_list:
    # Request; without a timeout a stalled connection would hang the cell forever
    r = requests.get(f"https://worldweather.wmo.int/en/json/{city_id}_en.json",
                     timeout=REQUEST_TIMEOUT)
    # Stop on an HTTP error instead of trying to parse an error page as JSON
    r.raise_for_status()

    # Parse the body once and reuse it for both storing and logging
    city = r.json()['city']
    output_arr.append(city)

    # Progress log: requested id and the city name the service reports for it
    print(city_id, city['cityName'], end=' ')

    # Suspend 1 sec between requests
    time.sleep(1)

145 Twenthe 312 Zurich 192 Luxembourg 607 Kaunas 204 Vilnius 1043 Norrkoping 193 Geneva 195 Madrid 31 Manchester 39 Leeds 1948 Genova 603 Milan (MILANO) 48 Ankara 47 Istanbul 82 Kuala Lumpur 95 Bandar Seri Begawan 234 Singapore 2237 Bandung 1 Hong Kong 813 Raleigh & Durham, North Carolina 848 Yuma, Arizona 817 Roanoke, Virginia 810 Portland, Oregon 268 Atlanta, Georgia 757 Flagstaff, Arizona 753 Eugene, Oregon 264 Toronto, Ontario 634 Montreal, Quebec 1062 Belo Horizonte 1395 Buraimi 1498 Kuwait City 320 Brisbane 1676 Glenelg 300 Sydney 402 Geelong 

In [105]:
# Persist the downloaded city records as one JSON array

import json

WEATHER_PATHNAME = './city-weather.json'

with open(WEATHER_PATHNAME, mode='w') as weather_file:
    json.dump(output_arr, fp=weather_file)

In [108]:
# Read the file back to confirm it was written completely
with open(WEATHER_PATHNAME, mode='r') as weather_file:
    json_data = json.load(weather_file)

# Exactly one stored record per requested city id
expected_count = len(id_list)
assert len(json_data) == expected_count

## Save Mapper Info

In [122]:
# Identifying columns of each university; CityId is joined onto these next
map_columns = ['sequence', 'name', 'region', 'country', 'city']
df_map = df_univ[map_columns]
df_map.head()

Unnamed: 0,sequence,name,region,country,city
0,1,GE3(Global Engineering Education Exchange),Others,,
1,2,ISEP(International Student Exchange Programs),Others,,
2,3,University of Twente,Europe,Netherlands,Enschede
3,4,Amsterdam University of Applied Sciences,Europe,Netherlands,
4,5,Amsterdam University of Applied Sciences,Europe,Netherlands,


In [123]:
# Look up each university's weather city through its (country, city) pair;
# a left join keeps universities that have no city or no match
df_map = df_map.merge(
    df_city,
    how='left',
    left_on=['country', 'city'],
    right_on=['univ-country', 'univ-city'],
)
df_map.head()

Unnamed: 0,sequence,name,region,country,city,univ-country,univ-city,Country,City,CityId
0,1,GE3(Global Engineering Education Exchange),Others,,,,,,,
1,2,ISEP(International Student Exchange Programs),Others,,,,,,,
2,3,University of Twente,Europe,Netherlands,Enschede,Netherlands,Enschede,Netherlands,Twenthe,145.0
3,4,Amsterdam University of Applied Sciences,Europe,Netherlands,,,,,,
4,5,Amsterdam University of Applied Sciences,Europe,Netherlands,,,,,,


In [125]:
# Output file linking every university to its weather-service CityId
MAPPER_PATHNAME = 'univ-city.csv'

export_columns = ['sequence', 'name', 'region', 'country', 'city', 'CityId']

# CityId is a float column because of missing values; '%.0f' writes it
# without a decimal part
df_map.to_csv(
    MAPPER_PATHNAME,
    columns=export_columns,
    index=False,
    float_format='%.0f',
)