In [1]:
%matplotlib inline
import requests
import json
import time
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn import set_config
set_config(display='diagram') # Để trực quan hóa pipeline


In [2]:
# Hàm thu thập dữ liệu từ một khoảng thời gian cụ thể
def collect_data(api_url):
    weather_data = []
    success = False
    while not success:
        print(f"GET: {api_url}")
        r = requests.get(api_url)
        if r.ok:
            # lấy kết quả json
            pydata = json.loads(r.text)
            weather_data.extend(pydata["days"])  # Lưu dữ liệu từ 'days'
            print("Successful! Remaining cost:", pydata.get('remainingCost', 'Unknown'))
            print()
            success = True
        else:
            print("Fail! Try again. Status code:", r.status_code)
            time.sleep(5)
        
    return weather_data


In [3]:
# Hàm tạo URL API từ các tham số đầu vào
def create_api_url(location, start_date, end_date, api_key, unit_group='metric', include='days'):
    return f'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{location}/{start_date}/{end_date}?unitGroup={unit_group}&include={include}&key={api_key}&contentType=json'

# Hàm thu thập dữ liệu trong khoảng thời gian từ nhiều năm
def collect_all_data(start_date, end_date, location, api_key):
    # Tạo URL API
    api_url = create_api_url(location, start_date, end_date, api_key)
    # Thu thập dữ liệu thời tiết
    weather_data = collect_data(api_url)
    
    return weather_data

In [4]:
# API key (có thể thay đổi)
api_key1 = 'TE7HPVTQ4Q4B7LWSLDSU38BRF'

# Thông số đầu vào (có thể thay đổi)
location = 'hanoi'
start_date = '2022-10-09'
end_date = '2024-10-01'

# Thu thập dữ liệu
all_weather_data = collect_all_data(start_date, end_date, location, api_key1)

# In số lượng dữ liệu thu thập được
print('Number of data:', len(all_weather_data))

GET: https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/hanoi/2022-10-09/2024-10-01?unitGroup=metric&include=days&key=TE7HPVTQ4Q4B7LWSLDSU38BRF&contentType=json
Successful! Remaining cost: Unknown

Number of data: 724


In [5]:
# Lưu dữ liệu vào DataFrame và xuất ra CSV
data_df = pd.DataFrame(all_weather_data)
data_df.rename(lambda name: name[0].upper() + name[1:], axis='columns', inplace=True)
data_df.to_csv('historical_weather_data.csv', index=False)

# Hiển thị 5 dòng đầu tiên của dữ liệu
data_df.head()

Unnamed: 0,Datetime,DatetimeEpoch,Tempmax,Tempmin,Temp,Feelslikemax,Feelslikemin,Feelslike,Dew,Humidity,...,Sunrise,SunriseEpoch,Sunset,SunsetEpoch,Moonphase,Conditions,Description,Icon,Stations,Source
0,2022-10-09,1665248400,27.3,23.0,25.3,29.9,23.0,25.7,20.8,76.8,...,05:49:52,1665269392,17:37:46,1665311866,0.48,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,rain,"[48820099999, 48823099999, 48825099999, 488310...",obs
1,2022-10-10,1665334800,27.0,21.4,23.8,27.1,21.4,23.8,14.8,58.0,...,05:50:11,1665355811,17:36:54,1665398214,0.5,"Rain, Partially cloudy",Partly cloudy throughout the day with late aft...,rain,"[48820099999, 48823099999, 48825099999, 488310...",obs
2,2022-10-11,1665421200,28.7,19.0,23.9,28.2,19.0,23.8,14.7,59.1,...,05:50:31,1665442231,17:36:03,1665484563,0.54,Clear,Clear conditions throughout the day.,clear-day,"[48820099999, 48823099999, 48825099999, 488310...",obs
3,2022-10-12,1665507600,29.5,21.0,24.9,28.8,21.0,24.7,15.1,57.5,...,05:50:50,1665528650,17:35:13,1665570913,0.58,Clear,Clear conditions throughout the day.,clear-day,"[48820099999, 48823099999, 48825099999, 488310...",obs
4,2022-10-13,1665594000,29.7,22.0,25.0,28.6,22.0,24.8,15.2,57.1,...,05:51:10,1665615070,17:34:23,1665657263,0.61,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"[48820099999, 48823099999, 48825099999, 488310...",obs
