In [40]:
import urllib.request
import pandas as pd
import os
from datetime import datetime
from io import StringIO

In [None]:
# Make sure the download directory exists. exist_ok=True makes the call
# idempotent and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs('vhi_data', exist_ok=True)

In [46]:
def download(data_dir='vhi_data', province_ids=range(1, 28)):
    """Download the per-province VHI time series that are not on disk yet.

    For every province ID a file named ``vhi_id_<pid>_<timestamp>.csv`` is
    written into ``data_dir``. A province is skipped when the directory
    already holds a file whose name starts with ``vhi_id_<pid>_``.

    Args:
        data_dir: Directory the files are stored in; created if missing.
        province_ids: Iterable of province IDs to request.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed request;
        the partially written file is removed before re-raising.
    """
    os.makedirs(data_dir, exist_ok=True)
    # One directory scan is enough: a file written in this run only ever
    # matches the province that has just been handled.
    existing = os.listdir(data_dir)
    for pid in province_ids:
        prefix = f'vhi_id_{pid}_'
        if any(name.startswith(prefix) for name in existing):
            continue
        url = f'https://www.star.nesdis.noaa.gov/smcd/emb/vci/VH/get_TS_admin.php?country=UKR&provinceID={pid}&year1=1981&year2=2024&type=Mean'
        now = datetime.now().strftime('%Y%m%d%H%M%S')
        target = os.path.join(data_dir, f'{prefix}{now}.csv')
        try:
            urllib.request.urlretrieve(url, target)
        except BaseException:
            # A truncated file would satisfy the prefix check above on the
            # next run and never be re-downloaded, so remove it.
            if os.path.exists(target):
                os.remove(target)
            raise
        print(f'Завантажено область {pid}')

# Fetch the province files; provinces that already have a file in vhi_data are skipped.
download()

In [None]:
### Зчитування завантажених файлів, видалення тегів, рядків з пропусками та значенням -1, зміна індексів областей з англійської нумерації на українську
### та фільтрація даних з 1981 року

In [47]:
# Maps the province number embedded in a downloaded file name to the area ID
# stored in the resulting ``ID`` column. Province numbers missing from this
# mapping (26 and 27) are left out of the cleaned data.
ID_MAP = {1: 22, 2: 24, 3: 23, 4: 25, 5: 3, 6: 4, 7: 8, 8: 19, 9: 20, 10: 21,
          11: 9, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 6, 19: 1,
          20: 2, 21: 7, 22: 5, 23: 10, 24: 11, 25: 12}

def clean_data(path):
    """Read every downloaded VHI file in ``path`` into one cleaned DataFrame.

    For each ``*.csv`` file named ``<x>_<y>_<province>_...``:
      * the first two lines are skipped, as are blank lines and lines that
        start with an HTML tag;
      * ``<tt><pre>`` / ``<br>`` markers and the trailing comma are removed;
      * rows with missing values, with ``VHI == -1`` or with ``Year < 1981``
        are dropped;
      * the province number from the file name is translated through
        ``ID_MAP`` and stored in the ``ID`` column.

    Files whose name carries no parsable province number, whose province is
    not in ``ID_MAP``, or that contain no data lines are skipped.

    Args:
        path: Directory containing the downloaded files.

    Returns:
        DataFrame with columns Year, Week, SMN, SMT, VCI, TCI, VHI, ID and a
        fresh 0..n-1 index; empty (same columns) if nothing could be read.
    """
    columns = ['Year', 'Week', 'SMN', 'SMT', 'VCI', 'TCI', 'VHI']
    frames = []
    # os.listdir order is arbitrary; sorting keeps the row order reproducible.
    for f in sorted(os.listdir(path)):
        if not f.endswith('.csv'):
            continue
        try:
            province_id = int(f.split('_')[2])
        except (IndexError, ValueError):
            # Not a file produced by the downloader - ignore it.
            continue
        area_id = ID_MAP.get(province_id)
        if area_id is None:
            # Skip unmapped provinces explicitly instead of relying on a
            # None ID being removed by dropna() later.
            continue
        with open(os.path.join(path, f), encoding='utf-8', errors='ignore') as file:
            lines = file.readlines()
        lines = [l.replace('<tt><pre>', '').replace('<br>', '').strip().rstrip(',')
                 for l in lines[2:] if l.strip() and not l.strip().startswith('<')]
        if not lines:
            continue
        tmp = pd.read_csv(StringIO('\n'.join(lines)), names=columns)
        # Coerce before dropna() so unparsable years are dropped instead of
        # breaking the integer cast below.
        tmp['Year'] = pd.to_numeric(tmp['Year'], errors='coerce')
        tmp = tmp.dropna()
        tmp = tmp[(tmp['VHI'] != -1) & (tmp['Year'] >= 1981)]
        # assign() returns a new frame, so nothing is written into a slice.
        frames.append(tmp.assign(Year=tmp['Year'].astype(int), ID=area_id))
    if not frames:
        return pd.DataFrame(columns=columns + ['ID'])
    return pd.concat(frames, ignore_index=True)

# Build the combined, cleaned frame from the files in vhi_data and preview its first rows.
df = clean_data('vhi_data')
df.head()

Unnamed: 0,Year,Week,SMN,SMT,VCI,TCI,VHI,ID
0,1982,2,0.063,261.53,55.89,38.2,47.04,21
1,1982,3,0.063,263.45,57.3,32.69,44.99,21
2,1982,4,0.061,265.1,53.96,28.62,41.29,21
3,1982,5,0.058,266.42,46.87,28.57,37.72,21
4,1982,6,0.056,267.47,39.55,30.27,34.91,21


In [None]:
### функція повертає всі тижневі значення для вказаної області та року

In [48]:
def vhi_year(df, area_id, year):
    """Return the rows of ``df`` for a single area and a single year.

    Args:
        df: Frame with at least ``ID`` and ``Year`` columns.
        area_id: Value to match in the ``ID`` column.
        year: Value to match in the ``Year`` column.

    Returns:
        The matching rows with their original index and all columns.
    """
    same_area = df['ID'].eq(area_id)
    same_year = df['Year'].eq(year)
    return df.loc[same_area & same_year]

# Example: all rows for area 1 in the year 2000.
vhi_year(df, area_id=1, year=2000)

Unnamed: 0,Year,Week,SMN,SMT,VCI,TCI,VHI,ID
20570,2000,1,0.048,264.46,15.6,32.12,23.86,1
20571,2000,2,0.053,264.21,23.0,33.13,28.07,1
20572,2000,3,0.061,264.66,31.45,32.84,32.14,1
20573,2000,4,0.069,265.0,37.21,36.04,36.62,1
20574,2000,5,0.08,265.82,40.92,39.39,40.16,1
20575,2000,6,0.092,267.76,42.73,37.61,40.17,1
20576,2000,7,0.103,269.57,44.19,35.69,39.94,1
20577,2000,8,0.115,271.1,47.29,36.62,41.95,1
20578,2000,9,0.132,272.59,53.05,37.85,45.45,1
20579,2000,10,0.146,274.02,56.45,40.64,48.54,1


In [None]:
### функція повертає всі дані для списку областей за вказаний діапазон років

In [49]:
def vhi_range(df, area_ids, year_from, year_to):
    """Return the rows of ``df`` for several areas within a span of years.

    Args:
        df: Frame with at least ``ID`` and ``Year`` columns.
        area_ids: List-like of ``ID`` values to keep.
        year_from: First year to include.
        year_to: Last year to include (both bounds are inclusive).

    Returns:
        The matching rows with their original index and all columns.
    """
    in_areas = df['ID'].isin(area_ids)
    # Series.between is inclusive on both ends by default.
    in_years = df['Year'].between(year_from, year_to)
    return df.loc[in_areas & in_years]

# Example: areas 1, 2 and 3 for the years 2000 through 2010 (both bounds included).
vhi_range(df, area_ids=[1, 2, 3], year_from=2000, year_to=2010)

Unnamed: 0,Year,Week,SMN,SMT,VCI,TCI,VHI,ID
20570,2000,1,0.048,264.46,15.60,32.12,23.86,1
20571,2000,2,0.053,264.21,23.00,33.13,28.07,1
20572,2000,3,0.061,264.66,31.45,32.84,32.14,1
20573,2000,4,0.069,265.00,37.21,36.04,36.62,1
20574,2000,5,0.080,265.82,40.92,39.39,40.16,1
...,...,...,...,...,...,...,...,...
43509,2010,48,0.089,269.49,52.44,30.52,41.48,3
43510,2010,49,0.075,265.82,50.07,37.06,43.57,3
43511,2010,50,0.066,263.86,47.75,36.75,42.25,3
43512,2010,51,0.056,262.12,46.08,37.63,41.85,3


In [None]:
### знаходить мінімальне, максимальне та середнє значення і медіану для вказаної області та діапазону років

In [50]:
def get_extremes(df, area_id, year_from=None, year_to=None):
    """Summarise the VHI values of one area, optionally limited to a year span.

    Args:
        df: Frame with ``ID``, ``Year`` and ``VHI`` columns.
        area_id: Value to match in the ``ID`` column.
        year_from: First year to include; ``None`` means no lower bound.
        year_to: Last year to include; ``None`` means no upper bound.

    Returns:
        Tuple ``(min, max, mean, median)`` of the selected VHI values, with
        mean and median rounded to two decimals. All four are NaN when no
        row matches.
    """
    mask = df['ID'] == area_id
    # Compare against None explicitly: a bound of 0 is falsy and would
    # otherwise be treated as "no bound".
    if year_from is not None:
        mask = mask & (df['Year'] >= year_from)
    if year_to is not None:
        mask = mask & (df['Year'] <= year_to)
    vhi = df.loc[mask, 'VHI']
    return vhi.min(), vhi.max(), round(vhi.mean(), 2), round(vhi.median(), 2)

# Example: minimum, maximum, mean and median VHI for area 1 over 2000-2010, printed one per line.
vhi_min, vhi_max, vhi_mean, vhi_median = get_extremes(df, area_id=1, year_from=2000, year_to=2010)
print(f'Мін: {vhi_min}')
print(f'Макс: {vhi_max}')
print(f'Середнє: {vhi_mean}')
print(f'Медіана: {vhi_median}')

Мін: 20.7
Макс: 77.45
Середнє: 52.99
Медіана: 53.5
