# Импорт нужных инструментов


In [None]:
import ee
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Authenticate with Earth Engine, then initialise the client library for the
# 'pollutionmlproject' project. Both calls must succeed before any other
# `ee.*` call in this notebook is made.
ee.Authenticate()
ee.Initialize(project='pollutionmlproject')

In [None]:
# Install the `earthengine-api` package (presumably the provider of the `ee`
# module imported in the first cell — run this cell first if that import fails).
!pip install earthengine-api



# Выгрузка данных

In [None]:
# --- CITY PARAMETERS ---
# Area of interest: a point buffered by 25000.
# NOTE(review): presumably (longitude, latitude) order and a radius in metres
# (i.e. 25 km) — confirm against the Earth Engine documentation.
CITY_GEOM = ee.Geometry.Point(73.5, 61.25).buffer(25000)
# Written to the 'city' column of every row and used in the output file name.
CITY_NAME = 'Surgut'

# ==============================================================================
# == SET THE YEAR TO DOWNLOAD ==================================================
# ==============================================================================
YEAR_TO_DOWNLOAD = 2024 # <--- CHANGE ONLY THIS VALUE (2020, 2021, etc.)
# ==============================================================================

# --- S5P filtering parameters (unchanged) ---
# Pixels are kept only where cloud_fraction is below this value.
PIXEL_CLOUD_THRESHOLD = 0.5 # Do not touch this parameter when switching regions
# Pixels are kept only where solar_zenith_angle is below this value
# (presumably degrees — confirm against the dataset description).
SOLAR_ZENITH_ANGLE_THRESHOLD = 80.0

# ==============================================================================
# == BLOCK 2: DATA COLLECTION FUNCTIONS ========================================
# ==============================================================================

def get_s5p_data_for_date(target_date, geometry):
    """Return one day's mean tropospheric NO2 and cloud fraction over an area.

    Every 'COPERNICUS/S5P/OFFL/L3_NO2' image that intersects ``geometry``
    within the 24 hours starting at ``target_date`` is averaged into a single
    image. NO2 is then averaged over ``geometry`` using only the pixels whose
    averaged cloud fraction and solar zenith angle are below
    ``PIXEL_CLOUD_THRESHOLD`` and ``SOLAR_ZENITH_ANGLE_THRESHOLD``. The cloud
    fraction is averaged over ``geometry`` without that mask.

    Args:
        target_date: Start of the day, in any form accepted by ``ee.Date``
            (the notebook passes 'YYYY-MM-DD' strings).
        geometry: Earth Engine geometry of the area of interest.

    Returns:
        A dict with keys 'no2_trop_mean' and 'cloud_frac_mean'; a value is
        None when the corresponding band is absent from the reduction result.
        Returns None when no image exists for the day, or when any exception
        is raised (the error is printed, not propagated).
    """
    try:
        day_start = ee.Date(target_date)
        day_end = day_start.advance(1, 'day')
        collection = (
            ee.ImageCollection('COPERNICUS/S5P/OFFL/L3_NO2')
            .filterBounds(geometry)
            .filterDate(day_start, day_end)
        )

        # Round trip to the server: nothing to average on days without images.
        if collection.size().getInfo() == 0:
            return None

        daily_mean = collection.mean()
        cloud_fraction = daily_mean.select('cloud_fraction')
        solar_zenith = daily_mean.select('solar_zenith_angle')

        # Keep a pixel only when both quality conditions hold.
        keep = cloud_fraction.lt(PIXEL_CLOUD_THRESHOLD).And(
            solar_zenith.lt(SOLAR_ZENITH_ANGLE_THRESHOLD)
        )

        masked_stats = daily_mean.updateMask(keep).reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=geometry,
            scale=1000,
            maxPixels=1e9,
        ).getInfo()

        # Cloud fraction is reported from the unmasked image on purpose, so it
        # describes the whole area rather than only the accepted pixels.
        cloud_stats = cloud_fraction.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=geometry,
            scale=1000,
            maxPixels=1e9,
        ).getInfo()

        return {
            'no2_trop_mean': masked_stats.get('tropospheric_NO2_column_number_density'),
            'cloud_frac_mean': cloud_stats.get('cloud_fraction'),
        }
    except Exception as e:
        # Best effort: one failing day must not abort the whole download.
        print(f"Ошибка S5P на дате {target_date}: {e}")
        return None

def get_era5_data_for_date(target_date, geometry):
    """Return one day's mean ERA5-Land weather values over an area.

    The hourly 'ECMWF/ERA5_LAND/HOURLY' images that intersect ``geometry``
    within the 24 hours starting at ``target_date`` are averaged into a single
    image, which is then averaged over ``geometry``.

    Args:
        target_date: Start of the day, in any form accepted by ``ee.Date``
            (the notebook passes 'YYYY-MM-DD' strings).
        geometry: Earth Engine geometry of the area of interest.

    Returns:
        A dict with keys 'temperature_celsius' (temperature_2m minus 273.15),
        'pressure_hpa' (surface_pressure divided by 100), 'u_wind_10m' and
        'v_wind_10m'. A value is None when the corresponding band is absent
        from the reduction result. Returns None when any exception is raised
        (the error is printed, not propagated).
    """
    try:
        start = ee.Date(target_date)
        end = start.advance(1, 'day')

        # Bands needed from ERA5-Land.
        era5_bands = ['temperature_2m', 'surface_pressure',
                      'u_component_of_wind_10m', 'v_component_of_wind_10m']

        # Average the day's hourly images into one daily image.
        image = (
            ee.ImageCollection('ECMWF/ERA5_LAND/HOURLY')
            .filterBounds(geometry)
            .filterDate(start, end)
            .select(era5_bands)
            .mean()
        )

        # No `if image:` guard here: `image` is a client-side handle that is
        # always truthy, so such a check can never detect missing data.
        # Missing data shows up as absent/None entries in `data` instead.
        data = image.reduceRegion(
            reducer=ee.Reducer.mean(), geometry=geometry, scale=11132, maxPixels=1e9
        ).getInfo()

        temperature_raw = data.get('temperature_2m')
        pressure_raw = data.get('surface_pressure')

        # Compare against None explicitly so that a numeric 0 is converted
        # rather than silently treated as "missing".
        temp_celsius = None if temperature_raw is None else temperature_raw - 273.15
        pressure_hpa = None if pressure_raw is None else pressure_raw / 100

        return {
            'temperature_celsius': temp_celsius,
            'pressure_hpa': pressure_hpa,
            'u_wind_10m': data.get('u_component_of_wind_10m'),
            'v_wind_10m': data.get('v_component_of_wind_10m'),
        }
    except Exception as e:
        # Best effort: one failing day must not abort the whole download.
        print(f"Ошибка ERA5 на дате {target_date}: {e}")
        return None

# ==============================================================================
# == BLOCK 3: MAIN DATA COLLECTION LOOP ========================================
# ==============================================================================

print(f"Начинаю сбор данных для города: {CITY_NAME} за {YEAR_TO_DOWNLOAD} год")

# Collection window: 1 February through 30 September of the selected year,
# one entry per calendar day.
start_of_period = f"{YEAR_TO_DOWNLOAD}-02-01"
end_of_period = f"{YEAR_TO_DOWNLOAD}-09-30"
print(f"--- Период: с {start_of_period} по {end_of_period} ---")
date_range = pd.date_range(start=start_of_period, end=end_of_period, freq='D')

# One row per day. A source that returned nothing (None or an empty dict)
# contributes no keys, so its columns are filled in later as missing values.
all_data = []
for date in tqdm(date_range, desc=f"Год {YEAR_TO_DOWNLOAD}"):
    current_date_str = date.strftime('%Y-%m-%d')
    s5p_data = get_s5p_data_for_date(current_date_str, CITY_GEOM)
    era5_data = get_era5_data_for_date(current_date_str, CITY_GEOM)

    row = {
        'date': date,
        'city': CITY_NAME,
        **(s5p_data or {}),
        **(era5_data or {}),
    }
    all_data.append(row)

# ==============================================================================
# == BLOCK 4: BUILDING AND SAVING THE FINAL FILE ===============================
# ==============================================================================
column_order = [
    'date', 'city', 'no2_trop_mean', 'cloud_frac_mean',
    'temperature_celsius', 'pressure_hpa', 'u_wind_10m', 'v_wind_10m'
]

# Build the table from the collected rows and force a fixed column layout;
# reindexing also creates any listed column that no row provided.
final_df = pd.DataFrame(all_data).reindex(columns=column_order)

# Written to the current working directory, without the index column.
output_filename = f'{CITY_NAME}_data_{YEAR_TO_DOWNLOAD}_Feb-Sep.csv'
final_df.to_csv(output_filename, index=False)

print(f"\nСбор данных за {YEAR_TO_DOWNLOAD} год завершен. Файл: {output_filename}")

Начинаю сбор данных для города: Surgut за 2024 год
--- Период: с 2024-02-01 по 2024-09-30 ---


Год 2024:   0%|          | 0/243 [00:00<?, ?it/s]


Сбор данных за 2024 год завершен. Файл: Surgut_data_2024_Feb-Sep.csv


In [None]:
# Display the last 50 rows of the collected table for a visual sanity check.
final_df.tail(50)

Unnamed: 0,date,city,no2_trop_mean,cloud_frac_mean,temperature_celsius,pressure_hpa,u_wind_10m,v_wind_10m
193,2024-08-12,Surgut,1.854157e-05,0.488792,13.640145,999.175918,1.451037,2.843475
194,2024-08-13,Surgut,2.284526e-06,0.507241,15.946304,1000.017695,-2.783164,3.100897
195,2024-08-14,Surgut,,0.854181,15.500183,992.651401,-2.12122,3.624447
196,2024-08-15,Surgut,,0.759627,15.286912,992.386559,-0.490214,2.589002
197,2024-08-16,Surgut,2.022979e-05,0.34704,15.372191,991.90556,0.440946,1.381832
198,2024-08-17,Surgut,2.23918e-05,0.250228,16.521657,992.681094,1.170762,1.366391
199,2024-08-18,Surgut,1.955721e-05,0.171863,15.919426,994.470266,0.669123,1.342037
200,2024-08-19,Surgut,1.760804e-05,0.277452,16.01631,997.126834,-0.048014,0.071401
201,2024-08-20,Surgut,0.0001163379,0.316674,15.989474,999.341421,0.511697,-0.828627
202,2024-08-21,Surgut,3.6463e-05,0.14654,16.242089,1000.145907,-0.01755,-1.755461


In [None]:
# Count the days for which no NO2 value is available.
final_df['no2_trop_mean'].isna().sum()

np.int64(78)

In [None]:
# Summarise column dtypes and non-null counts of the collected table.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   date                 243 non-null    datetime64[ns]
 1   city                 243 non-null    object        
 2   no2_trop_mean        165 non-null    float64       
 3   cloud_frac_mean      241 non-null    float64       
 4   temperature_celsius  243 non-null    float64       
 5   pressure_hpa         243 non-null    float64       
 6   u_wind_10m           243 non-null    float64       
 7   v_wind_10m           243 non-null    float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 15.3+ KB
