In [1]:
import joblib
import sklearn
import dask.dataframe as dd
import numpy as np
import pandas as pd
import os
import warnings
import matplotlib.pyplot as plt
import time

In [4]:
# Columns loaded from each input CSV (passed to pd.read_csv as `usecols`);
# these are exactly the fields the SH computation reads.
columns = [
    'SP',
    'RH',
    'T_2M',
]

In [27]:
# Settings
data_parts = ['TRAIN', 'VALIDATION', 'TEST']
batch_size = 10_000_000
# File-name prefix of the dataset to process. Other cells in this notebook read
# CLUSTER2_* and CLUSTER3_* outputs — presumably produced by re-running this
# cell with a different prefix (TODO confirm).
cluster = 'CLUSTER1'


def _specific_humidity(chunk):
    """Compute the 'SH' values for one chunk from its 'SP', 'RH' and 'T_2M' columns.

    Steps, exactly as in the formula below:
      1. e_s = 6.112 * exp(17.67 * t / (t + 243.5)) with t = T_2M - 273.15
      2. e   = RH / 100 * e_s
      3. SH  = 0.622 * e / (SP / 100 - (1 - 0.622) * e)

    NOTE(review): the 273.15 offset and the /100 scalings suggest T_2M in
    Kelvin, RH in percent and SP in Pa, and the constants 6.112 / 17.67 / 243.5
    match the commonly used Bolton (1980) saturation-vapour-pressure
    approximation — confirm against the data source.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame containing at least the columns 'SP', 'RH' and 'T_2M'.

    Returns
    -------
    pandas.Series
        One SH value per row, aligned with ``chunk``'s index.
    """
    # Computed once and reused in numerator and denominator.
    t_offset = chunk['T_2M'] - 273.15
    e_s = 6.112 * np.exp((17.67 * t_offset) / (t_offset + 243.5))
    e = chunk['RH'] / 100 * e_s
    return 0.622 * e / (chunk['SP'] / 100 - (1 - 0.622) * e)


# Process all datasets in chunks
for part in data_parts:
    cities_file = f'data/{cluster}_{part}_cleaned_cities.csv'
    output_file = f'data/{cluster}_{part}_SH.csv'

    print(f"\nStarting processing for {part} dataset...")

    # Stream the input so only `batch_size` rows are held in memory at a time.
    cities_iter = pd.read_csv(cities_file, usecols=columns, chunksize=batch_size)

    chunk_counter = 0  # stays 0 if the reader yields no chunks
    start_time = time.time()
    for chunk_counter, cities_chunk in enumerate(cities_iter, start=1):
        cities_chunk['SH'] = _specific_humidity(cities_chunk)

        # The first chunk truncates the output file and writes the header;
        # later chunks append. Opening with 'a' unconditionally would stack a
        # second header and duplicate rows onto an existing file whenever this
        # cell is re-run.
        is_first_chunk = chunk_counter == 1
        cities_chunk[['SH']].to_csv(
            output_file,
            mode='w' if is_first_chunk else 'a',
            index=False,
            header=is_first_chunk,
        )

        # Calculate time taken for this chunk (read + compute + write)
        chunk_time = time.time() - start_time
        print(f"Processed {part} chunk {chunk_counter} in {chunk_time:.2f} seconds.")
        start_time = time.time()



Starting processing for TRAIN dataset...
Processed TRAIN chunk 1 in 41.76 seconds.
Processed TRAIN chunk 2 in 42.36 seconds.
Processed TRAIN chunk 3 in 49.08 seconds.
Processed TRAIN chunk 4 in 48.39 seconds.
Processed TRAIN chunk 5 in 44.95 seconds.
Processed TRAIN chunk 6 in 42.26 seconds.
Processed TRAIN chunk 7 in 43.10 seconds.
Processed TRAIN chunk 8 in 43.86 seconds.
Processed TRAIN chunk 9 in 42.34 seconds.
Processed TRAIN chunk 10 in 47.15 seconds.
Processed TRAIN chunk 11 in 49.95 seconds.
Processed TRAIN chunk 12 in 49.56 seconds.
Processed TRAIN chunk 13 in 45.05 seconds.
Processed TRAIN chunk 14 in 41.12 seconds.
Processed TRAIN chunk 15 in 38.60 seconds.
Processed TRAIN chunk 16 in 39.50 seconds.
Processed TRAIN chunk 17 in 38.46 seconds.
Processed TRAIN chunk 18 in 38.77 seconds.
Processed TRAIN chunk 19 in 39.36 seconds.
Processed TRAIN chunk 20 in 38.30 seconds.
Processed TRAIN chunk 21 in 23.55 seconds.

Starting processing for VALIDATION dataset...
Processed VALIDAT

In [None]:
# Range check: print the smallest and largest SH value in the file.
new = pd.read_csv('data/CLUSTER2_TEST_SH.csv')
# Series.min()/.max() are vectorised and skip NaN; the builtin min()/max()
# loop over every row in Python and give order-dependent results if NaN occurs.
print(new['SH'].min(), new['SH'].max())

0.0007761358150904 0.0205094451356466


In [20]:
# Range check: print the smallest and largest SH value in the file.
new = pd.read_csv('data/CLUSTER2_TRAIN_SH.csv')
# Series.min()/.max() are vectorised and skip NaN; the builtin min()/max()
# loop over every row in Python and give order-dependent results if NaN occurs.
print(new['SH'].min(), new['SH'].max())

0.0007270362004324 0.0211888063360212


In [21]:
# Range check: print the smallest and largest SH value in the file.
new = pd.read_csv('data/CLUSTER2_VALIDATION_SH.csv')
# Series.min()/.max() are vectorised and skip NaN; the builtin min()/max()
# loop over every row in Python and give order-dependent results if NaN occurs.
print(new['SH'].min(), new['SH'].max())

0.0011889979292697 0.0188543535228438


In [23]:
# Range check: print the smallest and largest SH value in the file.
new = pd.read_csv('data/CLUSTER3_TEST_SH.csv')
# Series.min()/.max() are vectorised and skip NaN; the builtin min()/max()
# loop over every row in Python and give order-dependent results if NaN occurs.
print(new['SH'].min(), new['SH'].max())

0.0001609622531912 0.0178455094461258


In [24]:
# Range check: print the smallest and largest SH value in the file.
new = pd.read_csv('data/CLUSTER3_TRAIN_SH.csv')
# Series.min()/.max() are vectorised and skip NaN; the builtin min()/max()
# loop over every row in Python and give order-dependent results if NaN occurs.
print(new['SH'].min(), new['SH'].max())

0.0001923270033781 0.0187589331509791


In [26]:
# Range check: print the smallest and largest SH value in the file.
new = pd.read_csv('data/CLUSTER3_VALIDATION_SH.csv')
# Series.min()/.max() are vectorised and skip NaN; the builtin min()/max()
# loop over every row in Python and give order-dependent results if NaN occurs.
print(new['SH'].min(), new['SH'].max())

0.000590144063 0.0161868354128747


In [28]:
# Range check: print the smallest and largest SH value in the file.
new = pd.read_csv('data/CLUSTER1_TEST_SH.csv')
# Series.min()/.max() are vectorised and skip NaN; the builtin min()/max()
# loop over every row in Python and give order-dependent results if NaN occurs.
print(new['SH'].min(), new['SH'].max())

0.0005002264751913 0.0198065411978871


In [29]:
# Range check: print the smallest and largest SH value in the file.
new = pd.read_csv('data/CLUSTER1_TRAIN_SH.csv')
# Series.min()/.max() are vectorised and skip NaN; the builtin min()/max()
# loop over every row in Python and give order-dependent results if NaN occurs.
print(new['SH'].min(), new['SH'].max())

0.0002805140023298 0.0232567166249862


In [30]:
# Range check: print the smallest and largest SH value in the file.
new = pd.read_csv('data/CLUSTER1_VALIDATION_SH.csv')
# Series.min()/.max() are vectorised and skip NaN; the builtin min()/max()
# loop over every row in Python and give order-dependent results if NaN occurs.
print(new['SH'].min(), new['SH'].max())

0.0011266362097941 0.0187219053889624
