# GHCN Analysis

In [3]:
import os
from glob import glob
from copy import copy 

import dask
import dask.bag as db
import dask.dataframe as dd

import pandas as pd

from distributed import Client
from dask_jobqueue import SLURMCluster

from IPython.display import display
import matplotlib.pyplot as plt

import warnings 
# Install a blanket 'ignore' filter: no Python warnings are shown after this line.
warnings.filterwarnings('ignore')

from dask import config as cfg

# Set the scheduler's worker time-to-live option to None.
# NOTE(review): presumably this keeps the scheduler from discarding workers that
# stay silent during long tasks -- confirm against the dask.distributed
# configuration reference.
cfg.set({'distributed.scheduler.worker-ttl': None})

<dask.config.set at 0x14f8ba812280>

In [4]:
# Execution-mode switch:
#   LOCAL = True  -> single-machine dask client (use while developing)
#   LOCAL = False -> dask workers launched through SLURM
LOCAL = True

if not LOCAL:
    # Interpreter the SLURM workers should run, selected by the installed dask version
    worker_python = '/scratch/work/public/dask/{}/bin/python'.format(dask.__version__)

    # Extra job options; the logs end up under /scratch/{your-netid}
    slurm_job_options = '--export=NONE --output=/scratch/{}/slurm-%j.out'.format(os.environ['SLURM_JOB_USER'])

    # Memory and core limits should be sufficient here
    cluster = SLURMCluster(
        memory='32GB',
        cores=16,
        python=worker_python,
        job_extra=[slurm_job_options],
    )

    cluster.submit_command = 'slurm'
    cluster.scale(50)

    display(cluster)
    client = Client(cluster)
else:
    # Single-machine dask client with default settings
    client = Client()

display(client)

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 4.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:45135,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 4.00 GiB

0,1
Comm: tcp://127.0.0.1:42691,Total threads: 2
Dashboard: http://127.0.0.1:40267/status,Memory: 1.00 GiB
Nanny: tcp://127.0.0.1:43443,
Local directory: /home/jes9857/final-project-group-69/dask-worker-space/worker-28_x14kt,Local directory: /home/jes9857/final-project-group-69/dask-worker-space/worker-28_x14kt

0,1
Comm: tcp://127.0.0.1:36613,Total threads: 2
Dashboard: http://127.0.0.1:40101/status,Memory: 1.00 GiB
Nanny: tcp://127.0.0.1:46539,
Local directory: /home/jes9857/final-project-group-69/dask-worker-space/worker-mmwwbyhm,Local directory: /home/jes9857/final-project-group-69/dask-worker-space/worker-mmwwbyhm

0,1
Comm: tcp://127.0.0.1:32985,Total threads: 2
Dashboard: http://127.0.0.1:45597/status,Memory: 1.00 GiB
Nanny: tcp://127.0.0.1:36547,
Local directory: /home/jes9857/final-project-group-69/dask-worker-space/worker-n2puyxee,Local directory: /home/jes9857/final-project-group-69/dask-worker-space/worker-n2puyxee

0,1
Comm: tcp://127.0.0.1:41365,Total threads: 2
Dashboard: http://127.0.0.1:42623/status,Memory: 1.00 GiB
Nanny: tcp://127.0.0.1:37243,
Local directory: /home/jes9857/final-project-group-69/dask-worker-space/worker-gqeja5fv,Local directory: /home/jes9857/final-project-group-69/dask-worker-space/worker-gqeja5fv


In [43]:
# Shell escape: long listing (human-readable sizes) of the listenbrainz course directory.
!ls -lh /scratch/work/courses/DSGA1004-2021/listenbrainz/

total 15G
-rw-r--r--. 1 2502497 users 1.9G Apr 11 22:40 interactions_test.parquet
-rw-r--r--. 1 2502497 users 6.8G Apr 11 22:41 interactions_train.parquet
-rw-r--r--. 1 2502497 users 1.9G Apr 11 22:41 interactions_train_small.parquet
-rw-r--r--. 1 2502497 users 767M Apr 11 22:41 tracks_test.parquet
-rw-r--r--. 1 2502497 users 2.1G Apr 11 22:41 tracks_train.parquet
-rw-r--r--. 1 2502497 users 1.5G Apr 11 22:41 tracks_train_small.parquet
-rw-r--r--. 1 2502497 users 128K Apr 11 22:41 users_test.parquet
-rw-r--r--. 1 2502497 users 141K Apr 11 22:41 users_train.parquet
-rw-r--r--. 1 2502497 users 124K Apr 11 22:41 users_train_small.parquet


In [41]:
# Open the small tracks parquet file as a dask dataframe using the pyarrow engine.
tracks_small_path = "/scratch/work/courses/DSGA1004-2021/listenbrainz/tracks_train_small.parquet"
train_small = dd.read_parquet(tracks_small_path, engine='pyarrow')

In [42]:
# NOTE(review): in the recorded run this call failed with KilledWorker while a
# read-parquet task was executing; the cell does not complete as saved.
train_small.tail()



KilledWorker: ("('read-parquet-0e6bde09597561b65850ed2fbe0b1358', 0)", <WorkerState 'tcp://127.0.0.1:39275', name: 0, status: closed, memory: 0, processing: 1>)

In [11]:
def calculate_max_difference(
    files: list, 
    initial_partitions: int = int(1e6),
    num_workers: int = None, 
) -> pd.DataFrame:
    """Compute, per station, the largest single-day temperature range.

    Each path in ``files`` is parsed with ``load_daily`` (defined outside this
    cell; it is not visible here). The records it yields are filtered,
    reduced to one (TMAX, TMIN) pair per station-day, and finally reduced to
    the maximum ``TMAX - TMIN`` per station.

    Assumes ``load_daily(path)`` returns an iterable of dict records with the
    keys ``station_id``, ``year``, ``month``, ``day``, ``element``, ``value``
    and ``quality`` -- these are the keys read below; verify against
    ``load_daily``.

    Args:
        files: Paths of the daily files to process.
        initial_partitions: ``npartitions`` passed to ``db.from_sequence``.
        num_workers: Forwarded to ``Bag.compute``; ``None`` leaves the choice
            to dask.

    Returns:
        A DataFrame with the columns ``station_id`` and ``t_range``, one row
        per station. When no record survives filtering, an empty DataFrame
        with those same two columns is returned.
    """
    result_columns = ['station_id', 't_range']

    def filter_values(x):
        """Keep TMAX/TMIN records whose value is not -9999 and whose quality is ' '."""
        # NOTE(review): -9999 is presumably the missing-value sentinel and a
        # blank quality flag presumably means "passed checks" -- confirm.
        return (x['element'] in ['TMAX', 'TMIN'] 
                and x['value'] != -9999 
                and x['quality'] == ' ')

    def set_default(x):
        """Turn one record into a foldable entry keyed by station and date.

        Both TMAX and TMIN start out as the record's own value, whatever its
        element is, so folding with max/min yields the extremes over every
        record of that station-day.
        """
        return {
            'TMAX': x['value'],
            'TMIN': x['value'],
            'id': "_".join(
                [x['station_id'], str(x['year']), str(x['month']), str(x['day'])]
            ),
        }

    def combine_entries(x, y):
        """Merge two entries of the same station-day, keeping the extremes."""
        # Work on a copy so the accumulator handed in by foldby is not mutated.
        merged = copy(x)
        merged['TMAX'] = max(x['TMAX'], y['TMAX'])
        merged['TMIN'] = min(x['TMIN'], y['TMIN'])
        return merged

    def get_station(x):
        """Replace the station-day ``id`` with the bare ``station_id``."""
        # The id was built as "<station>_<year>_<month>_<day>", so the first
        # "_"-separated field is the station. A fresh dict is returned instead
        # of editing the input in place.
        return {
            'TMAX': x['TMAX'],
            'TMIN': x['TMIN'],
            'station_id': x['id'].split("_")[0],
        }

    daily_extremes = (db.from_sequence(files, npartitions=initial_partitions).map(load_daily)
            .flatten()
            .filter(filter_values)
            .map(set_default)  
            .foldby(lambda x: x['id'], combine_entries, combine=combine_entries)
            # foldby yields (key, value) pairs; only the folded value is needed
            .map(lambda x: x[1])
            .map(get_station))

    # The computation is explicitly requested on the 'processes' scheduler.
    daily = pd.DataFrame(daily_extremes.compute(scheduler='processes', num_workers=num_workers))

    if daily.empty:
        # Keep the schema stable so callers can concat/groupby without special cases.
        return pd.DataFrame(columns=result_columns)

    daily['t_range'] = daily['TMAX'] - daily['TMIN']

    # Largest daily range per station, as a flat two-column frame.
    return daily.groupby('station_id')['t_range'].max().reset_index()

# Tiny Data Results 

In [12]:
%%time 
# Collect the tiny dataset's .dly files in sorted order.
files = sorted(glob('/scratch/work/courses/DSGA1004-2021/ghcnd_tiny/*.dly'))
# All files are handed to calculate_max_difference in a single call.
sm_df = calculate_max_difference(files)
# Save the returned frame to parquet, then show it as the cell output.
sm_df.to_parquet('tdiff-tiny.parquet')
sm_df

CPU times: user 472 ms, sys: 302 ms, total: 774 ms
Wall time: 2.02 s


Unnamed: 0,station_id,t_range
0,ASN00063226,274
1,CA002402688,272
2,CA004012040,339
3,MXN00018074,222
4,MXN00026133,311
5,MXN00031086,350
6,RMC00914399,122
7,USC00046773,317
8,USC00180565,239
9,USC00295691,372


# Small Data Results

In [7]:
%%time 
start = 0
files = sorted(glob('/scratch/work/courses/DSGA1004-2021/ghcnd_small/*.dly'))
meta_chunk = 200
iters = len(files) // meta_chunk + 1
tmp_dir = './tmp_small'

for i in range(start, iters):
    print("meta chunk", i)
    df = calculate_max_difference(files[i * meta_chunk: (i + 1) * meta_chunk])
    
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)
    df.to_csv(f'{tmp_dir}/data_{i}.csv')
    
small_df = pd.concat(
    [pd.read_csv(f'{tmp_dir}/{f}') for f in os.listdir(tmp_dir) if f.endswith('csv')], 
    ignore_index=True
)

# Handle duplicates
small_df = small_df.groupby('station_id').max().reset_index()
small_df = small_df[['station_id', 't_range']]
small_df.to_parquet('tdiff-small.parquet')
small_df

meta chunk 0
meta chunk 1
meta chunk 2
meta chunk 3
meta chunk 4
meta chunk 5
CPU times: user 18.7 s, sys: 5.64 s, total: 24.4 s
Wall time: 1min 3s


Unnamed: 0,station_id,t_range
0,AGM00060490,272.0
1,AJ000037639,233.0
2,AM000037686,285.0
3,ARM00087909,277.0
4,ASN00005094,180.0
...,...,...
343,USW00053167,279.0
344,USW00053169,218.0
345,USW00064776,278.0
346,USW00094724,299.0


# All Data Results

In [5]:
%%time 
start = 477
files = sorted(glob('/scratch/work/courses/DSGA1004-2021/ghcnd_all/*.dly'))
meta_chunk = 100
iters = len(files) // meta_chunk + 1
tmp_dir = './tmp_all'

for i in range(start, iters):
    print("meta chunk", i)
    df = calculate_max_difference(files[i * meta_chunk: (i + 1) * meta_chunk])
    
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)
    df.to_csv(f'{tmp_dir}/data_{i}.csv')
    
all_df = pd.concat(
    [pd.read_csv(f'{tmp_dir}/{f}') for f in os.listdir(tmp_dir) if f.endswith('csv')], 
    ignore_index=True
)
all_df = all_df.groupby('station_id').max().reset_index()
all_df = all_df[['station_id', 't_range']]
all_df.to_parquet('tdiff-all.parquet')
all_df

meta chunk 477
meta chunk 478
meta chunk 479
meta chunk 480
meta chunk 481
meta chunk 482
meta chunk 483
meta chunk 484
meta chunk 485
meta chunk 486
meta chunk 487
meta chunk 488
meta chunk 489
meta chunk 490
meta chunk 491
meta chunk 492
meta chunk 493
meta chunk 494
meta chunk 495
meta chunk 496
meta chunk 497
meta chunk 498
meta chunk 499
meta chunk 500
meta chunk 501
meta chunk 502
meta chunk 503
meta chunk 504
meta chunk 505
meta chunk 506
meta chunk 507
meta chunk 508
meta chunk 509
meta chunk 510
meta chunk 511
meta chunk 512
meta chunk 513
meta chunk 514
meta chunk 515
meta chunk 516
meta chunk 517
meta chunk 518
meta chunk 519
meta chunk 520
meta chunk 521
meta chunk 522
meta chunk 523
meta chunk 524
meta chunk 525
meta chunk 526
meta chunk 527
meta chunk 528
meta chunk 529
meta chunk 530
meta chunk 531
meta chunk 532
meta chunk 533
meta chunk 534
meta chunk 535
meta chunk 536
meta chunk 537
meta chunk 538
meta chunk 539
meta chunk 540
meta chunk 541
meta chunk 542
meta chunk

meta chunk 1022
meta chunk 1023
meta chunk 1024
meta chunk 1025
meta chunk 1026
meta chunk 1027
meta chunk 1028
meta chunk 1029
meta chunk 1030
meta chunk 1031
meta chunk 1032
meta chunk 1033
meta chunk 1034
meta chunk 1035
meta chunk 1036
meta chunk 1037
meta chunk 1038
meta chunk 1039
meta chunk 1040
meta chunk 1041
meta chunk 1042
meta chunk 1043
meta chunk 1044
meta chunk 1045
meta chunk 1046
meta chunk 1047
meta chunk 1048
meta chunk 1049
meta chunk 1050
meta chunk 1051
meta chunk 1052
meta chunk 1053
meta chunk 1054
meta chunk 1055
meta chunk 1056
meta chunk 1057
meta chunk 1058
meta chunk 1059
meta chunk 1060
meta chunk 1061
meta chunk 1062
meta chunk 1063
meta chunk 1064
meta chunk 1065
meta chunk 1066
meta chunk 1067
meta chunk 1068
meta chunk 1069
meta chunk 1070
meta chunk 1071
meta chunk 1072
meta chunk 1073
meta chunk 1074
meta chunk 1075
meta chunk 1076
meta chunk 1077
meta chunk 1078
meta chunk 1079
meta chunk 1080
meta chunk 1081
meta chunk 1082
meta chunk 1083
meta chu

Unnamed: 0,station_id,t_range
0,ACW00011604,106.0
1,ACW00011647,33.0
2,AE000041196,257.0
3,AEM00041194,244.0
4,AEM00041217,277.0
...,...,...
40288,ZI000067969,299.0
40289,ZI000067975,294.0
40290,ZI000067977,279.0
40291,ZI000067983,254.0
