In [1]:
import os
import pandas as pd
import pickle
from datetime import datetime
from tqdm.notebook import tqdm
from functools import lru_cache
import seaborn as sns

In [2]:
# Dataset directory
abs_dir = os.getcwd()
dataset_dir = abs_dir + '\\data'
dir_list = os.listdir(dataset_dir) # ['0', '1', '2', '3', '4', '5', '6', '7', '8']
dir_dict = {}
for l in dir_list: dir_dict[l] = os.listdir(f'{dataset_dir}/{l}')
labels=['class', 'P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP', 'P-JUS-CKGL', 'T-JUS-CKGL', 'QGL','source','filename']

In [3]:
@lru_cache
def source_filter(s:str):
    if s.startswith('WELL'): return 'WELL'
    elif s.startswith('SIM'): return 'SIM'
    elif s.startswith('DRAWN'): return 'DRAWN'

In [12]:
lst_all = []
for key in dir_dict.keys():
    lst = [pd.DataFrame(columns=labels)]
    for l in tqdm(dir_dict[key],desc='Class '+key):
        df_ = pd.read_csv(f"{dataset_dir}/{key}/{l}", engine="pyarrow")
        df_['source'] = source_filter(l)
        df_['filename'] = l
        lst.append(df_)
    df = pd.concat(lst, axis=0)
    lst_all.append(df)
    # cache
    with open(os.path.join('./cached',f'df_{key}.pkl'), 'wb') as f:
        pickle.dump(df, f)

print('\nNow caching all of the dataset...\n')
df_all = pd.concat(lst_all, axis=0)
with open(os.path.join('./cached',f'df_all.pkl'), 'wb') as f:
    pickle.dump(df_all, f)
    print('Complete')

Class 0:   0%|          | 0/597 [00:00<?, ?it/s]

Class 1:   0%|          | 0/129 [00:00<?, ?it/s]

Class 2:   0%|          | 0/38 [00:00<?, ?it/s]

Class 3:   0%|          | 0/106 [00:00<?, ?it/s]

Class 4:   0%|          | 0/344 [00:00<?, ?it/s]

Class 5:   0%|          | 0/451 [00:00<?, ?it/s]

Class 6:   0%|          | 0/221 [00:00<?, ?it/s]

Class 7:   0%|          | 0/14 [00:00<?, ?it/s]

Class 8:   0%|          | 0/84 [00:00<?, ?it/s]


Now caching all of the dataset...

Complete


In [8]:
with open(os.path.join('./cached',f'df_all.pkl'), 'rb') as f:
    df = pickle.load(f)

In [14]:
df_all.describe()

Unnamed: 0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,QGL
count,50907310.0,50907030.0,45102240.0,49791590.0,49210430.0,10098580.0,10782350.0
mean,-4.9143729999999997e+39,15759770.0,104.2409,3584212.0,75.59043,19177710.0,0.1681906
std,7.599591999999999e+40,59720200.0,27.59764,3353031.0,21.47815,159300000.0,0.4574544
min,-1.180116e+42,0.0,0.0,-8317.492,-2.02,-497671.7,0.0
25%,11561450.0,10948880.0,96.95686,1186354.0,67.02149,2313529.0,0.0
50%,21892680.0,14521920.0,116.7489,1956448.0,77.35565,2332504.0,0.0
75%,26062050.0,17559360.0,121.7031,5111470.0,84.78074,3462213.0,0.0
max,3101295000.0,2941990000.0,127.7401,13037170.0,173.0961,1831428000.0,4.146513


In [15]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50913215 entries, 0 to 24203
Data columns (total 12 columns):
 #   Column      Dtype         
---  ------      -----         
 0   class       object        
 1   P-PDG       float64       
 2   P-TPT       float64       
 3   T-TPT       float64       
 4   P-MON-CKP   float64       
 5   T-JUS-CKP   float64       
 6   P-JUS-CKGL  float64       
 7   T-JUS-CKGL  object        
 8   QGL         float64       
 9   source      object        
 10  filename    object        
 11  timestamp   datetime64[ns]
dtypes: datetime64[ns](1), float64(7), object(4)
memory usage: 4.9+ GB


In [20]:
df_all.count()

class         50908085
P-PDG         50907310
P-TPT         50907031
T-TPT         45102239
P-MON-CKP     49791587
T-JUS-CKP     49210426
P-JUS-CKGL    10098579
T-JUS-CKGL           0
QGL           10782351
source        50913215
filename      50913215
timestamp     50913215
dtype: int64

T-JUS_CKGL column is the group of None values.

In [22]:
df_new = df_all.drop(columns=['T-JUS-CKGL'], axis=1)
df_new = df_new.dropna(subset=['class'], how='any', axis=0)
df_new['class'] = df_new['class'].astype('int')
df_new.count()

class         50908085
P-PDG         50902180
P-TPT         50901901
T-TPT         45097109
P-MON-CKP     49786974
T-JUS-CKP     49206552
P-JUS-CKGL    10094515
QGL           10778247
source        50908085
filename      50908085
timestamp     50908085
dtype: int64

In [31]:
one_hot_classes = pd.get_dummies(df_new['class'])
one_hot_sources = pd.get_dummies(df_new['source'])
df_cut = df_new.drop(columns=['class', 'filename'], axis=1)
df_cut = pd.concat([df_cut, one_hot_classes, one_hot_sources], axis=0)

MemoryError: Unable to allocate 5.69 GiB for an array with shape (15, 50908085) and data type float64

In [33]:
df_cut = pd.concat([df_cut, one_hot_classes, one_hot_sources], axis=0)

MemoryError: Unable to allocate 7.97 GiB for an array with shape (7, 152724255) and data type float64

In [136]:
df_all['class']

0          0
1          0
2          0
3          0
4          0
        ... 
24199    8.0
24200    8.0
24201    8.0
24202    8.0
24203    8.0
Name: class, Length: 50913215, dtype: object