In [1]:
# We have to prepare for this journey... importing the modules first is a great idea :)
import os

import numpy as np
import pandas as pd

from sklearn.utils import shuffle

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from bigdl.orca import init_orca_context, OrcaContext
from bigdl.orca.learn.pytorch import Estimator 
from bigdl.orca.learn.metrics import Accuracy

import bigdl.orca.data
import bigdl.orca.data.pandas
from bigdl.orca.data import SharedValue
from bigdl.orca.data import SparkXShards

from bigdl.orca.data.transform import *

import ray

In [2]:
# cluster_mode can be "local", "k8s" or "yarn"
sc = init_orca_context(
    cluster_mode="local",
    cores=4,
    memory="10g",
    num_nodes=1,
)

Initializing orca context
Current pyspark location is : /home/yansu/miniconda3/envs/env/lib/python3.7/site-packages/pyspark/__init__.py
Start to getOrCreate SparkContext
pyspark_submit_args is:  --driver-class-path /home/yansu/Desktop/BigDL/dist/lib/bigdl-dllib-spark_2.4.6-2.1.0-SNAPSHOT-jar-with-dependencies.jar:/home/yansu/Desktop/BigDL/dist/lib/bigdl-friesian-spark_2.4.6-2.1.0-SNAPSHOT-jar-with-dependencies.jar:/home/yansu/Desktop/BigDL/dist/lib/bigdl-orca-spark_2.4.6-2.1.0-SNAPSHOT-jar-with-dependencies.jar pyspark-shell 
Successfully got a SparkContext


### Deduplicate the dataframe

In [3]:
# Load the raw training table and keep only the first row of every group of rows
# whose feature values are identical ('target' is ignored for the comparison).
train = pd.read_csv('./train.csv', index_col = 'id')
train = train[~train.drop('target', axis = 1).duplicated()]
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (needed on a fresh checkout).
os.makedirs('./csv_train', exist_ok=True)
train.to_csv('./csv_train/train_fix.csv')

### Load data

In [4]:
# Directory that the de-duplication cell writes train_fix.csv into; the whole
# directory (not a single file) is handed to the Orca CSV reader.
file_path = './csv_train'
data_shard = bigdl.orca.data.pandas.read_csv(file_path)

### Shuffle data

In [5]:
# Seed shared by every shard so the shuffle is repeatable.
RANDOM_STATE = 2021


def trans_func(df):
    """Return the rows of one shard in a reproducibly shuffled order."""
    shuffled = shuffle(df, random_state=RANDOM_STATE)
    return shuffled


transformed_data_shard = data_shard.transform_shard(trans_func)

### Label-encode y

In [6]:
# Encode the 'target' column into a new 'y_scaled' column.
# NOTE(review): presumably LabelEncode maps each distinct label to an integer id -
# confirm against the BigDL Orca transform documentation.
# The variable is called `scale` although it holds a label encoder; the name is
# reused for the MinMaxScaler in a later cell.
scale = LabelEncode(inputCol='target', outputCol="y_scaled")
transformed_data_shard = scale.fit_transform(transformed_data_shard)

### Split into train and validation sets

In [7]:
# Fraction of every shard that goes to the training split; the remainder is
# used for validation.
TRAIN_FRACTION = 0.8
# Overall number of training rows implied by TRAIN_FRACTION (kept for reference;
# the split itself is done per shard below).
TRAIN_SIZE = int(len(data_shard) * TRAIN_FRACTION)


def split_train_val(df):
    """Cut one shard into a (train, validation) pair at the TRAIN_FRACTION mark."""
    cut = int(len(df) * TRAIN_FRACTION)
    return df[:cut], df[cut:]


# Store the paired shards under their own name instead of overwriting
# `transformed_data_shard`, so re-running this cell does not try to split
# data that has already been split.
split_data_shard = transformed_data_shard.transform_shard(split_train_val)
shards_splits = split_data_shard.split()
shards_train, shards_val = shards_splits[0], shards_splits[1]

### Transform the feature columns

In [8]:
# Column names feature_0 ... feature_49.
feature_list = [f'feature_{idx}' for idx in range(50)]

# Fit the scaler on the training shards only, then apply the fitted scaler
# to the validation shards.
scale = MinMaxScaler(inputCol=feature_list, outputCol="x_scaled")
shards_train = scale.fit_transform(shards_train)
shards_val = scale.transform(shards_val)

### Change data types

In [9]:
def trans_func(df):
    """Turn the model inputs and labels of one shard into torch tensors.

    Every 'x_scaled' entry becomes a float32 tensor and every 'y_scaled' entry
    an int64 tensor. The frame is modified in place and returned.
    """
    def to_feature_tensor(values):
        return torch.tensor(np.array(values), dtype=torch.float32)

    def to_label_tensor(label):
        return torch.tensor(label, dtype=torch.long)

    df['x_scaled'] = df['x_scaled'].apply(to_feature_tensor)
    df['y_scaled'] = df['y_scaled'].apply(to_label_tensor)
    return df


shards_train1 = shards_train.transform_shard(trans_func)
shards_val1 = shards_val.transform_shard(trans_func)

### Model

In [10]:
# Seed torch's global RNG in this (driver) process.
torch.manual_seed(0)

# Training hyper-parameters.
BATCH_SIZE = 64
NUM_EPOCHS = 100  # the visible est.fit(...) call passes epochs=1 instead of this value

# Model dimensions: every column of `train` except 'target' is an input feature.
NUM_FEATURES = train.shape[1] - 1
NUM_CLASSES = 4

In [11]:
def linear_block(in_features, out_features, p_drop, *args, **kwargs):
    """Assemble one Linear -> ReLU -> Dropout stage.

    Args:
        in_features: Input width of the linear layer.
        out_features: Output width of the linear layer.
        p_drop: Dropout probability applied after the activation.
        *args, **kwargs: Accepted for call-site flexibility but not used.

    Returns:
        An ``nn.Sequential`` holding the three layers in order.
    """
    layers = [
        nn.Linear(in_features, out_features),
        # nn.BatchNorm1d(out_features) is currently disabled.
        nn.ReLU(),
        nn.Dropout(p=p_drop),
    ]
    return nn.Sequential(*layers)

class TPS05ClassificationSeq(nn.Module):
    """Feed-forward classifier: three Linear/ReLU/Dropout blocks and a linear head.

    Args:
        num_feature: Width of the input vector. When omitted, the notebook-level
            ``NUM_FEATURES`` constant is used.
        num_class: Number of output logits. When omitted, the notebook-level
            ``NUM_CLASSES`` constant is used.
    """

    def __init__(self, num_feature=None, num_class=None):
        super(TPS05ClassificationSeq, self).__init__()
        # Take the sizes from the shared constants rather than from the `train`
        # DataFrame, so building the model does not depend on the raw data
        # frame being available wherever the constructor runs.
        if num_feature is None:
            num_feature = NUM_FEATURES
        if num_class is None:
            num_class = NUM_CLASSES

        # Hidden stack: num_feature -> 100 -> 250 -> 128, dropout 0.3 after each stage.
        self.linear = nn.Sequential(
            linear_block(num_feature, 100, 0.3),
            linear_block(100, 250, 0.3),
            linear_block(250, 128, 0.3),
        )

        # Output head: a single linear layer, no activation.
        self.out = nn.Sequential(
            nn.Linear(128, num_class)
        )

    def forward(self, x):
        """Run ``x`` through the hidden stack and return the output of the linear head."""
        x = self.linear(x)
        return self.out(x)

In [14]:
def model_creator(config):
    """Build a fresh TPS05ClassificationSeq; ``config`` is accepted but not used."""
    return TPS05ClassificationSeq()


def optim_creator(model, config):
    """Create the Adam optimizer for ``model``.

    The learning rate is taken from ``config["lr"]`` when that key is present
    and falls back to 0.001 otherwise (including when ``config`` is None or empty).
    """
    learning_rate = (config or {}).get("lr", 0.001)
    return optim.Adam(model.parameters(), lr=learning_rate)


# Loss passed to the estimator.
criterion = nn.CrossEntropyLoss()

In [15]:
# Wire the model, optimizer and loss into an Orca estimator on the Ray backend.
est = Estimator.from_torch(
    model=model_creator,
    optimizer=optim_creator,
    loss=criterion,
    metrics=[Accuracy()],
    backend="ray",
)

2022-06-30 17:05:22,072	INFO services.py:1340 -- View the Ray dashboard at [1m[32mhttp://10.239.166.126:8266[39m[22m


{'node_ip_address': '10.239.166.126', 'raylet_ip_address': '10.239.166.126', 'redis_address': '10.239.166.126:32362', 'object_store_address': '/tmp/ray/session_2022-06-30_17-05-19_623040_5167/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-06-30_17-05-19_623040_5167/sockets/raylet', 'webui_url': '10.239.166.126:8266', 'session_dir': '/tmp/ray/session_2022-06-30_17-05-19_623040_5167', 'metrics_export_port': 52740, 'node_id': 'bac8ce28b231b9cf338728270cb013999f32034ee852925c10b6c3d0'}


[2m[36m(pid=5959)[0m 
[2m[36m(pid=5959)[0m User settings:
[2m[36m(pid=5959)[0m 
[2m[36m(pid=5959)[0m    KMP_AFFINITY=granularity=fine,compact,1,0
[2m[36m(pid=5959)[0m    KMP_BLOCKTIME=0
[2m[36m(pid=5959)[0m    KMP_DUPLICATE_LIB_OK=True
[2m[36m(pid=5959)[0m    KMP_INIT_AT_FORK=FALSE
[2m[36m(pid=5959)[0m    KMP_SETTINGS=1
[2m[36m(pid=5959)[0m    OMP_NUM_THREADS=1
[2m[36m(pid=5959)[0m 
[2m[36m(pid=5959)[0m Effective settings:
[2m[36m(pid=5959)[0m 
[2m[36m(pid=5959)[0m    KMP_ABORT_DELAY=0
[2m[36m(pid=5959)[0m    KMP_ADAPTIVE_LOCK_PROPS='1,1024'
[2m[36m(pid=5959)[0m    KMP_ALIGN_ALLOC=64
[2m[36m(pid=5959)[0m    KMP_ALL_THREADPRIVATE=128
[2m[36m(pid=5959)[0m    KMP_ATOMIC_MODE=2
[2m[36m(pid=5959)[0m    KMP_BLOCKTIME=0
[2m[36m(pid=5959)[0m    KMP_CPUINFO_FILE: value is not defined
[2m[36m(pid=5959)[0m    KMP_DETERMINISTIC_REDUCTION=false
[2m[36m(pid=5959)[0m    KMP_DEVICE_THREAD_LIMIT=2147483647
[2m[36m(pid=5959)[0m    KMP_DIS

In [16]:
# Train for one epoch and validate on the held-out shards.
est.fit(
    data=shards_train1,
    feature_cols=['x_scaled'],
    label_cols=['y_scaled'],
    validation_data=shards_val1,
    epochs=1,
    batch_size=BATCH_SIZE,
)

[2m[36m(LocalStore pid=5958)[0m 
[2m[36m(LocalStore pid=5958)[0m User settings:
[2m[36m(LocalStore pid=5958)[0m 
[2m[36m(LocalStore pid=5958)[0m    KMP_AFFINITY=granularity=fine,compact,1,0
[2m[36m(LocalStore pid=5958)[0m    KMP_BLOCKTIME=0
[2m[36m(LocalStore pid=5958)[0m    KMP_DUPLICATE_LIB_OK=True
[2m[36m(LocalStore pid=5958)[0m    KMP_INIT_AT_FORK=FALSE
[2m[36m(LocalStore pid=5958)[0m    KMP_SETTINGS=1
[2m[36m(LocalStore pid=5958)[0m    OMP_NUM_THREADS=1
[2m[36m(LocalStore pid=5958)[0m 
[2m[36m(LocalStore pid=5958)[0m Effective settings:
[2m[36m(LocalStore pid=5958)[0m 
[2m[36m(LocalStore pid=5958)[0m    KMP_ABORT_DELAY=0
[2m[36m(LocalStore pid=5958)[0m    KMP_ADAPTIVE_LOCK_PROPS='1,1024'
[2m[36m(LocalStore pid=5958)[0m    KMP_ALIGN_ALLOC=64
[2m[36m(LocalStore pid=5958)[0m    KMP_ALL_THREADPRIVATE=128
[2m[36m(LocalStore pid=5958)[0m    KMP_ATOMIC_MODE=2
[2m[36m(LocalStore pid=5958)[0m    KMP_BLOCKTIME=0
[2m[36m(LocalStore pid=

[2m[36m(PytorchRayWorker pid=5959)[0m Data size on worker:  79996


[2m[36m(PytorchRayWorker pid=5959)[0m [2022-06-30 17:10:28] INFO     Reducer buckets have been rebuilt in this iteration.


[2m[36m(PytorchRayWorker pid=5959)[0m Data size on worker:  20000


[2m[36m(PytorchRayWorker pid=5959)[0m [2022-06-30 17:10:34] INFO     Finished training epoch 1, stats on rank 0: {'epoch': 1, 'batch_count': 1250, 'num_samples': 79996, 'train_loss': 1.1223435290986714, 'last_train_loss': 1.1325827836990356, 'val_accuracy': tensor(0.5770), 'val_loss': 1.1085240882873535, 'val_num_samples': 20000}


[{'num_samples': 79996,
  'epoch': 1,
  'batch_count': 1250,
  'train_loss': 1.1223435290986714,
  'last_train_loss': 1.1325827836990356,
  'val_accuracy': tensor(0.5770),
  'val_loss': 1.1085240882873535,
  'val_num_samples': 20000}]

In [17]:
# Evaluate with the same batch size as training: batch_size=1 runs one forward
# pass per validation row, which is needlessly slow. The reported metrics are
# expected to match up to floating-point rounding.
result = est.evaluate(
    data=shards_val1,
    feature_cols=['x_scaled'],
    label_cols=['y_scaled'],
    batch_size=BATCH_SIZE,
)

[2m[36m(LocalStore pid=5956)[0m 
[2m[36m(LocalStore pid=5956)[0m User settings:
[2m[36m(LocalStore pid=5956)[0m 
[2m[36m(LocalStore pid=5956)[0m    KMP_AFFINITY=granularity=fine,compact,1,0
[2m[36m(LocalStore pid=5956)[0m    KMP_BLOCKTIME=0
[2m[36m(LocalStore pid=5956)[0m    KMP_DUPLICATE_LIB_OK=True
[2m[36m(LocalStore pid=5956)[0m    KMP_INIT_AT_FORK=FALSE
[2m[36m(LocalStore pid=5956)[0m    KMP_SETTINGS=1
[2m[36m(LocalStore pid=5956)[0m    OMP_NUM_THREADS=1
[2m[36m(LocalStore pid=5956)[0m 
[2m[36m(LocalStore pid=5956)[0m Effective settings:
[2m[36m(LocalStore pid=5956)[0m 
[2m[36m(LocalStore pid=5956)[0m    KMP_ABORT_DELAY=0
[2m[36m(LocalStore pid=5956)[0m    KMP_ADAPTIVE_LOCK_PROPS='1,1024'
[2m[36m(LocalStore pid=5956)[0m    KMP_ALIGN_ALLOC=64
[2m[36m(LocalStore pid=5956)[0m    KMP_ALL_THREADPRIVATE=128
[2m[36m(LocalStore pid=5956)[0m    KMP_ATOMIC_MODE=2
[2m[36m(LocalStore pid=5956)[0m    KMP_BLOCKTIME=0
[2m[36m(LocalStore pid=

[2m[36m(PytorchRayWorker pid=5959)[0m Data size on worker:  20000


In [18]:
# Print each evaluation metric as "name : value".
for metric_name, metric_value in result.items():
    print(metric_name, ":", metric_value)

num_samples : 20000
Accuracy : tensor(0.5770)
val_loss : 1.1085240890711545
