In [1]:
# We have to prepare for this journey ... importing the modules up front is a great idea ... :)
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from bigdl.orca import init_orca_context, OrcaContext
from bigdl.orca.learn.pytorch import Estimator 
from bigdl.orca.learn.metrics import Accuracy

import bigdl.orca.data
import bigdl.orca.data.pandas
from bigdl.orca.data import SharedValue
from bigdl.orca.data import SparkXShards

from bigdl.orca.data.transformer import *

import ray

In [2]:
# Start the Orca context on this machine.
# cluster_mode can be "local", "k8s" or "yarn".
sc = init_orca_context(
    cluster_mode="local",
    cores=4,
    memory="10g",
    num_nodes=1,
)

Initializing orca context
Current pyspark location is : /home/yansu/miniconda3/envs/env/lib/python3.7/site-packages/pyspark/__init__.py
Start to getOrCreate SparkContext
pyspark_submit_args is:  --driver-class-path /home/yansu/Desktop/yxy/BigDL/dist/lib/bigdl-dllib-spark_2.4.6-2.1.0-SNAPSHOT-jar-with-dependencies.jar:/home/yansu/Desktop/yxy/BigDL/dist/lib/bigdl-friesian-spark_2.4.6-2.1.0-SNAPSHOT-jar-with-dependencies.jar:/home/yansu/Desktop/yxy/BigDL/dist/lib/bigdl-orca-spark_2.4.6-2.1.0-SNAPSHOT-jar-with-dependencies.jar pyspark-shell 
Successfully got a SparkContext


### Load data

In [3]:
# Location of the training CSV, given relative to the working directory.
file_path = './train.csv'
# Load the CSV through bigdl.orca.data.pandas.read_csv; later cells operate on data_shard.
data_shard = bigdl.orca.data.pandas.read_csv(file_path)

### Deduplicate the dataframe

In [4]:
# Rebind data_shard to the result of deduplicates()
# (presumably removes duplicated rows -- confirm against the SparkXShards docs).
data_shard = data_shard.deduplicates()

### Label-encode y

In [5]:
def trans_func(df):
    """Rename the ``id`` column to ``id0`` and return the resulting frame.

    The frame passed in is not modified; ``rename`` builds a new one.
    """
    column_mapping = {'id': 'id0'}
    return df.rename(columns=column_mapping)
# Pass trans_func to transform_shard and rebind data_shard to the transformed shards.
data_shard = data_shard.transform_shard(trans_func)

In [6]:
# Encode the 'target' column with StringIndexer, fitting and transforming in one call.
# NOTE(review): the name `scale` is reused for the MinMaxScaler in a later cell.
scale = StringIndexer(inputCol='target')
transformed_data_shard = scale.fit_transform(data_shard)

In [7]:
def trans_func(df):
    """Shift the ``target`` column down by one and return the same frame.

    The column is overwritten on ``df`` itself, so the caller's frame changes.
    """
    shifted = df['target'].sub(1)
    df['target'] = shifted
    return df
# Apply the target shift through transform_shard and rebind transformed_data_shard.
transformed_data_shard = transformed_data_shard.transform_shard(trans_func)

### Split train and test set

In [8]:
# Seed for train_test_split so the split is reproducible.
RANDOM_STATE = 2021


def split_train_test(data):
    """Split ``data`` 80/20 and return ``(train, test)``.

    Uses sklearn's ``train_test_split`` with ``test_size=0.2`` and the
    notebook-level ``RANDOM_STATE``.
    """
    parts = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE)
    return parts[0], parts[1]


# split_train_test returns a pair; .split() turns the result into the two
# shard collections unpacked here.
shards_train, shards_val = (
    transformed_data_shard
    .transform_shard(split_train_test)
    .split()
)

### Transform the feature columns

In [9]:
# Column names feature_0 ... feature_49.
feature_list = ['feature_' + str(idx) for idx in range(50)]

# Fit the scaler on the training shards, then reuse the fitted scaler on the
# validation shards; the output column is "x_scaled".
scale = MinMaxScaler(inputCol=feature_list, outputCol="x_scaled")
shards_train = scale.fit_transform(shards_train)
shards_val = scale.transform(shards_val)

### Change data types

In [10]:
def trans_func(df):
    """Cast the feature and label columns to the dtypes used for training.

    Args:
        df: pandas DataFrame with an ``x_scaled`` column (one array-like per
            row) and a numeric ``target`` column.

    Returns:
        The same ``df`` object, modified in place: each ``x_scaled`` entry is a
        ``float32`` NumPy array and ``target`` has dtype ``int64``.
    """
    df['x_scaled'] = df['x_scaled'].apply(lambda x: np.array(x, dtype=np.float32))
    # np.long was removed in NumPy 1.24, and where the name exists in NumPy 2.x
    # it is the platform C long. An explicit int64 cast is portable and is the
    # class-index dtype nn.CrossEntropyLoss expects.
    df['target'] = df['target'].astype(np.int64)
    return df
# Apply the dtype conversion to both splits; the results get new names
# (shards_train1 / shards_val1), which are what est.fit / est.evaluate receive.
shards_train1 = shards_train.transform_shard(trans_func)
shards_val1 = shards_val.transform_shard(trans_func)

### Model

In [11]:
# Seed PyTorch's default RNG in this process.
torch.manual_seed(0)
BATCH_SIZE = 64  # batch size passed to est.fit
NUM_CLASSES = 4  # number of target classes
NUM_EPOCHS = 100  # NOTE(review): the visible est.fit call passes epochs=1, not this constant
NUM_FEATURES = 50  # input width read by TPS05ClassificationSeq

In [12]:
def linear_block(in_features, out_features, p_drop, *args, **kwargs):
    """Build a Linear -> ReLU -> Dropout stack as one ``nn.Sequential``.

    Args:
        in_features: input width of the linear layer.
        out_features: output width of the linear layer.
        p_drop: dropout probability.
        *args, **kwargs: accepted but not used.
    """
    layers = [
        nn.Linear(in_features, out_features),
        nn.ReLU(),
        nn.Dropout(p=p_drop),
    ]
    return nn.Sequential(*layers)

class TPS05ClassificationSeq(nn.Module):
    """Feed-forward classifier: three Linear/ReLU/Dropout blocks plus a linear head.

    Args:
        num_feature: width of the input's last dimension. Defaults to the
            notebook-level ``NUM_FEATURES``.
        num_class: number of output logits. Defaults to the notebook-level
            ``NUM_CLASSES``.
    """

    def __init__(self, num_feature=None, num_class=None):
        super(TPS05ClassificationSeq, self).__init__()
        # Defaults are resolved at call time so the class definition itself does
        # not depend on the constants already existing.
        if num_feature is None:
            num_feature = NUM_FEATURES
        if num_class is None:
            # Use the shared constant instead of a second hard-coded 4.
            num_class = NUM_CLASSES

        # Hidden stack: num_feature -> 100 -> 250 -> 128, dropout 0.3 after each block.
        self.linear = nn.Sequential(
            linear_block(num_feature, 100, 0.3),
            linear_block(100, 250, 0.3),
            linear_block(250, 128, 0.3),
        )

        # Output head producing one raw logit per class (no softmax here).
        self.out = nn.Sequential(
            nn.Linear(128, num_class)
        )

    def forward(self, x):
        """Run ``x`` through the hidden stack and return the class logits."""
        x = self.linear(x)
        return self.out(x)

In [13]:
def model_creator(config):
    """Return a freshly constructed ``TPS05ClassificationSeq``.

    ``config`` is accepted to match the creator signature but is not read.
    """
    return TPS05ClassificationSeq()

def optim_creator(model, config):
    """Create the Adam optimizer for ``model``.

    Args:
        model: module whose parameters are optimized.
        config: optional dict; its ``"lr"`` entry, when present, sets the
            learning rate. ``None`` or a dict without ``"lr"`` gives 0.001.

    Returns:
        A ``torch.optim.Adam`` instance over ``model.parameters()``.
    """
    learning_rate = (config or {}).get("lr", 0.001)
    return optim.Adam(model.parameters(), lr=learning_rate)

# Loss object handed to the estimator.
criterion = nn.CrossEntropyLoss()

In [14]:
# Build the Orca estimator from the creator functions, running on the Ray backend.
est = Estimator.from_torch(
    model=model_creator,
    optimizer=optim_creator,
    loss=criterion,
    metrics=[Accuracy()],
    backend="ray",
)

2022-08-19 13:31:14,720	INFO services.py:1340 -- View the Ray dashboard at [1m[32mhttp://10.239.44.149:8265[39m[22m


{'node_ip_address': '10.239.44.149', 'raylet_ip_address': '10.239.44.149', 'redis_address': '10.239.44.149:6379', 'object_store_address': '/tmp/ray/session_2022-08-19_13-31-12_406382_1621/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-08-19_13-31-12_406382_1621/sockets/raylet', 'webui_url': '10.239.44.149:8265', 'session_dir': '/tmp/ray/session_2022-08-19_13-31-12_406382_1621', 'metrics_export_port': 44343, 'node_id': 'ae49824ef94e8edfe1c96c79a4323856efbd0b302a7e4e8a6bcddd83'}


[2m[36m(pid=15522)[0m 
[2m[36m(pid=15522)[0m User settings:
[2m[36m(pid=15522)[0m 
[2m[36m(pid=15522)[0m    KMP_AFFINITY=granularity=fine,compact,1,0
[2m[36m(pid=15522)[0m    KMP_BLOCKTIME=0
[2m[36m(pid=15522)[0m    KMP_DUPLICATE_LIB_OK=True
[2m[36m(pid=15522)[0m    KMP_INIT_AT_FORK=FALSE
[2m[36m(pid=15522)[0m    KMP_SETTINGS=1
[2m[36m(pid=15522)[0m    OMP_NUM_THREADS=1
[2m[36m(pid=15522)[0m 
[2m[36m(pid=15522)[0m Effective settings:
[2m[36m(pid=15522)[0m 
[2m[36m(pid=15522)[0m    KMP_ABORT_DELAY=0
[2m[36m(pid=15522)[0m    KMP_ADAPTIVE_LOCK_PROPS='1,1024'
[2m[36m(pid=15522)[0m    KMP_ALIGN_ALLOC=64
[2m[36m(pid=15522)[0m    KMP_ALL_THREADPRIVATE=128
[2m[36m(pid=15522)[0m    KMP_ATOMIC_MODE=2
[2m[36m(pid=15522)[0m    KMP_BLOCKTIME=0
[2m[36m(pid=15522)[0m    KMP_CPUINFO_FILE: value is not defined
[2m[36m(pid=15522)[0m    KMP_DETERMINISTIC_REDUCTION=false
[2m[36m(pid=15522)[0m    KMP_DEVICE_THREAD_LIMIT=2147483647
[2m[36m(pid

In [15]:
# Train for one epoch, validating on the held-out shards; the returned
# per-epoch stats are shown as the cell output.
est.fit(
    data=shards_train1,
    feature_cols=['x_scaled'],
    label_cols=['target'],
    validation_data=shards_val1,
    epochs=1,
    batch_size=BATCH_SIZE,
)

[2m[36m(PytorchRayWorker pid=15522)[0m Data size on worker:  79920
[2m[36m(PytorchRayWorker pid=15522)[0m Data size on worker:  20080


[2m[36m(PytorchRayWorker pid=15522)[0m   return default_collate([torch.as_tensor(b) for b in batch])
[2m[36m(PytorchRayWorker pid=15522)[0m   allow_unreachable=True)  # allow_unreachable flag
[2m[36m(PytorchRayWorker pid=15522)[0m [2022-08-19 13:31:33] INFO     Reducer buckets have been rebuilt in this iteration.
[2m[36m(PytorchRayWorker pid=15522)[0m   def resize(img, size, interpolation=Image.BILINEAR):
[2m[36m(PytorchRayWorker pid=15522)[0m   def perspective(img, perspective_coeffs, interpolation=Image.BICUBIC, fill=None):
[2m[36m(PytorchRayWorker pid=15522)[0m   def resize(img: Tensor, size: List[int], interpolation: int = Image.BILINEAR) -> Tensor:
[2m[36m(PytorchRayWorker pid=15522)[0m   Image.NEAREST: 'PIL.Image.NEAREST',
[2m[36m(PytorchRayWorker pid=15522)[0m   Image.BILINEAR: 'PIL.Image.BILINEAR',
[2m[36m(PytorchRayWorker pid=15522)[0m   Image.BICUBIC: 'PIL.Image.BICUBIC',
[2m[36m(PytorchRayWorker pid=15522)[0m   Image.LANCZOS: 'PIL.Image.LANCZOS'

[{'num_samples': 79920,
  'epoch': 1,
  'batch_count': 1249,
  'train_loss': 1.119587504183566,
  'last_train_loss': 1.1001675128936768,
  'val_accuracy': tensor(0.5718),
  'val_loss': 1.1131852335663905,
  'val_num_samples': 20080}]

In [16]:
# Evaluate on the validation shards with the training batch size. A batch size
# of 1 would push all 20080 samples through the model one at a time, which is
# needlessly slow.
result = est.evaluate(
    data=shards_val1,
    feature_cols=['x_scaled'],
    label_cols=['target'],
    batch_size=BATCH_SIZE,
)

[2m[36m(PytorchRayWorker pid=15522)[0m Data size on worker:  20080


In [17]:
# Print each evaluation metric as "name : value".
for metric_name, metric_value in result.items():
    print(metric_name, ":", metric_value)

num_samples : 20080
Accuracy : tensor(0.5718)
val_loss : 1.1131852300770848
