In [1]:
import re, sys, os
import logging
from pathlib import Path
from datetime import datetime
import torch
import torch_directml
from fastai.tabular.all import *
from fastai.data.external import *

# Set the default log level for all PyTorch logging components to WARNING
# so that lower-severity messages do not flood the notebook output.
torch._logging.set_logs(all=logging.WARNING)

In [2]:
# Environment banner: records the torch build and the DirectML device that the
# rest of the notebook runs on.
print("="*80)
print("DIRECTML FULL DEBUG LOG STARTED")
# Read the version from the directly imported `torch` module instead of going
# through `torch_directml.torch`, which only works because torch_directml
# happens to import torch at module level.
print(f"torch version: {torch.__version__}")
print(f"Device: {torch_directml.device()}")
print(f"GPU: {torch_directml.device_name(0)}")
print("="*80)

DIRECTML FULL DEBUG LOG STARTED
torch version: 2.4.1+cpu
Device: privateuseone:0
GPU: Radeon RX 560X 


In [3]:
# 1. Detect DirectML device (no global default!)
# `dml` is the device handle reused by the rest of the notebook: the transform
# patches in this cell, the DataLoaders and the Learner all reference it.
dml = torch_directml.device()
# Visual separator line in the cell output.
print("="*80)

# 2. Patch Normalize to handle device properly
# Re-run safety: if this cell is executed twice in the same kernel,
# `Normalize.setups` is already our wrapper. Rebinding `old_normalize_setups`
# to that wrapper would make it call itself (the name is resolved at call time)
# and recurse forever, so recover the original recorded on the wrapper instead.
_current_normalize_setups = Normalize.setups
old_normalize_setups = getattr(
    _current_normalize_setups, "_dml_original", _current_normalize_setups
)

def new_normalize_setups(self, to):
    """Run the original `Normalize.setups`, then move `mean`/`std` to `dml`.

    Args:
        self: the `Normalize` transform instance being set up.
        to: the object fastai hands to `setups`; forwarded unchanged.

    Returns:
        Whatever the original `setups` returned (previously discarded).

    NOTE(review): fastai's tabular `Normalize.setups` appears to store its
    statistics as `means`/`stds` rather than `mean`/`std`; if so, the moves
    below are no-ops for this notebook — confirm against the installed fastai.
    """
    result = old_normalize_setups(self, to)
    # Move computed mean/std to device after they're calculated
    for stat_name in ("mean", "std"):
        stat = getattr(self, stat_name, None)
        if stat is not None:
            setattr(self, stat_name, stat.to(dml))
    return result

# Remember the unpatched method on the wrapper so a later re-run can find it.
new_normalize_setups._dml_original = old_normalize_setups
Normalize.setups = new_normalize_setups

# 3. Patch FillMissing to work on GPU
# Re-run safety: if this cell is executed twice in the same kernel,
# `FillMissing.setups` is already our wrapper. Rebinding
# `old_fillmissing_setups` to that wrapper would make it call itself (the name
# is resolved at call time) and recurse forever, so recover the original
# recorded on the wrapper instead.
_current_fillmissing_setups = FillMissing.setups
old_fillmissing_setups = getattr(
    _current_fillmissing_setups, "_dml_original", _current_fillmissing_setups
)

def new_fillmissing_setups(self, to):
    """Run the original `FillMissing.setups`, then move tensor fills to `dml`.

    Args:
        self: the `FillMissing` transform instance being set up.
        to: the object fastai hands to `setups`; forwarded unchanged.

    Returns:
        Whatever the original `setups` returned (previously discarded).

    NOTE(review): only `torch.Tensor` values in `na_dict` are moved; if fastai
    stores plain Python/pandas scalars there, this loop changes nothing —
    confirm against the installed fastai.
    """
    result = old_fillmissing_setups(self, to)
    # Move computed medians to device
    na_dict = getattr(self, 'na_dict', None)
    if na_dict:
        # Snapshot the items so the dict is not iterated while being updated.
        for key, value in list(na_dict.items()):
            if isinstance(value, torch.Tensor):
                na_dict[key] = value.to(dml)
    return result

# Remember the unpatched method on the wrapper so a later re-run can find it.
new_fillmissing_setups._dml_original = old_fillmissing_setups
FillMissing.setups = new_fillmissing_setups



In [5]:
# Obtain the ADULT_SAMPLE dataset through fastai's `untar_data`; the returned
# location is kept in `path`, and the next cell reads `path/'adult.csv'`.
path = untar_data(URLs.ADULT_SAMPLE)

In [6]:
# Build the DataLoaders straight from the CSV. `salary` is the target; the
# remaining listed columns are split into categorical and continuous inputs,
# and the three procs are applied as preprocessing. Batches are placed on the
# DirectML device, loaded in the main process (num_workers=0).
dls = TabularDataLoaders.from_csv(
    path/'adult.csv',
    path=path,
    y_names="salary",
    cat_names=[
        'workclass', 'education', 'marital-status',
        'occupation', 'relationship', 'race',
    ],
    cont_names=['age', 'fnlwgt', 'education-num'],
    procs=[Categorify, FillMissing, Normalize],
    device=dml,
    bs=64,
    verbose=True,
    num_workers=0,
    pin_memory=False,
    persistent_workers=False,
)

Setting up after_item: Pipeline: 
Setting up before_batch: Pipeline: 
Setting up after_batch: Pipeline: ReadTabBatch


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)


In [7]:
# Point the DataLoaders at the DirectML device.
dls.to(dml)
# The trailing `;` suppresses the cell output. It replaces the dummy `""`
# expression, which itself was echoed as `''`.
dls.to_device(dml);

''

In [8]:
# Display a sample batch to sanity-check the columns and values being loaded.
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Private,Some-college,Married-civ-spouse,Adm-clerical,Wife,White,False,36.0,461336.989685,10.0,<50k
1,Private,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,False,34.0,94412.997911,9.0,>=50k
2,Self-emp-not-inc,Assoc-voc,Married-civ-spouse,Farming-fishing,Husband,White,False,31.0,226695.998947,11.0,>=50k
3,Private,Some-college,Never-married,Other-service,Own-child,White,False,21.0,163595.000712,10.0,<50k
4,?,5th-6th,Married-civ-spouse,?,Husband,White,False,60.0,131852.001391,3.0,>=50k
5,Private,HS-grad,Never-married,Transport-moving,Own-child,White,False,21.0,306850.000791,9.0,<50k
6,Private,Assoc-acdm,Divorced,Exec-managerial,Not-in-family,White,False,34.0,64830.000177,12.0,<50k
7,Private,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,False,28.0,197905.000123,9.0,<50k
8,Private,HS-grad,Never-married,Other-service,Not-in-family,White,False,21.0,57710.998879,9.0,<50k
9,Private,10th,Divorced,Prof-specialty,Unmarried,White,False,62.0,91432.999408,6.0,<50k


In [9]:
# Create the tabular learner on the DataLoaders and report accuracy as the metric.
# An assignment produces no cell output, so no dummy trailing expression is
# needed (the former `""` only added a spurious `''` output).
learn = tabular_learner(dls, metrics=accuracy)

''

In [10]:
# Move the model, then the learner, to the DirectML device.
learn.model.to(dml)
learn.to(dml)
# NOTE(review): `to_fp16(enabled=False)` presumably keeps training in full
# precision while still registering fastai's mixed-precision hook — confirm
# whether this call is needed at all.
# The cell ends on an assignment, which has no output, so the former dummy
# `""` expression (echoed as `''`) is dropped.
learn = learn.to_fp16(enabled=False)

''

In [11]:
# import cProfile
# import pstats
# from io import StringIO

# # Profile the training
# profiler = cProfile.Profile()
# profiler.enable()
# learn.fit_one_cycle(1)
# profiler.disable()

# # Get top bottlenecks
# s = StringIO()
# ps = pstats.Stats(profiler, stream=s).sort_stats('cumulative')
# ps.print_stats(30)  # Top 30 functions
# print(s.getvalue())

In [12]:
# Use learn.fit_one_cycle rather than learn.fine_tune:
# there is no pretrained model for tabular data to fine-tune.
# Train for 4 epochs.
learn.fit_one_cycle(4)



epoch,train_loss,valid_loss,accuracy,time
0,0.387549,0.38289,0.822942,00:37
1,0.361545,0.382094,0.826014,00:37
2,0.356888,0.363701,0.830774,00:36
3,0.340325,0.362943,0.834613,00:36


In [13]:
# Show sample rows with the target (`salary`) next to the model's prediction
# (`salary_pred`) for a quick visual check.
learn.show_results()



Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary,salary_pred
0,5.0,12.0,3.0,13.0,1.0,2.0,1.0,-0.261803,0.746459,-0.423871,1.0,0.0
1,5.0,8.0,3.0,8.0,1.0,5.0,1.0,-0.188387,-0.817865,0.747136,0.0,0.0
2,5.0,10.0,3.0,5.0,1.0,5.0,1.0,-0.482052,1.229239,1.137472,1.0,1.0
3,5.0,12.0,5.0,4.0,4.0,5.0,1.0,-0.482052,0.037411,-0.423871,0.0,0.0
4,5.0,2.0,5.0,9.0,2.0,3.0,1.0,0.912858,1.072362,-1.204543,0.0,0.0
5,5.0,12.0,5.0,4.0,4.0,5.0,1.0,-0.628885,0.134412,-0.423871,0.0,0.0
6,5.0,13.0,5.0,11.0,4.0,5.0,1.0,-0.261803,-0.6922,1.527808,1.0,0.0
7,1.0,12.0,5.0,1.0,5.0,3.0,1.0,-1.289631,0.548798,-0.423871,0.0,0.0
8,1.0,12.0,7.0,1.0,2.0,5.0,1.0,3.776093,-1.421555,-0.423871,0.0,0.0
