In [1]:
# FastAI
import fastai
from fastai.tabular.all import *

In [2]:
# Resolve the local folder of fastai's Adult sample dataset and read its CSV.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path / "adult.csv")

# Preview the first five rows (five is the default for `head`).
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [3]:
# Hold out a random 20% of the rows for validation. The seed is fixed so that
# a fresh kernel (Restart & Run All) reproduces the same train/valid split.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))
to = TabularPandas(
    df,
    # Preprocessing applied to the frame:
    #   Categorify  - turns the categorical columns into integer codes
    #   FillMissing - fills missing continuous values and adds a `<col>_na` flag
    #   Normalize   - standardizes the continuous columns
    procs=[Categorify, FillMissing, Normalize],
    cat_names=[
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
    ],
    cont_names=["age", "fnlwgt", "education-num"],
    y_names="salary",
    # CategoryBlock must be given explicitly when a classification target is
    # stored as 0/1 numbers; it is optional when the target is already a
    # category (as the string-valued `salary` is here).
    y_block=CategoryBlock,
    splits=splits,
)

In [4]:
# Peek at the first two rows of the processed feature table.
to.xs.head(2)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
15382,5,16,5,2,2,5,1,-0.846986,0.185529,-0.035538
17475,5,16,1,2,5,5,1,-0.773817,-0.218193,-0.035538


In [5]:
# Wrap the processed `to` object in DataLoaders using a batch size of 64.
dls = to.dataloaders(bs=64)

In [6]:
# Display one sample batch from the DataLoaders as a table.
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Private,Some-college,Never-married,Other-service,Own-child,White,False,18.0,118376.000433,10.0,<50k
1,Private,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,False,60.999999,85433.999062,9.0,<50k
2,Private,HS-grad,Married-civ-spouse,Sales,Husband,White,False,65.000001,193216.000006,9.0,>=50k
3,Private,Assoc-voc,Married-civ-spouse,Other-service,Husband,White,False,47.0,30456.995344,11.0,<50k
4,Private,Some-college,Never-married,Exec-managerial,Unmarried,Black,False,37.0,96329.999928,10.0,<50k
5,Self-emp-not-inc,HS-grad,Never-married,Transport-moving,Own-child,White,False,23.000001,191282.99996,9.0,<50k
6,Private,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,False,41.0,204046.000487,9.0,<50k
7,?,Some-college,Never-married,?,Own-child,White,False,19.000001,194095.000034,10.0,<50k
8,Self-emp-inc,Bachelors,Married-civ-spouse,Sales,Husband,White,False,53.0,148532.000496,13.0,>=50k
9,Private,HS-grad,Never-married,Craft-repair,Not-in-family,White,False,33.0,194901.000024,9.0,<50k


In [31]:
# Build a tabular learner that reports accuracy during training.
# NOTE: switch the metric to `accuracy_multi` if the target is one-hot encoded.
learn = tabular_learner(dls, metrics=accuracy)

In [32]:
# Train for two epochs with fastai's one-cycle schedule.
learn.fit_one_cycle(n_epoch=2)

epoch,train_loss,valid_loss,accuracy,time
0,0.60411,0.596877,0.658533,00:23
1,0.593578,0.583542,0.670897,00:22


In [13]:
# Show a sample of rows with the actual target next to the model's prediction.
learn.show_results()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary,salary_pred
0,7.0,12.0,3.0,9.0,6.0,3.0,1.0,1.56758,0.263781,-0.427131,0.0,0.0
1,5.0,10.0,5.0,0.0,2.0,5.0,1.0,-0.188468,0.637908,1.139242,0.0,0.0
2,2.0,16.0,3.0,9.0,1.0,3.0,1.0,-0.04213,-0.271082,-0.035538,0.0,0.0
3,3.0,16.0,5.0,12.0,2.0,3.0,1.0,0.616388,0.494901,-0.035538,0.0,0.0
4,3.0,15.0,3.0,11.0,1.0,5.0,1.0,0.689556,1.307823,1.922429,1.0,1.0
5,5.0,12.0,1.0,15.0,2.0,5.0,1.0,-1.212829,0.353907,-0.427131,0.0,0.0
6,5.0,16.0,3.0,2.0,1.0,5.0,1.0,-0.920155,-0.770985,-0.035538,1.0,0.0
7,5.0,12.0,1.0,2.0,5.0,5.0,1.0,0.104207,-0.458174,-0.427131,0.0,0.0
8,5.0,6.0,3.0,9.0,1.0,5.0,1.0,0.323713,0.011983,-2.385098,0.0,0.0


In [14]:
# Predict on the first row of `df`; the result is unpacked into the row object,
# the predicted class and the per-class probabilities.
row, clas, probs = learn.predict(df.iloc[0])

In [15]:
# Render the row returned by `learn.predict` as a table.
row.show()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Private,Assoc-acdm,Married-civ-spouse,#na#,Wife,White,False,49.0,101320.002477,12.0,>=50k


In [16]:
# Display the predicted class and the probabilities returned by `learn.predict`.
clas, probs

(tensor(1), tensor([0.4389, 0.5611]))

In [17]:
# Build an unlabeled frame (target column removed, `df` itself left untouched)
# and wrap it in a test DataLoader derived from the learner's `dls`.
test_df = df.drop(columns=["salary"])
dl = learn.dls.test_dl(test_df)

In [18]:
# Run inference over the test DataLoader; the cell displays the returned tuple
# (its second element is None in the recorded output).
learn.get_preds(dl=dl)

(tensor([[0.4389, 0.5611],
         [0.4457, 0.5543],
         [0.9565, 0.0435],
         ...,
         [0.5212, 0.4788],
         [0.7326, 0.2674],
         [0.7528, 0.2472]]),
 None)

# Test Understanding

In [2]:
import os  # NOTE(review): `os` is not referenced in any cell visible here

# Rebinds `df` (previously the Adult sample frame) to the preprocessed parquet data.
df = pd.read_parquet("df_preprocessed.parquet")

In [3]:
# Keep only the six feature columns plus the `pass` target column.
df = df.loc[
    :,
    [
        "presnap_offense_margin",
        "down",
        "distance",
        "yards_to_goal",
        "period",
        "clock_in_seconds",
        "pass",
    ],
]

In [4]:
# Hold out a random 20% of the rows for validation. The seed is fixed so that
# a fresh kernel (Restart & Run All) reproduces the same train/valid split.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))
to = TabularPandas(
    df,
    # No categorical columns here, so only the continuous-feature procs are
    # used: FillMissing (fill gaps) and Normalize (standardize).
    procs=[FillMissing, Normalize],
    # Every column except the target is treated as a continuous feature.
    cont_names=df.drop(columns="pass").columns.tolist(),
    y_names="pass",
    # CategoryBlock is required for classification when the target is stored
    # as 0/1 numbers (as `pass` is); it is optional when y is already a category.
    y_block=CategoryBlock,
    splits=splits,
)

In [5]:
# Wrap the processed `to` object in DataLoaders (batch size 64) and display
# one sample batch as a table.
dls = to.dataloaders(bs=64)
dls.show_batch()

Unnamed: 0,presnap_offense_margin,down,distance,yards_to_goal,period,clock_in_seconds,pass
0,5.0,2.0,3.0,43.0,2.0,1815.0,1
1,-14.0,1.0,10.0,46.0,3.0,1799.999999,0
2,-14.0,1.0,10.0,55.0,3.0,1439.99999,0
3,-44.0,2.0,1.0,6.999999,4.0,797.000018,1
4,7.0,2.0,5.0,69.999999,3.0,1314.99999,0
5,-13.0,3.0,4.0,77.000001,2.0,1868.999999,0
6,-2.833267e-08,2.0,15.0,33.0,1.0,3275.000039,0
7,-7.0,1.0,10.0,75.000001,2.0,2088.999995,0
8,-2.833267e-08,2.0,4.0,69.0,1.0,3600.000059,1
9,-2.833267e-08,1.0,10.0,43.0,1.0,3524.999978,0


In [8]:
# Build a tabular learner that reports accuracy during training.
# NOTE: switch the metric to `accuracy_multi` if the target is one-hot encoded.
learn = tabular_learner(dls, metrics=accuracy)

# Train for ten epochs with the one-cycle schedule, then show sample predictions.
learn.fit_one_cycle(n_epoch=10)
learn.show_results()

epoch,train_loss,valid_loss,accuracy,time
0,0.61509,0.602558,0.654826,00:22
1,0.601534,0.593551,0.659841,00:21
2,0.588619,0.588796,0.665642,00:23
3,0.596938,0.584644,0.668433,00:22
4,0.59752,0.582504,0.66357,00:22
5,0.589357,0.580073,0.670505,00:23
6,0.589726,0.580668,0.669218,00:23
7,0.588735,0.578636,0.671617,00:23
8,0.587666,0.578489,0.671748,00:22
9,0.589289,0.577493,0.672118,00:22


Unnamed: 0,presnap_offense_margin,down,distance,yards_to_goal,period,clock_in_seconds,pass,pass_pred
0,-0.724125,0.002418,0.109534,1.518912,0.465311,-0.516079,1.0,0.0
1,-0.081805,0.990326,-0.381461,1.229155,-0.435987,0.136934,1.0,1.0
2,1.010141,-0.98549,0.355031,0.980791,-0.435987,0.341121,0.0,0.0
3,-0.531429,-0.98549,0.355031,0.856609,1.366608,-1.496562,0.0,0.0
4,1.90939,0.002418,-1.608947,0.649639,0.465311,-0.191499,1.0,0.0
5,-0.017572,-0.98549,0.355031,0.442669,1.366608,-1.236512,0.0,0.0
6,0.110892,1.978234,-1.117952,0.525457,1.366608,-1.288522,1.0,0.0
7,0.36782,1.978234,0.600529,1.684488,1.366608,-1.327048,0.0,0.0
8,-0.274501,-0.98549,0.355031,1.063579,-1.337285,1.460298,1.0,0.0


In [40]:
# Predict on the first row of `df`; the result is unpacked into the row object,
# the predicted class and the per-class probabilities.
row, clas, probs = learn.predict(df.iloc[0])
row.show()
# A bare `clas, probs` expression is only displayed when it is the last line of
# a cell; in the middle of a cell it is silently discarded, so print explicitly.
print(clas, probs)

# Batch inference: drop the target column (without mutating `df`) and score
# every row through a test DataLoader derived from the learner's `dls`.
# NOTE(review): `test_df` is built from the same `df` the model was fitted on,
# so these are not held-out predictions.
test_df = df.drop(columns=["pass"])
dl = learn.dls.test_dl(test_df)
learn.get_preds(dl=dl)

Unnamed: 0,presnap_offense_margin,down,distance,yards_to_goal,period,clock_in_seconds,pass
0,5.077935e-08,1.0,10.0,74.999999,1.0,3595.000059,0


(tensor([[0.5561, 0.4439],
         [0.4903, 0.5097],
         [0.1692, 0.8308],
         ...,
         [0.1439, 0.8561],
         [0.2929, 0.7071],
         [0.9294, 0.0706]]),
 None)