# Generating Model Card with PyTorch

This notebook provides an example of generating a model card for a PyTorch model using the Intel Model Card Generator.

   1. [Data Collection and Preprocessing from Adult Dataset](#1.-Data-Collection-and-Prerpocessing)
   2. [Build Multilayer Neural Network using PyTorch](#2.-Build-Model)
   3. [Train Model](#3.-Train-Model)
   4. [Collecting Inference Data](#4.-Run-Inference-and-Testing)
   5. [Generate Model Card with Intel Model Card Generator](#5.-Generate-Model-Card)

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from model_card_gen.model_card_gen import ModelCardGen

## 1. Data Collection and Prerpocessing

In [None]:
# Columns holding categorical values; these are one-hot encoded before training.
CATEGORICAL_FEATURE_KEYS = [
    'workclass', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'native-country',
]

# Columns holding numeric values; these are fed to the model unchanged.
NUMERIC_FEATURE_KEYS = [
    'age', 'capital-gain', 'capital-loss', 'hours-per-week', 'education-num',
]

# Columns removed from the feature set before encoding.
DROP_COLUMNS = ['fnlwgt', 'education']

# Name of the column that stores the prediction target.
LABEL_KEY = 'label'

#### Fetch Data from OpenML

In [None]:
# Download the OpenML dataset with id 1590, returned as pandas objects
# (as_frame=True).
data = fetch_openml(data_id=1590, as_frame=True)
# Feature frame of the fetched dataset; the next line adds a column to this
# same object.
raw_data = data.data
# Attach the target as a 'label' column so features and target stay together.
raw_data['label'] = data.target
# Work on a copy: adult_data is encoded below, while raw_data keeps the
# original columns and is reused later to build train_df / test_df.
adult_data = raw_data.copy()

#### Drop Unneeded Columns

In [None]:
# Remove the columns listed in DROP_COLUMNS; they are not used as model inputs.
adult_data = adult_data.drop(columns=DROP_COLUMNS)

# One-hot encode the categorical features. The dummy dtype is pinned because
# pandas >= 2.0 emits bool columns by default; mixed with the numeric columns
# that makes DataFrame.to_numpy() return an object array, which
# torch.from_numpy() cannot convert.
adult_data = pd.get_dummies(
    adult_data,
    columns=CATEGORICAL_FEATURE_KEYS,
    dtype=np.float32,
)

# Encode the income bracket as a class index (0: '<=50K', 1: '>50K'). The
# explicit int64 cast yields a plain integer column even when the source
# column is categorical, which is the dtype class-index loss targets need.
adult_data[LABEL_KEY] = (
    adult_data[LABEL_KEY].map({'<=50K': 0, '>50K': 1}).astype(np.int64)
)

#### Train Test Split

In [None]:
# Convert features and labels to numpy arrays.
labels = adult_data['label'].to_numpy()
adult_data = adult_data.drop(['label'], axis=1)
feature_names = list(adult_data.columns)
# Request float32 explicitly so the matrix is always a numeric array that
# torch.from_numpy() accepts, regardless of the dtype pandas chose for the
# one-hot columns.
data = adult_data.to_numpy(dtype=np.float32)

# Separate training and test sets using a seeded random permutation (70% train,
# 30% test). Seeding makes the split reproducible across runs.
rng = np.random.default_rng(seed=1)
shuffled_indices = rng.permutation(len(labels))
num_train = int(0.7 * len(labels))
train_indices = shuffled_indices[:num_train]
test_indices = shuffled_indices[num_train:]

train_features = data[train_indices]
train_labels = labels[train_indices]

test_features = data[test_indices]
test_labels = labels[test_indices]

# Keep the matching rows of the raw frame so that predictions can later be
# attached next to the original columns.
train_df = raw_data.iloc[train_indices].copy()
test_df = raw_data.iloc[test_indices].copy()

## 2. Build Model

In [None]:
torch.manual_seed(1)  # Set seed for reproducibility.


class AdultNN(nn.Module):
    """Three-layer fully connected classifier with sigmoid activations.

    Args:
        feature_size: Number of input features; also the width of the first
            hidden layer.
        num_labels: Number of output classes.

    The forward pass returns softmax class probabilities of shape
    ``(batch, num_labels)``, not raw logits.
    """

    def __init__(self, feature_size, num_labels):
        super().__init__()
        self.linear1 = nn.Linear(feature_size, feature_size)
        self.sigmoid1 = nn.Sigmoid()
        self.linear2 = nn.Linear(feature_size, 8)
        self.sigmoid2 = nn.Sigmoid()
        # The output width follows num_labels; it was previously hard-coded
        # to 2, which silently ignored the constructor argument.
        self.linear3 = nn.Linear(8, num_labels)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-class probabilities for a ``(batch, feature_size)`` input."""
        lin1_out = self.linear1(x)
        sigmoid_out1 = self.sigmoid1(lin1_out)
        sigmoid_out2 = self.sigmoid2(self.linear2(sigmoid_out1))
        return self.softmax(self.linear3(sigmoid_out2))

## 3. Train Model

In [None]:
net = AdultNN(len(feature_names), 2)

# AdultNN.forward() already returns softmax probabilities. CrossEntropyLoss
# expects raw logits and applies log-softmax itself, so feeding it
# probabilities would apply softmax twice. NLLLoss over the log of the
# probabilities computes the intended cross-entropy.
criterion = nn.NLLLoss()
num_epochs = 200

optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
input_tensor = torch.from_numpy(train_features).type(torch.FloatTensor)
# Class-index targets must be int64 (long) tensors.
label_tensor = torch.from_numpy(train_labels).long()

# Full-batch training: every epoch is one optimizer step over the whole
# training set.
for epoch in range(num_epochs):
    output = net(input_tensor)
    # Clamp before the log so a probability that underflows to 0 cannot
    # produce -inf / NaN in the loss.
    loss = criterion(torch.log(output.clamp_min(1e-12)), label_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch + 1}/{num_epochs} => Loss: {loss.item():.2f}')

## 4. Run Inference and Testing

In [None]:
# Score the held-out test split with the trained network.
test_input_tensor = torch.from_numpy(test_features).to(torch.float32)
test_label_tensor = torch.from_numpy(test_labels)

# The network outputs class probabilities; detach() drops the autograd graph
# so the result can be exported to numpy. The predicted class is the most
# probable one in each row.
y_probs_test = net(test_input_tensor).detach().numpy()
y_preds_test = y_probs_test.argmax(axis=1)

In [None]:
# Score the training split with the trained network.
train_input_tensor = torch.from_numpy(train_features).to(torch.float32)
train_label_tensor = torch.from_numpy(train_labels)

# The network outputs class probabilities; detach() drops the autograd graph
# so the result can be exported to numpy. The predicted class is the most
# probable one in each row.
y_probs_train = net(train_input_tensor).detach().numpy()
y_preds_train = y_probs_train.argmax(axis=1)

In [None]:
# Accuracy = number of predictions that match the label / number of samples.
train_accuracy = (y_preds_train == train_labels).sum() / len(train_labels)
test_accuracy = (y_preds_test == test_labels).sum() / len(test_labels)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

**Add Outputs to DataFrame**

The predicted value for each data point needs to be added back into `train_df` and `test_df` so that we can generate fairness metrics and graphs.

In [None]:
# Attach the model outputs to each split's frame:
#   y_prob - highest class probability of the row (probability of the
#            predicted class),
#   y_pred - predicted class index,
#   y_true - ground-truth class index.
for split_df, split_probs, split_preds, split_labels in (
    (train_df, y_probs_train, y_preds_train, train_labels),
    (test_df, y_probs_test, y_preds_test, test_labels),
):
    split_df['y_prob'] = [max(row) for row in split_probs]
    split_df['y_pred'] = split_preds
    split_df['y_true'] = split_labels

## 5. Generate Model Card

#### EvalConfig Input

In [None]:
# Path of the evaluation config file; the next cell writes it with %%writefile
# and it is later passed to ModelCardGen.generate as eval_config.
_eval_config = 'eval_config.proto'

In [None]:
%%writefile {_eval_config}

# Evaluation config passed to ModelCardGen.generate via eval_config.
# label_key / prediction_key name the 'y_true' and 'y_pred' columns that the
# notebook adds to train_df and test_df.
model_specs {
    label_key: 'y_true'
    prediction_key: 'y_pred'
  }
# Metrics to compute for the model card.
metrics_specs {
    metrics {class_name: "BinaryAccuracy"}
    metrics {class_name: "AUC"}
    metrics {class_name: "ConfusionMatrixPlot"}
#     metrics {class_name: "ConfusionMatrixAtThresholds"}
    metrics {
      class_name: "FairnessIndicators"
#       config: '{"thresholds": [0.25, 0.5, 0.75]}'
    }
  }
# NOTE(review): the empty slicing spec presumably selects the whole dataset
# (no slicing) - confirm against the evaluation library's documentation.
slicing_specs {}
# Additionally slice the metrics by the 'sex' feature column.
slicing_specs {
        feature_keys: 'sex'
#         feature_keys: 'race'
  }
options {
    include_default_metrics { value: false }
  }

In [None]:
# Static model card metadata (schema version, model details, citations and
# overview) handed to ModelCardGen.generate as model_card.
mc = {
    "schema_version": "0.0.1",
    "model_details": {
        "name": "Adult Multilayer Neural Network",
        "version": {"name": "0.1", "date": "2022-08-01"},
        "graphics": {},
        # One {"citation": ...} entry per referenced publication.
        "citations": [
            {"citation": reference}
            for reference in (
                'Simoudis, Evangelos, Jiawei Han, and Usama Fayyad. Proceedings of the second international conference on knowledge discovery & data mining. No. CONF-960830-. AAAI Press, Menlo Park, CA (United States), 1996.',
                'Friedler, Sorelle A., et al. "A Comparative Study of Fairness-Enhancing Interventions in Machine Learning." Proceedings of the Conference on Fairness, Accountability, and Transparency, 2019, https://doi.org/10.1145/3287560.3287589.',
                'Lahoti, Preethi, et al. "Fairness without demographics through adversarially reweighted learning." Advances in neural information processing systems 33 (2020): 728-740.',
            )
        ],
        "overview": 'This example model card is for a multilayer network trained "Adult" dataset from the UCI repository with the learning task of predicting whether a person has a salary greater or less than $50,000.',
    },
}

In [None]:
# Build the model card from the scored train/test frames, the evaluation
# config file and the static metadata. Left as a bare expression so the
# notebook displays the returned value.
ModelCardGen.generate(
    data_sets={'train': train_df, 'test': test_df},
    eval_config=_eval_config,
    model_card=mc,
)