<a href="https://colab.research.google.com/github/joshua-stock/fl-official-statistics/blob/main/med-insurance/fl-tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hands-on introduction to FL in OS

# Federated Insurance


## Tensorflow Federated Tutorials

**Getting started**

1. [Federated Learning for image classification](https://www.tensorflow.org/federated/tutorials/federated_learning_for_image_classification)
1. [Federated Learning for Text Generation](https://www.tensorflow.org/federated/tutorials/federated_learning_for_text_generation)
1. [Tuning recommended aggregations for learning](https://www.tensorflow.org/federated/tutorials/tuning_recommended_aggregators)
1. [Federated Reconstruction for Matrix Factorization](https://www.tensorflow.org/federated/tutorials/federated_reconstruction_for_matrix_factorization)

**... and  [more](https://www.tensorflow.org/federated/tutorials/tutorials_overview)**



## Setup

In [1]:
# Setup colab if needed

try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

print("COLAB? {}".format(IN_COLAB))

if IN_COLAB:
    import os

    # rm repo from gdrive
    if os.path.exists("fl-official-statistics"):
      %rm -r fl-official-statistics

    # clone
    !git clone https://github.com/joshua-stock/fl-official-statistics
    %cd fl-official-statistics

    # pull (the currenct version of the repo)
    !git pull

    !pip install -q tensorflow-federated==0.56.0
    # or possibly !pip install -r requirements.txt

    os.chdir("med-insurance")

COLAB? True
Cloning into 'fl-official-statistics'...
remote: Enumerating objects: 304, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 304 (delta 17), reused 9 (delta 5), pack-reused 270[K
Receiving objects: 100% (304/304), 15.91 MiB | 16.29 MiB/s, done.
Resolving deltas: 100% (100/100), done.
/content/fl-official-statistics
Already up to date.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.6/142.6 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 kB[0m [31m10.0 MB/s

## Minimal example with wrappers

In [52]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Data and inputs
client_var = 'region'
target = 'charges'
df_raw  = pd.read_csv('data/insurance.csv')

# preprocessing
df = df_raw.copy()
df[['sex', 'smoker']] = OrdinalEncoder().fit_transform(
    df[['sex', 'smoker']].astype('category'))
df[['age', 'bmi', 'children']] = MinMaxScaler(
  ).fit_transform(df[['age', 'bmi', 'children']])

clients = df[client_var].unique()

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,0.021739,0.0,0.321227,0.0,1.0,southwest,16884.924
1,0.0,1.0,0.47915,0.2,0.0,southeast,1725.5523
2,0.217391,1.0,0.458434,0.6,0.0,southeast,4449.462
3,0.326087,1.0,0.181464,0.0,0.0,northwest,21984.47061
4,0.304348,1.0,0.347592,0.0,0.0,northwest,3866.8552


In [4]:
from FLutils import (
    create_keras_model,    # construct a deep neural network (keras)
    model_fn,              # convert keras model to tff.learning.models
    prep_fed_train,        # convert training data to tensors for learning with tensorflow
    prep_fed_test,         # convert test data to tensors for testing with tensorflow (other format than training data)
    train_fed              # train a keras model federated with distributed data
    )

In [32]:
def keras_blueprint(compile = False, nfeatures = None):
    if nfeatures == None: nfeatures = len(df.columns) - 2

    return create_keras_model(
        nfeatures = nfeatures,
        units = [40, 40, 20],
        activations = ['relu'] * 3,
        compile = compile)

In [55]:
train_data, test_data = train_test_split(
      df, test_size = 0.2, random_state = 42)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
560,0.608696,0.0,0.107345,0.4,0.0,northwest,9193.83850
1285,0.630435,0.0,0.224913,0.0,0.0,northeast,8534.67180
1142,0.739130,0.0,0.239440,0.0,0.0,southeast,27117.99378
969,0.456522,0.0,0.493947,1.0,0.0,southeast,8596.82780
486,0.782609,0.0,0.148238,0.6,0.0,northwest,12475.35130
...,...,...,...,...,...,...,...
1095,0.000000,0.0,0.414044,0.8,0.0,northeast,4561.18850
1130,0.456522,0.0,0.212806,1.0,0.0,southeast,8582.30230
1294,0.869565,1.0,0.247915,0.0,0.0,northeast,11931.12525
860,0.413043,0.0,0.851224,0.4,1.0,southwest,46113.51100


In [56]:
train_data_fed = []
test_data_fed = []



for client in clients:
  df_client = train_data[train_data[client_var] == client]
  df_client_train = df_client
  train_data_fed.append(
      prep_fed_train(
        df_client_train.loc[:,~ df_client_train.columns.isin([target, client_var])],
        df_client_train[target]
  ))
train_data_fed

[<_TensorSliceDataset element_spec=(TensorSpec(shape=(5,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>,
 <_TensorSliceDataset element_spec=(TensorSpec(shape=(5,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>,
 <_TensorSliceDataset element_spec=(TensorSpec(shape=(5,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>,
 <_TensorSliceDataset element_spec=(TensorSpec(shape=(5,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>]

In [57]:
# Training
result =  train_fed(
        model = model_fn(
            keras_creator = keras_blueprint,
            loss = tf.losses.MeanSquaredError()
        ),

        train_data = train_data_fed,

        NUM_ROUNDS = 5,
        NUM_EPOCHS = 5,

        client_optimizer = lambda: tf.optimizers.Adam(learning_rate = .05),
        server_optimizer = lambda: tf.optimizers.Adam(learning_rate = .05),

        BATCH_SIZE = 128,
        SHUFFLE_BUFFER = 20,
        PREFETCH_BUFFER = 5,
        SEED = 42,
        verbose = False
    )

In [59]:
# Evaluation
weights = result['process'].get_model_weights(result['state'])

model = keras_blueprint(compile = True)
weights.assign_weights_to(model)

perf_test = model.evaluate(test_data.loc[:,~test_data.columns.isin([target, client_var])].head(), test_data[target].head())
dict(zip(model.metrics_names, perf_test))




{'loss': 435178688.0,
 'mae': 17242.275390625,
 'mean_squared_error': 435178688.0,
 'r2_score': -2.147523880004883}