# Training machine learning models

## Loading training and validation datasets

In [1]:
from pathlib import Path

import pandas as pd

# Notebooks do not define `__file__`, and the previous `Path("__file__")` was a
# plain string literal that only reached the working directory by accident
# (it resolved to `<cwd>/__file__` and then took `.parent`). Anchor all paths
# at the kernel's current working directory explicitly instead.
ROOT = Path.cwd()
DATA = ROOT / 'data'
TRAIN_AND_VAL_DS = DATA / 'US_2023_JUL_25_complete_cases_reviewd.csv'

# Single CSV holding every case; it is split into train/validation afterwards.
train_and_val_ds = pd.read_csv(TRAIN_AND_VAL_DS)

# Row-wise boolean flags used to carve out the validation subset.
prospective = train_and_val_ds['study'].eq('prospective')
birads_4a = train_and_val_ds['birads'].eq('4a')
birads_4b = train_and_val_ds['birads'].eq('4b')

# Validation rows: `study` is 'prospective' AND `birads` is '4a' or '4b'.
mask = (birads_4a | birads_4b) & prospective

# Every row outside the mask is used for training.
val_ds = train_and_val_ds.loc[mask]
train_ds = train_and_val_ds.loc[~mask]

In [2]:
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, ColumnTransformer

BEST_PARAMS = ROOT.joinpath('bparams', 'grid_search_dictionary_all_2023_5_24.pkl')

# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
bp = pd.read_pickle(BEST_PARAMS)
# The first entry under 'SVM' is unpacked as keyword arguments for SVC below;
# presumably these are the best grid-search parameters -- TODO confirm.
svm_params = bp['SVM'][0][0]

# Scale the two listed numeric columns, one-hot encode 'margins', and hand
# every other selected column to the classifier untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), ['age', 'size']),
        ('encoder', OneHotEncoder(), ['margins']),
    ],
    remainder='passthrough',
)

clf = Pipeline(steps=[
    ('ct', preprocessor),
    ('svc', SVC(**svm_params)),
])

# Input columns fed to the pipeline; 'result' is the prediction target.
features = [
    'age', 'size', 'palpable', 'vessels',
    'ir', 'shape', 'margins', 'orientation',
]

X_train, y_train = train_ds[features], train_ds['result']
X_val, y_val = val_ds[features], val_ds['result']

# Fit on the training rows, then report the score on the held-out validation
# rows (for a classifier, `score` is the mean accuracy).
clf.fit(X_train, y_train)
clf.score(X_val, y_val)

0.8310344827586207