In [1]:
import pandas as pd
import re
import random
from tqdm import tqdm

# Location of an already-processed CSV, read into `df`.
# NOTE(review): `df` is assigned again further down in this notebook from
# preprocess(df_input, df_outputs), so this load may be redundant — confirm
# before relying on it.
path = './dataset/processed_inputs.csv'
df = pd.read_csv(path)

In [2]:
def preprocess(df_input, df_outputs):
    """
    Join features with labels and return a cleaned, display-named frame.

    The two frames are inner-joined on 'PatientID', reduced to a fixed set of
    columns, a subset of those columns is cast to bool, 'BMI' and
    'HeightInMeters' are rounded to two decimals, and every column is renamed
    to a human-readable title. Neither input frame is modified.

    Args:
        df_input: DataFrame holding 'PatientID' and the feature columns.
        df_outputs: DataFrame holding 'PatientID' and 'HadHeartAttack'.

    Returns:
        A new DataFrame whose columns are the display names, in the order
        given by the mapping below.

    Raises:
        KeyError: if an expected column is missing after the merge.

    Example Usage:
    ```
    df_input = pd.read_csv('dataset/inputs.csv')
    df_outputs = pd.read_csv('dataset/labels.csv')

    df = preprocess(df_input, df_outputs)
    ```
    """
    # Source column -> display name. The dict order is the output column
    # order; keeping both in one place means a rename can never drift out of
    # step with the selection (which a positional `df.columns = [...]` can).
    column_names = {
        'PatientID': 'Patient ID',
        'Sex': 'Sex',
        'HIVTesting': 'HIV Testing',
        'ECigaretteUsage': 'E-Cigarette Usage',
        'DifficultyConcentrating': 'Difficulty Concentrating',
        'HadAsthma': 'Had Asthma',
        'HadDepressiveDisorder': 'Had Depressive Disorder',
        'CovidPos': 'Covid Positive',
        'FluVaxLast12': 'Flu Vaccine Last 12 Months',
        'RaceEthnicityCategory': 'Race/Ethnicity Category',
        'HadDiabetes': 'Had Diabetes',
        'DifficultyDressingBathing': 'Difficulty Dressing/Bathing',
        'ChestScan': 'Chest Scan',
        'HadCOPD': 'Had COPD',
        'BlindOrVisionDifficulty': 'Blind or Vision Difficulty',
        'HighRiskLastYear': 'High Risk Last Year',
        'HadAngina': 'Had Angina',
        'PneumoVaxEver': 'Pneumonia Vaccine Ever',
        'HadSkinCancer': 'Had Skin Cancer',
        'HadArthritis': 'Had Arthritis',
        'DeafOrHardOfHearing': 'Deaf or Hard of Hearing',
        'AlcoholDrinkers': 'Alcohol Drinkers',
        'HadKidneyDisease': 'Had Kidney Disease',
        'TetanusLast10Tdap': 'Tetanus Last 10 Years (Tdap)',
        'SmokerStatus': 'Smoker Status',
        'HeightInMeters': 'Height in Meters',
        'BMI': 'BMI',
        'HadHeartAttack': 'Had Heart Attack',
    }

    # Columns cast to bool.
    # NOTE(review): astype(bool) maps every non-zero number, every non-empty
    # string (including "No") and NaN to True. This presumably expects 0/1
    # values in the CSVs — confirm against the data.
    bool_columns = [
        'DifficultyConcentrating',
        'HadAsthma',
        'HadDepressiveDisorder',
        'CovidPos',
        'FluVaxLast12',
        'DifficultyDressingBathing',
        'ChestScan',
        'HadCOPD',
        'BlindOrVisionDifficulty',
        'HighRiskLastYear',
        'HadAngina',
        'PneumoVaxEver',
        'HadSkinCancer',
        'HadArthritis',
        'DeafOrHardOfHearing',
        'AlcoholDrinkers',
        'HadKidneyDisease',
        'HadHeartAttack',
    ]

    merged = pd.merge(df_input, df_outputs, on='PatientID', how='inner')

    # Take an explicit copy: assigning into a bare column slice is what
    # triggers pandas' SettingWithCopyWarning.
    df = merged[list(column_names)].copy()

    df[bool_columns] = df[bool_columns].astype(bool)

    # Rounding
    df['BMI'] = df['BMI'].round(2)
    df['HeightInMeters'] = df['HeightInMeters'].round(2)

    return df.rename(columns=column_names)

# Read the raw feature and label CSVs and build the working frame `df`
# (display-named columns) by passing both through preprocess().
df_input =  pd.read_csv('./dataset/inputs.csv')
df_outputs = pd.read_csv('./dataset/labels.csv')
df = preprocess(df_input, df_outputs)

In [3]:
def format_string(row):
    """Render one record as a single "name: value, name: value, ..." string.

    `row` is anything exposing `.keys()` and item access by key (a DataFrame
    row in this notebook). Pairs appear in the order `row.keys()` yields them.
    """
    pairs = []
    for column in row.keys():
        pairs.append(f"{column}: {row[column]}")
    return ", ".join(pairs)

def format_string_ALL(df, random_state=None):
    """Turn every row of `df` into a text string, grouped by class and shuffled.

    Rows are split on the 'Had Heart Attack' column (== 0 vs == 1), the
    'Patient ID' column is dropped, each group is shuffled, and every
    remaining row is rendered with format_string().

    Args:
        df: DataFrame containing 'Had Heart Attack' and 'Patient ID' columns.
        random_state: optional seed (or any value accepted by
            DataFrame.sample) that makes the shuffle reproducible. The
            default None keeps the previous unseeded behaviour.

    Returns:
        (class_0_strings, class_1_strings): two lists of formatted rows.
    """
    def _formatted_rows(label):
        # One shuffle per class is enough; shuffling an already shuffled
        # frame a second time does not make the order any more random.
        subset = (
            df[df['Had Heart Attack'] == label]
            .drop(columns=["Patient ID"])
            .sample(frac=1, random_state=random_state)
        )
        return [format_string(row) for _, row in subset.iterrows()]

    return _formatted_rows(0), _formatted_rows(1)

In [None]:
# Formatted row strings per class: class 0 first, class 1 second.
feature0, feature1 = format_string_ALL(df)

# Despite the "train_index" names, these are the sizes of the held-out test
# slices: one fifth of each class, rounded to the nearest integer.
train_index_0, train_index_1 = round(len(feature0)/5), round(len(feature1)/5)

# The leading slice of each class is the test set, the remainder the train set.
test_0, train_0 = feature0[:train_index_0], feature0[train_index_0:]
test_1, train_1 = feature1[:train_index_1], feature1[train_index_1:]

In [5]:
from random import shuffle
from typing import List, Tuple

class BatchGenerator:
    """Draws class-balanced batches by cycling through two labelled lists.

    Every call to get_batch(n) takes n items from the "long" list and n items
    from the "short" list. The first pass over each list follows the order the
    caller supplied; whenever a list runs out it is refilled from the original
    and reshuffled, so the shorter list is simply revisited more often.

    Each item's label travels with it through the reshuffle, so the returned
    labels always line up with the returned items.
    """

    def __init__(self, long_list: List, short_list: List, long_label: List, short_label: List):
        """Store the two item lists together with their per-item labels.

        Raises:
            ValueError: if a label list does not have one entry per item.
        """
        if len(long_list) != len(long_label):
            raise ValueError("long_label must have exactly one label per item in long_list")
        if len(short_list) != len(short_label):
            raise ValueError("short_label must have exactly one label per item in short_list")

        self.short_original = short_list
        self.long_original = long_list
        self.short_cycle = list(short_list)
        self.long_cycle = list(long_list)
        self.long_label = long_label
        self.short_label = short_label
        # Labels still to be handed out, kept index-aligned with *_cycle.
        self._short_label_cycle = list(short_label)
        self._long_label_cycle = list(long_label)

    @staticmethod
    def _reshuffled(items, labels):
        """Return shuffled copies of `items` and `labels` under one shared permutation."""
        if len(items) == 0:
            raise IndexError("cannot draw a batch from an empty list")
        order = list(range(len(items)))
        shuffle(order)
        return [items[i] for i in order], [labels[i] for i in order]

    def get_batch(self, batch_size: int) -> Tuple[List, List]:
        """Draw `batch_size` items from each list.

        Returns:
            (data, labels): `data` holds the long-list items followed by the
            short-list items (2 * batch_size entries); `labels` holds the
            matching labels in the same order.

        Raises:
            ValueError: if `batch_size` is negative.
            IndexError: if either source list is empty.
        """
        if batch_size < 0:
            raise ValueError("batch_size must be non-negative")

        long_items, long_labels = [], []
        short_items, short_labels = [], []

        for _ in range(batch_size):
            # Replenish and shuffle if needed
            if not self.short_cycle:
                self.short_cycle, self._short_label_cycle = self._reshuffled(
                    self.short_original, self.short_label)

            if not self.long_cycle:
                self.long_cycle, self._long_label_cycle = self._reshuffled(
                    self.long_original, self.long_label)

            # Get next items together with their own labels
            long_items.append(self.long_cycle.pop(0))
            long_labels.append(self._long_label_cycle.pop(0))
            short_items.append(self.short_cycle.pop(0))
            short_labels.append(self._short_label_cycle.pop(0))

        return long_items + short_items, long_labels + short_labels

# Balanced batch source for training: train_0 is passed as the "long" list
# with label 0 for every item, train_1 as the "short" list with label 1.
generator = BatchGenerator(train_0, train_1, [0] * len(train_0), [1] * len(train_1))

## Transformer

In [6]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample, losses
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load the pretrained 'multi-qa-mpnet-base-dot-v1' sentence encoder and move
# it to the selected device.
model_Transformer = SentenceTransformer('multi-qa-mpnet-base-dot-v1').to(device)

## Regression

In [8]:
class Regression(nn.Module):
    """Single-output logistic head: one lazily-sized linear layer plus sigmoid.

    The input width is not fixed up front; `nn.LazyLinear` infers it from the
    first tensor passed through the layer.
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.LazyLinear(1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid(linear(x)), i.e. one value in (0, 1) per input row."""
        logits = self.linear(x)
        return self.sigmoid(logits)
# Instantiate the logistic head and move it to the selected device.
linear_model = Regression().to(device)

In [19]:
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# Joint fine-tuning of the sentence encoder and the logistic head. Each has
# its own optimizer, and both are stepped on the same binary cross-entropy loss.
criterion = nn.BCELoss()
optimizer_regression = torch.optim.SGD(linear_model.parameters(), lr=0.05)
optimizer_sentence = torch.optim.Adam(model_Transformer.parameters(), lr=0.0001)

# Ground truth for the held-out set never changes, so build it once.
eval_labels = [0]*len(test_0) + [1]*len(test_1)

# NOTE: `epoch` counts optimisation steps here, since every iteration draws a
# single balanced batch (32 items per class) rather than sweeping the train set.
# NOTE(review): in the recorded run the loss settles at ~0.693 (ln 2, i.e. the
# head outputting 0.5 everywhere); the learning rates are worth revisiting.
for epoch in range(100):
    data, label = generator.get_batch(32)
    label = torch.tensor(label).to(device).unsqueeze(1).type(torch.float32)

    tokenized = model_Transformer.tokenize(data)
    # Move every tensor the tokenizer produced, not just two hard-coded keys,
    # so encoders that emit extra inputs keep working.
    for key, value in list(tokenized.items()):
        if isinstance(value, torch.Tensor):
            tokenized[key] = value.to(device)

    embedding = model_Transformer(tokenized)['sentence_embedding']

    pred = linear_model(embedding)

    loss = criterion(pred,label)
    print(f'loss is {loss.item()}')

    optimizer_regression.zero_grad()
    optimizer_sentence.zero_grad()
    loss.backward()
    optimizer_regression.step()
    optimizer_sentence.step()

    if epoch % 50 == 0:
        # Periodic check on the held-out set.
        # NOTE(review): confirm whether SentenceTransformer.encode() leaves the
        # encoder in eval mode, and whether train() should be restored afterwards.
        with torch.no_grad():
            embedding = torch.tensor(model_Transformer.encode(test_0 + test_1)).to(device)
            # Flatten (N, 1) -> (N,) so sklearn receives a plain 1-D prediction vector.
            pred = linear_model(embedding).detach().cpu().numpy().ravel()
            pred = np.round(pred)
            # zero_division=0 yields the same numbers as the default but without
            # the "ill-defined" warning when one class is never predicted.
            precision, recall, f1_score, _ = precision_recall_fscore_support(
                eval_labels, pred, average='weighted', zero_division=0)
            print(f'precision is {precision}, recall is {recall}, f1 is {f1_score}')

loss is 0.7388517260551453


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precision is 0.8919146038642977, recall is 0.9444123060741519, f1 is 0.9174130415427268
loss is 0.7675619125366211
loss is 0.7199819087982178
loss is 0.6950812339782715
loss is 0.6932196617126465
loss is 0.6931595802307129
loss is 0.6931620240211487
loss is 0.693181037902832
loss is 0.6931676864624023
loss is 0.6931626796722412
loss is 0.6931648254394531
loss is 0.6931618452072144
loss is 0.6931555271148682
loss is 0.6931520700454712
loss is 0.6931489109992981
loss is 0.6931471824645996
loss is 0.6931443810462952
loss is 0.6931443810462952
loss is 0.6931434273719788
loss is 0.6931424736976624
loss is 0.6931437253952026
loss is 0.6931424140930176
loss is 0.6931411027908325
loss is 0.6931405067443848
loss is 0.693141758441925
loss is 0.6931397318840027
loss is 0.6931393146514893
loss is 0.6931390166282654
loss is 0.6931376457214355
loss is 0.6931350827217102
loss is 0.693134069442749
loss is 0.6931362152099609
loss is 0.6931333541870117
loss is 0.6931278109550476
loss is 0.69313001632690

In [20]:
# Final evaluation on the held-out strings (class 0 first, then class 1).
with torch.no_grad():
    embedding = torch.tensor(model_Transformer.encode(test_0 + test_1)).to(device)
    # Flatten (N, 1) -> (N,) so sklearn receives a plain 1-D prediction vector.
    pred = linear_model(embedding).detach().cpu().numpy().ravel()
    pred = np.round(pred)
    # zero_division=0 yields the same numbers as the default but without the
    # "ill-defined" warning when one class is never predicted.
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        [0]*len(test_0) + [1]*len(test_1), pred, average='weighted', zero_division=0)
    print(f'precision is {precision}, recall is {recall}, f1 is {f1_score}')

precision is 1.0, recall is 1.0, f1 is 1.0


## Scikit Logistic Regression Eval

In [25]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Split into training and testing sets
# features, labels = batch(1000)
# X = model_Transformer.encode(features)
# y = np.array(labels)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Split into training and testing sets
# features, labels = batch(2000)
# X_train, y_train = model_Transformer.encode(features), np.array(labels)  
# X_test,  y_test  = model_Transformer.encode()
# 
# 
# # Create and train the logistic regression model
# model = LogisticRegression(penalty=None)
# model.fit(X_train, y_train)
# 
# # Make predictions
# y_pred = model.predict(X_test)
# 
# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy:.2f}")
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

         0.0       0.73      0.72      0.73       198
         1.0       0.73      0.74      0.73       202

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
