In [33]:
import pandas as pd
import re
import random
from tqdm import tqdm

# Load `processed_inputs.csv` into `df`.
# NOTE(review): `df` is rebound further down by the `preprocess(df_input, df_outputs)`
# call, so the frame read here is not the one the later cells operate on — confirm
# whether this load is still needed.
path = './dataset/processed_inputs.csv'
df = pd.read_csv(path)

In [34]:
def preprocess(df_input, df_outputs):
    """
    Merge the input features with their labels and return a cleaned frame.

    Steps, in order:
      1. inner-join ``df_input`` and ``df_outputs`` on ``PatientID``;
      2. keep a fixed set of 28 columns (everything else is discarded);
      3. cast the indicator columns listed in ``columns_to_transform`` to ``bool``;
      4. round ``BMI`` and ``HeightInMeters`` to two decimals;
      5. rename the columns to human-readable labels.

    Example Usage:
    ```
    df_input = pd.read_csv('dataset/inputs.csv')
    df_outputs = pd.read_csv('dataset/labels.csv')

    df = preprocess(df_input, df_outputs)
    ```

    Args:
        df_input: DataFrame with a ``PatientID`` column plus the feature columns.
        df_outputs: DataFrame with a ``PatientID`` column plus the label column(s).

    Returns:
        A new DataFrame with the renamed columns; neither argument is modified.

    Raises:
        KeyError: if any expected column is missing after the merge.
    """
    # Source column -> display label, in output order. Keeping both names in one
    # mapping means the selection and the rename can never drift out of sync.
    column_labels = {
        'PatientID': 'Patient ID',
        'Sex': 'Sex',
        'HIVTesting': 'HIV Testing',
        'ECigaretteUsage': 'E-Cigarette Usage',
        'DifficultyConcentrating': 'Difficulty Concentrating',
        'HadAsthma': 'Had Asthma',
        'HadDepressiveDisorder': 'Had Depressive Disorder',
        'CovidPos': 'Covid Positive',
        'FluVaxLast12': 'Flu Vaccine Last 12 Months',
        'RaceEthnicityCategory': 'Race/Ethnicity Category',
        'HadDiabetes': 'Had Diabetes',
        'DifficultyDressingBathing': 'Difficulty Dressing/Bathing',
        'ChestScan': 'Chest Scan',
        'HadCOPD': 'Had COPD',
        'BlindOrVisionDifficulty': 'Blind or Vision Difficulty',
        'HighRiskLastYear': 'High Risk Last Year',
        'HadAngina': 'Had Angina',
        'PneumoVaxEver': 'Pneumonia Vaccine Ever',
        'HadSkinCancer': 'Had Skin Cancer',
        'HadArthritis': 'Had Arthritis',
        'DeafOrHardOfHearing': 'Deaf or Hard of Hearing',
        'AlcoholDrinkers': 'Alcohol Drinkers',
        'HadKidneyDisease': 'Had Kidney Disease',
        'TetanusLast10Tdap': 'Tetanus Last 10 Years (Tdap)',
        'SmokerStatus': 'Smoker Status',
        'HeightInMeters': 'Height in Meters',
        'BMI': 'BMI',
        'HadHeartAttack': 'Had Heart Attack',
    }

    # Turn to bool
    columns_to_transform = [
        'DifficultyConcentrating',
        'HadAsthma',
        'HadDepressiveDisorder',
        'CovidPos',
        'FluVaxLast12',
        'DifficultyDressingBathing',
        'ChestScan',
        'HadCOPD',
        'BlindOrVisionDifficulty',
        'HighRiskLastYear',
        'HadAngina',
        'PneumoVaxEver',
        'HadSkinCancer',
        'HadArthritis',
        'DeafOrHardOfHearing',
        'AlcoholDrinkers',
        'HadKidneyDisease',
        'HadHeartAttack'
    ]

    merged = pd.merge(df_input, df_outputs, on='PatientID', how='inner')

    # Explicit copy: without it the column subset is tracked as a slice of
    # `merged` and the assignments below emit SettingWithCopyWarning.
    df = merged[list(column_labels)].copy()

    # NOTE(review): astype(bool) maps NaN and every non-empty string to True.
    # This is only correct if these columns already hold 0/1 or booleans —
    # confirm the upstream encoding.
    df[columns_to_transform] = df[columns_to_transform].astype(bool)

    # Rounding
    df['BMI'] = df['BMI'].round(2)
    df['HeightInMeters'] = df['HeightInMeters'].round(2)

    ### Fix Column Names
    return df.rename(columns=column_labels)

# Read the inputs and labels CSVs and build the cleaned frame with `preprocess`.
# NOTE: this rebinds `df`, replacing the frame read from `path` in the first cell.
df_input =  pd.read_csv('./dataset/inputs.csv')
df_outputs = pd.read_csv('./dataset/labels.csv')
df = preprocess(df_input, df_outputs)

In [35]:
def format_string(row):
    """Render one record as a single ``"name: value, name: value, ..."`` string.

    ``row`` is any object exposing ``keys()`` and ``row[key]`` — for example a
    DataFrame row (``pandas.Series``) or a plain dict. Pairs appear in the order
    returned by ``row.keys()``.
    """
    pairs = (f"{name}: {row[name]}" for name in row.keys())
    return ", ".join(pairs)

def format_string_ALL(df, random_state=None, include_label=False):
    """Turn every row of ``df`` into a text record, split by heart-attack class.

    Rows are partitioned on the ``'Had Heart Attack'`` column, each partition is
    shuffled once, and every row is rendered with ``format_string``.

    Args:
        df: frame containing at least ``'Patient ID'`` and ``'Had Heart Attack'``.
        random_state: forwarded to ``DataFrame.sample`` so the shuffle can be
            reproduced; ``None`` (the default) keeps it non-deterministic.
        include_label: when ``False`` (the default) the ``'Had Heart Attack'``
            column is left out of the text. Keeping it would put the target value
            inside the features handed to the encoder (label leakage). Pass
            ``True`` to get the previous strings, which contained the label.

    Returns:
        ``(class_0_texts, class_1_texts)`` — two lists of strings.
    """
    label_col = 'Had Heart Attack'
    # The identifier is never part of the text; the label only on request.
    dropped = ["Patient ID"] if include_label else ["Patient ID", label_col]

    texts_by_class = []
    for label in (0, 1):
        rows = (
            df[df[label_col] == label]
            .sample(frac=1, random_state=random_state)  # one shuffle is enough
            .drop(columns=dropped)
        )
        texts_by_class.append([format_string(row) for _, row in rows.iterrows()])

    return texts_by_class[0], texts_by_class[1]

In [36]:
# Text records per class: `feature0` holds class 0 rows, `feature1` class 1 rows.
feature0, feature1 = format_string_ALL(df)

# Hold out the first ~10% of each class as the test split; the remainder is the
# training split. Despite their names, `train_index_*` are the hold-out sizes,
# i.e. the index at which the training portion starts.
train_index_0 = round(len(feature0) / 10)
train_index_1 = round(len(feature1) / 10)
test_0, train_0 = feature0[:train_index_0], feature0[train_index_0:]
test_1, train_1 = feature1[:train_index_1], feature1[train_index_1:]

In [37]:
from random import shuffle
from typing import List, Tuple

class BatchGenerator:
    """Endless source of class-balanced batches drawn from two labelled lists.

    Every batch contains ``batch_size`` items from the "long" list followed by
    ``batch_size`` items from the "short" list. Each list is consumed in order;
    when one runs out it is rebuilt from the original and reshuffled, so the
    shorter list is simply cycled more often.

    Items and labels travel together as ``(item, label)`` pairs, so a returned
    label always belongs to the item at the same position — also after a
    reshuffle and also when a list holds fewer than ``batch_size`` items.
    """

    def __init__(self, long_list: List, short_list: List, long_label: List, short_label: List):
        """Store the two item lists with their per-item labels.

        Raises:
            ValueError: if a label list does not have one label per item.
        """
        if len(long_list) != len(long_label) or len(short_list) != len(short_label):
            raise ValueError("each label list must contain exactly one label per item")

        self.short_original = short_list
        self.long_original = long_list
        self.long_label = long_label
        self.short_label = short_label

        # Current pass over each list as (item, label) pairs, plus a read cursor.
        # A cursor replaces list.pop(0), which costs O(n) per draw.
        self.short_cycle = list(zip(short_list, short_label))
        self.long_cycle = list(zip(long_list, long_label))
        self._short_pos = 0
        self._long_pos = 0

    def get_batch(self, batch_size: int) -> Tuple[List, List]:
        """Return ``(items, labels)`` with ``2 * batch_size`` entries each.

        The first ``batch_size`` entries come from the long list and the rest
        from the short list. ``batch_size <= 0`` yields ``([], [])``.

        Raises:
            IndexError: if either source list is empty.
        """
        long_pairs = []
        short_pairs = []

        for _ in range(batch_size):
            # Replenish and shuffle if needed (short first, then long, so the
            # sequence of shuffle() calls matches the original implementation).
            if self._short_pos >= len(self.short_cycle):
                self.short_cycle = list(zip(self.short_original, self.short_label))
                shuffle(self.short_cycle)
                self._short_pos = 0

            if self._long_pos >= len(self.long_cycle):
                self.long_cycle = list(zip(self.long_original, self.long_label))
                shuffle(self.long_cycle)
                self._long_pos = 0

            if not self.long_cycle or not self.short_cycle:
                raise IndexError("cannot draw a batch from an empty list")

            # Get next items
            long_pairs.append(self.long_cycle[self._long_pos])
            short_pairs.append(self.short_cycle[self._short_pos])
            self._long_pos += 1
            self._short_pos += 1

        items = [item for item, _ in long_pairs] + [item for item, _ in short_pairs]
        labels = [label for _, label in long_pairs] + [label for _, label in short_pairs]
        return items, labels

# Class 0 training texts are passed as the "long" list and class 1 training texts
# as the "short" list; every item gets its class index (0 or 1) as its label.
generator = BatchGenerator(train_0, train_1, [0] * len(train_0), [1] * len(train_1))

## Transformer

In [38]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample, losses
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [39]:
# Load the "multi-qa-mpnet-base-dot-v1" sentence encoder and move it to `device`.
model_Transformer = SentenceTransformer("multi-qa-mpnet-base-dot-v1").to(device)

In [54]:
# Draw one batch: 32 texts from each class (64 texts in total) plus their labels.
data, label = generator.get_batch(32)

In [55]:
# Encode the batch texts into embeddings, requested as a torch tensor
# (convert_to_tensor=True). NOTE(review): `test` holds embeddings of a training
# batch, not test data — consider a more descriptive name.
test = model_Transformer.encode(data, convert_to_tensor=True)

In [None]:
# Display the encoder object (its repr) as the cell output.
model_Transformer

In [56]:
# Display the embedding tensor produced by the encode call.
test

tensor([[ 0.1405, -0.0765, -0.2450,  ..., -0.0922, -0.2806, -0.3450],
        [ 0.0800, -0.0923, -0.2394,  ..., -0.0761, -0.2638, -0.3621],
        [ 0.0796, -0.0980, -0.2346,  ..., -0.0757, -0.2676, -0.3707],
        ...,
        [ 0.1261, -0.0687, -0.2354,  ..., -0.0914, -0.2640, -0.3486],
        [ 0.1135, -0.0509, -0.2353,  ..., -0.1000, -0.2823, -0.3591],
        [ 0.1468, -0.0653, -0.2239,  ..., -0.1234, -0.2839, -0.3783]],
       device='cuda:0')

## Regression

In [59]:
# Forward the batch embeddings through the classifier head. No training step
# appears anywhere before this call in the notebook.
# NOTE(review): `linear_model` is defined in a cell that comes AFTER this one in
# the notebook (In [58]); under Restart & Run All this cell raises NameError.
# Move this cell below the `Regression` definition.
linear_model(test)

tensor([[0.4878],
        [0.4917],
        [0.4931],
        [0.4929],
        [0.4920],
        [0.4863],
        [0.4902],
        [0.4958],
        [0.4871],
        [0.4940],
        [0.4903],
        [0.4929],
        [0.4896],
        [0.4925],
        [0.4918],
        [0.4930],
        [0.4897],
        [0.4908],
        [0.4944],
        [0.4949],
        [0.4914],
        [0.4910],
        [0.4910],
        [0.4865],
        [0.4912],
        [0.4868],
        [0.4935],
        [0.4945],
        [0.4847],
        [0.4844],
        [0.4912],
        [0.4912],
        [0.4895],
        [0.4880],
        [0.4892],
        [0.4869],
        [0.4922],
        [0.4910],
        [0.4907],
        [0.4842],
        [0.4928],
        [0.4908],
        [0.4880],
        [0.4901],
        [0.4878],
        [0.4910],
        [0.4926],
        [0.4915],
        [0.4909],
        [0.4846],
        [0.4915],
        [0.4884],
        [0.4871],
        [0.4892],
        [0.4899],
        [0

In [58]:
class Regression(nn.Module):
    """Logistic-regression head: one linear unit followed by a sigmoid.

    The input feature size is not fixed up front; ``nn.LazyLinear`` infers it
    from the first tensor passed through the module.
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.LazyLinear(1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``sigmoid(linear(x))`` — one value between 0 and 1 per input row."""
        logits = self.linear(x)
        return self.sigmoid(logits)
# Instantiate the classifier head and move it to `device`.
linear_model = Regression().to(device)

In [None]:
# NOTE(review): work-in-progress training loop. Each iteration draws a batch and
# encodes it, but the embedding is never passed to `linear_model`, no loss is
# computed and no optimizer step is taken, so no parameters are updated. TODO: finish.
for epoch in range(1):
    data, label = generator.get_batch(32)
    # Unlike the earlier encode call, convert_to_tensor=True is not passed here;
    # per the sentence-transformers docs encode() then returns a NumPy array.
    embedding = model_Transformer.encode(data)

# Binary cross-entropy loss; it is created after the loop and not used by it.
criterion = nn.BCELoss()

## Scikit Logistic Regression Eval

In [25]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Split into training and testing sets
# features, labels = batch(1000)
# X = model_Transformer.encode(features)
# y = np.array(labels)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Split into training and testing sets
# features, labels = batch(2000)
# X_train, y_train = model_Transformer.encode(features), np.array(labels)  
# X_test,  y_test  = model_Transformer.encode()
# 
# 
# # Create and train the logistic regression model
# model = LogisticRegression(penalty=None)
# model.fit(X_train, y_train)
# 
# # Make predictions
# y_pred = model.predict(X_test)
# 
# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy:.2f}")
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

         0.0       0.73      0.72      0.73       198
         1.0       0.73      0.74      0.73       202

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
