In [2]:
import pandas as pd
import re
import random
from tqdm import tqdm

path = './dataset/processed_inputs.csv'
df = pd.read_csv(path)

In [3]:
def preprocess(df_input, df_outputs):
    """
    Example Usage:
    ```
    df_input = pd.read_csv('dataset/inputs.csv')
    df_outputs = pd.read_csv('dataset/labels.csv')

    df = preprocess(df_input, df_outputs)
    ```
    """
    df = pd.merge(df_input, df_outputs, on='PatientID', how='inner')

    # Which columns to keep
    columns_to_keep = [
        'PatientID',
        'Sex',
        'HIVTesting',
        'ECigaretteUsage',
        'DifficultyConcentrating',
        'HadAsthma',
        'HadDepressiveDisorder',
        'CovidPos',
        'FluVaxLast12',
        'RaceEthnicityCategory',
        'HadDiabetes',
        'DifficultyDressingBathing',
        'ChestScan',
        'HadCOPD',
        'BlindOrVisionDifficulty',
        'HighRiskLastYear',
        'HadAngina',
        'PneumoVaxEver',
        'HadSkinCancer',
        'HadArthritis',
        'DeafOrHardOfHearing',
        'AlcoholDrinkers',
        'HadKidneyDisease',
        'TetanusLast10Tdap',
        'SmokerStatus',
        'HeightInMeters',
        'BMI',
        'HadHeartAttack'
    ]
    df = df[columns_to_keep]

    # Turn to bool
    columns_to_transform = [
        'DifficultyConcentrating',
        'HadAsthma',
        'HadDepressiveDisorder',
        'CovidPos',
        'FluVaxLast12',
        'DifficultyDressingBathing',
        'ChestScan',
        'HadCOPD',
        'BlindOrVisionDifficulty',
        'HighRiskLastYear',
        'HadAngina',
        'PneumoVaxEver',
        'HadSkinCancer',
        'HadArthritis',
        'DeafOrHardOfHearing',
        'AlcoholDrinkers',
        'HadKidneyDisease',
        'HadHeartAttack'
    ]
    df[columns_to_transform] = df[columns_to_transform].astype(bool)

    # Rounding
    df['BMI'] = df['BMI'].round(2)
    df['HeightInMeters'] = df['HeightInMeters'].round(2)

    ### Fix Column Names
    new_columns = ['Patient ID', 'Sex', 'HIV Testing', 'E-Cigarette Usage',
               'Difficulty Concentrating', 'Had Asthma', 'Had Depressive Disorder',
               'Covid Positive', 'Flu Vaccine Last 12 Months', 'Race/Ethnicity Category', 'Had Diabetes',
               'Difficulty Dressing/Bathing', 'Chest Scan', 'Had COPD',
               'Blind or Vision Difficulty', 'High Risk Last Year', 'Had Angina',
               'Pneumonia Vaccine Ever', 'Had Skin Cancer', 'Had Arthritis', 'Deaf or Hard of Hearing',
               'Alcohol Drinkers', 'Had Kidney Disease', 'Tetanus Last 10 Years (Tdap)',
               'Smoker Status', 'Height in Meters', 'BMI', 'Had Heart Attack']

    df.columns = new_columns

    return df

df_input =  pd.read_csv('./dataset/inputs.csv')
df_outputs = pd.read_csv('./dataset/labels.csv')
df = preprocess(df_input, df_outputs)

In [4]:
def format_string(row):
    # row is a row in dataframe
    feature1 = ", ".join([f"{col}: {row[col]}" for col in row.keys()])
    return feature1

def format_string_ALL(df):
    class_0 = df[df['Had Heart Attack'] == 0]
    class_1 = df[df['Had Heart Attack'] == 1]

    class_0 = class_0.sample(frac=1)
    class_1 = class_1.sample(frac=1)

    class_0 = class_0.drop(columns=["Patient ID"])
    class_1 = class_1.drop(columns=["Patient ID"])

    # class_0 and 1 are pandas df
    # Generate features for class 0 and class 1
    feature1 = [format_string(row) for _, row in class_0.sample(frac = 1).iterrows()]
    feature2 = [format_string(row) for _, row in class_1.sample(frac = 1).iterrows()]

    return feature1, feature2

In [None]:
feature0, feature1 = format_string_ALL(df)

train_index_0 = round(len(feature0)/10)
train_index_1 = round(len(feature1)/10)
test_0 = feature0[:train_index_0]
test_1 = feature1[:train_index_1]
train_0 = feature0[train_index_0:]
train_1 = feature1[train_index_1:]

In [34]:
len(test_0)

161627

In [26]:
tokenized = model_Transformer.tokenizer(train_0)

In [25]:
tokenized[0]

Encoding(num_tokens=203, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [23]:
model_Transformer.tokenizer

MPNetTokenizerFast(name_or_path='sentence-transformers/multi-qa-mpnet-base-dot-v1', vocab_size=30527, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	104: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	30526: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=F

In [7]:
from torch.utils.data import TensorDataset, DataLoader

dataset_0 = TensorDataset([(x, y) for x, y in zip(train_0, [0]*len(train_0))])
loader_0 = DataLoader(dataset_0, batch_size=32, shuffle=True)

AttributeError: 'list' object has no attribute 'size'

## Transformer

In [15]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample, losses
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
model_Transformer = SentenceTransformer("multi-qa-mpnet-base-dot-v1").to(device)

## Scikit Logistic Regression Eval

In [25]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Split into training and testing sets
features, labels = batch(1000)
X = model_Transformer.encode(features)
y = np.array(labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Split into training and testing sets
features, labels = batch(2000)
X_train, y_train = model_Transformer.encode(features), np.array(labels)  
X_test,  y_test  = model_Transformer.encode()


# Create and train the logistic regression model
model = LogisticRegression(penalty=None)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

         0.0       0.73      0.72      0.73       198
         1.0       0.73      0.74      0.73       202

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
