In [1]:
import pandas as pd
import re
import random
from tqdm import tqdm

path = './dataset/processed_inputs.csv'
df = pd.read_csv(path)

In [2]:
def preprocess(df_input, df_outputs):
    """
    Example Usage:
    ```
    df_input = pd.read_csv('dataset/inputs.csv')
    df_outputs = pd.read_csv('dataset/labels.csv')

    df = preprocess(df_input, df_outputs)
    ```
    """
    df = pd.merge(df_input, df_outputs, on='PatientID', how='inner')

    # Which columns to keep
    columns_to_keep = [
        'PatientID',
        'Sex',
        'HIVTesting',
        'ECigaretteUsage',
        'DifficultyConcentrating',
        'HadAsthma',
        'HadDepressiveDisorder',
        'CovidPos',
        'FluVaxLast12',
        'RaceEthnicityCategory',
        'HadDiabetes',
        'DifficultyDressingBathing',
        'ChestScan',
        'HadCOPD',
        'BlindOrVisionDifficulty',
        'HighRiskLastYear',
        'HadAngina',
        'PneumoVaxEver',
        'HadSkinCancer',
        'HadArthritis',
        'DeafOrHardOfHearing',
        'AlcoholDrinkers',
        'HadKidneyDisease',
        'TetanusLast10Tdap',
        'SmokerStatus',
        'HeightInMeters',
        'BMI',
        'HadHeartAttack'
    ]
    df = df[columns_to_keep]

    # Turn to bool
    columns_to_transform = [
        'DifficultyConcentrating',
        'HadAsthma',
        'HadDepressiveDisorder',
        'CovidPos',
        'FluVaxLast12',
        'DifficultyDressingBathing',
        'ChestScan',
        'HadCOPD',
        'BlindOrVisionDifficulty',
        'HighRiskLastYear',
        'HadAngina',
        'PneumoVaxEver',
        'HadSkinCancer',
        'HadArthritis',
        'DeafOrHardOfHearing',
        'AlcoholDrinkers',
        'HadKidneyDisease',
        'HadHeartAttack'
    ]
    df[columns_to_transform] = df[columns_to_transform].astype(bool)

    # Rounding
    df['BMI'] = df['BMI'].round(2)
    df['HeightInMeters'] = df['HeightInMeters'].round(2)

    ### Fix Column Names
    new_columns = ['Patient ID', 'Sex', 'HIV Testing', 'E-Cigarette Usage',
               'Difficulty Concentrating', 'Had Asthma', 'Had Depressive Disorder',
               'Covid Positive', 'Flu Vaccine Last 12 Months', 'Race/Ethnicity Category', 'Had Diabetes',
               'Difficulty Dressing/Bathing', 'Chest Scan', 'Had COPD',
               'Blind or Vision Difficulty', 'High Risk Last Year', 'Had Angina',
               'Pneumonia Vaccine Ever', 'Had Skin Cancer', 'Had Arthritis', 'Deaf or Hard of Hearing',
               'Alcohol Drinkers', 'Had Kidney Disease', 'Tetanus Last 10 Years (Tdap)',
               'Smoker Status', 'Height in Meters', 'BMI', 'Had Heart Attack']

    df.columns = new_columns

    return df

df_input =  pd.read_csv('./dataset/inputs.csv')
df_outputs = pd.read_csv('./dataset/labels.csv')
df = preprocess(df_input, df_outputs)

In [3]:
def format_string(row):
    # row is a row in dataframe
    feature1 = ", ".join([f"{col}: {row[col]}" for col in row.keys()])
    return feature1

def format_string_ALL(df):
    class_0 = df[df['Had Heart Attack'] == 0]
    class_1 = df[df['Had Heart Attack'] == 1]

    class_0 = class_0.sample(frac=1)
    class_1 = class_1.sample(frac=1)

    class_0 = class_0.drop(columns=["Patient ID"])
    class_1 = class_1.drop(columns=["Patient ID"])

    # class_0 and 1 are pandas df
    # Generate features for class 0 and class 1
    feature1 = [format_string(row) for _, row in class_0.sample(frac = 1).iterrows()]
    feature2 = [format_string(row) for _, row in class_1.sample(frac = 1).iterrows()]

    return feature1, feature2

In [4]:
feature0, feature1 = format_string_ALL(df)

train_index_0 = round(len(feature0)/10)
train_index_1 = round(len(feature1)/10)
test_0 = feature0[:train_index_0]
test_1 = feature1[:train_index_1]
train_0 = feature0[train_index_0:]
train_1 = feature1[train_index_1:]

In [7]:
import itertools
import random

def chunked_iterable(iterable, chunk_size):
    """
    Yield successive chunks of size `chunk_size` from `iterable`.
    
    Args:
        iterable (list): The list to be divided into chunks.
        chunk_size (int): The size of each chunk.
    
    Yields:
        list: Chunks of the original list.
    """
    for i in range(0, len(iterable), chunk_size):
        yield iterable[i:i + chunk_size]

# Sample data
list1 = [f"Element1_{i}" for i in range(1, 128)]  # Larger list with 100 elements
list2 = [f"Element2_{i}" for i in range(1, 21)]   # Smaller list with 20 elements

chunk_size = 10  # Define the size of each chunk

# Create chunked iterators
list1_chunks = chunked_iterable(list1, chunk_size)
list2_chunks = list(chunked_iterable(list2, chunk_size))  # Convert to list for cycling

# Create an infinite cycling iterator for list2 chunks
list2_cycle = itertools.cycle(list2_chunks)

# Iterate through list1 and list2 in chunks
for idx, chunk1 in enumerate(list1_chunks, 1):
    chunk2 = next(list2_cycle)
    
    # Example comparison: Print both chunks
    print(f"Iteration {idx}:")
    print(f"List1 Chunk: {chunk1}")
    print(f"List2 Chunk: {chunk2}\n")
    
    # Replace the above print statements with your comparison logic
    # For example:
    # comparison_result = compare_chunks(chunk1, chunk2)
    # process_result(comparison_result)

Iteration 1:
List1 Chunk: ['Element1_1', 'Element1_2', 'Element1_3', 'Element1_4', 'Element1_5', 'Element1_6', 'Element1_7', 'Element1_8', 'Element1_9', 'Element1_10']
List2 Chunk: ['Element2_1', 'Element2_2', 'Element2_3', 'Element2_4', 'Element2_5', 'Element2_6', 'Element2_7', 'Element2_8', 'Element2_9', 'Element2_10']

Iteration 2:
List1 Chunk: ['Element1_11', 'Element1_12', 'Element1_13', 'Element1_14', 'Element1_15', 'Element1_16', 'Element1_17', 'Element1_18', 'Element1_19', 'Element1_20']
List2 Chunk: ['Element2_11', 'Element2_12', 'Element2_13', 'Element2_14', 'Element2_15', 'Element2_16', 'Element2_17', 'Element2_18', 'Element2_19', 'Element2_20']

Iteration 3:
List1 Chunk: ['Element1_21', 'Element1_22', 'Element1_23', 'Element1_24', 'Element1_25', 'Element1_26', 'Element1_27', 'Element1_28', 'Element1_29', 'Element1_30']
List2 Chunk: ['Element2_1', 'Element2_2', 'Element2_3', 'Element2_4', 'Element2_5', 'Element2_6', 'Element2_7', 'Element2_8', 'Element2_9', 'Element2_10']

I

In [9]:
import itertools
import random
from typing import List, Tuple, Iterator

def chunked_iterable(iterable: List, chunk_size: int) -> Iterator[List]:
    """
    Yield successive chunks of size `chunk_size` from `iterable`.
    
    Args:
        iterable (List): The list to be divided into chunks.
        chunk_size (int): The size of each chunk.
    
    Yields:
        List: Chunks of the original list.
    """
    for i in range(0, len(iterable), chunk_size):
        yield iterable[i:i + chunk_size]

def iterate_lists(
    list1: List,
    list2: List,
    chunk_size: int
) -> Iterator[Tuple[List, List]]:
    """
    Iterate through list1 and list2 in chunks. When list2 is exhausted,
    restart it. If the last chunk of list1 is smaller than chunk_size,
    list2's chunk will match this smaller size.
    
    Args:
        list1 (List): The larger list to iterate through.
        list2 (List): The smaller list to cycle through.
        chunk_size (int): The size of each chunk.
    
    Yields:
        Tuple[List, List]: A tuple containing a chunk from list1 and a corresponding chunk from list2.
    """
    list1_chunks = chunked_iterable(list1, chunk_size)
    list2_chunks = list(chunked_iterable(list2, chunk_size))
    
    if not list2_chunks:
        raise ValueError("list2 must contain at least one element.")
    
    # Create a cycling iterator for list2 chunks
    list2_cycle = itertools.cycle(list2_chunks)
    
    for chunk1 in list1_chunks:
        current_chunk_size = len(chunk1)
        
        # Get the next chunk from list2_cycle
        chunk2_full = next(list2_cycle)
        
        # If the current chunk size is less than chunk_size, adjust chunk2 accordingly
        if current_chunk_size < chunk_size:
            if current_chunk_size > len(chunk2_full):
                # If chunk2_full is smaller than current_chunk_size, adjust accordingly
                # This can happen if list2 is smaller than chunk_size
                # Cycle through list2_chunks to accumulate enough elements
                needed = current_chunk_size
                chunk2 = []
                while needed > 0:
                    next_chunk = next(list2_cycle)
                    if len(next_chunk) <= needed:
                        chunk2.extend(next_chunk)
                        needed -= len(next_chunk)
                    else:
                        chunk2.extend(next_chunk[:needed])
                        # Adjust the cycle iterator to include the remaining elements
                        # Not straightforward with itertools.cycle, so handle differently
                        # Here, we'll not handle overlapping chunks, just take as much as possible
                        needed = 0
                # Truncate to the exact needed size
                chunk2 = chunk2[:current_chunk_size]
            else:
                # If list2's chunk is large enough, slice it
                chunk2 = chunk2_full[:current_chunk_size]
        else:
            chunk2 = chunk2_full
        
        yield (chunk1, chunk2)

def main():
    # Example lists
    list1 = [f"Element1_{i}" for i in range(1, 98)]  # 97 elements
    list2 = [f"Element2_{i}" for i in range(1, 21)]   # 20 elements
    
    chunk_size = 10  # Define the size of each chunk
    
    # Initialize the iterator
    paired_chunks = iterate_lists(list1, list2, chunk_size)
    
    # Iterate and perform comparisons
    for idx, (chunk1, chunk2) in enumerate(paired_chunks, 1):
        print(f"Iteration {idx}:")
        print(f"List1 Chunk ({len(chunk1)}): {chunk1}")
        print(f"List2 Chunk ({len(chunk2)}): {chunk2}")
        
        # Example comparison: Count common elements
        common_elements = set(chunk1) & set(chunk2)
        print(f"Common Elements: {common_elements}\n")
        
        # Replace the above with your actual comparison logic

if __name__ == "__main__":
    main()

Iteration 1:
List1 Chunk (10): ['Element1_1', 'Element1_2', 'Element1_3', 'Element1_4', 'Element1_5', 'Element1_6', 'Element1_7', 'Element1_8', 'Element1_9', 'Element1_10']
List2 Chunk (10): ['Element2_1', 'Element2_2', 'Element2_3', 'Element2_4', 'Element2_5', 'Element2_6', 'Element2_7', 'Element2_8', 'Element2_9', 'Element2_10']
Common Elements: set()

Iteration 2:
List1 Chunk (10): ['Element1_11', 'Element1_12', 'Element1_13', 'Element1_14', 'Element1_15', 'Element1_16', 'Element1_17', 'Element1_18', 'Element1_19', 'Element1_20']
List2 Chunk (10): ['Element2_11', 'Element2_12', 'Element2_13', 'Element2_14', 'Element2_15', 'Element2_16', 'Element2_17', 'Element2_18', 'Element2_19', 'Element2_20']
Common Elements: set()

Iteration 3:
List1 Chunk (10): ['Element1_21', 'Element1_22', 'Element1_23', 'Element1_24', 'Element1_25', 'Element1_26', 'Element1_27', 'Element1_28', 'Element1_29', 'Element1_30']
List2 Chunk (10): ['Element2_1', 'Element2_2', 'Element2_3', 'Element2_4', 'Element2_5

In [5]:
len(train_0), len(train_1), len(test_0), len(test_1)

(161627, 9515, 17958, 1057)

In [26]:
tokenized = model_Transformer.tokenizer(train_0)

In [25]:
tokenized[0]

Encoding(num_tokens=203, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [23]:
model_Transformer.tokenizer

MPNetTokenizerFast(name_or_path='sentence-transformers/multi-qa-mpnet-base-dot-v1', vocab_size=30527, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	104: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	30526: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=F

In [7]:
from torch.utils.data import TensorDataset, DataLoader

dataset_0 = TensorDataset([(x, y) for x, y in zip(train_0, [0]*len(train_0))])
loader_0 = DataLoader(dataset_0, batch_size=32, shuffle=True)

AttributeError: 'list' object has no attribute 'size'

## Transformer

In [15]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample, losses
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
model_Transformer = SentenceTransformer("multi-qa-mpnet-base-dot-v1").to(device)

## Scikit Logistic Regression Eval

In [25]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Split into training and testing sets
features, labels = batch(1000)
X = model_Transformer.encode(features)
y = np.array(labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Split into training and testing sets
features, labels = batch(2000)
X_train, y_train = model_Transformer.encode(features), np.array(labels)  
X_test,  y_test  = model_Transformer.encode()


# Create and train the logistic regression model
model = LogisticRegression(penalty=None)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

         0.0       0.73      0.72      0.73       198
         1.0       0.73      0.74      0.73       202

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
