In [31]:
import os

#import torch
import numpy as np
import pandas as pd
import transformers
from datasets import Dataset
from torch import nn

# Project root used to build every data path in this notebook.
# Set the NOTEBOOK_PROJ_DIR environment variable to run on another machine;
# when it is unset the original workstation path is used, so existing runs
# behave exactly as before.
#from nbtools.utils import files
#proj_dir = files.project_root()
proj_dir = os.environ.get(
    'NOTEBOOK_PROJ_DIR', '/media/john/hdd01/projects/notebooks'
)

# Load Dataset

In [5]:
# Read the three worksheets with a single read_excel call so the workbook is
# opened and parsed once instead of three times. Passing a list to
# `sheet_name` returns a dict keyed by the sheet names given.
data_pth = f'{proj_dir}/data/census_data.xlsx'
sheets = pd.read_excel(
    data_pth, sheet_name=['Census (Stays)', 'DXes', 'Cohorts']
)
census = sheets['Census (Stays)']
diagnoses = sheets['DXes']
cohorts = sheets['Cohorts']

# Inner-join the cohort sheet against each of the other two on the resident
# key; the row counts show how many cohort records find a match.
coh_cen = pd.merge(
    cohorts, census, how='inner', on='RESIDENT ID'
)
coh_diag = pd.merge(
    cohorts, diagnoses, how='inner', on='RESIDENT ID'
)

print(f'# of cohorts that map to census: {len(coh_cen)}')
print(f'# of cohorts that map to diagnoses: {len(coh_diag)}')


# of cohorts that map to census: 0
# of cohorts that map to diagnoses: 23


In [32]:
# Second attempt at linking stays to cohorts, this time keyed on the
# admission id rather than the resident id. Only rows whose 'ADMISS. ID'
# appears in both frames survive the inner join.
census_cohort = census.merge(cohorts, how='inner', on='ADMISS. ID')

print(census_cohort)

Empty DataFrame
Columns: [ADMISS. ID, FACILITY ID_x, RESIDENT ID_x, START DATE_x, END DATE_x, START REASON, SOURCE, SOURCE TYPE, ADMISSION ACTION DESC, ADMISSION ACTION CODE, END REASON, END REASON CODE, DESTINATION, DESTINATION TYPE, HOSPITALIZATION REASON_x, FACILITY ID_y, RESIDENT ID_y, START DATE_y, END DATE_y, HOSPITALIZATION REASON_y, PMTS COHORT]
Index: []

[0 rows x 21 columns]


In [35]:
# Cross-check the empty join: collect the distinct admission ids from each
# sheet and print the ids the two sets have in common.
censet = set(census['ADMISS. ID'])
cohset = set(cohorts['ADMISS. ID'])
print(censet & cohset)

set()


# Define a BERT-based classification model to fine-tune

In [27]:
class SentimentAnalyzer(nn.Module):
    """Pretrained BERT encoder followed by one linear classification head.

    The head is applied to the encoder's ``pooler_output`` and produces one
    raw score per class; no softmax or other activation is applied here.
    """

    def __init__(self, n_class, model_name='bert-base-uncased'):
        """Load the encoder and build the classification head.

        Args:
            n_class: Number of output classes (width of the linear head).
            model_name: Checkpoint passed to ``BertModel.from_pretrained``.
                Defaults to ``'bert-base-uncased'``, the checkpoint this
                class originally hard-coded.
        """
        super().__init__()

        # pretrained BERT model
        self.bert = transformers.BertModel.from_pretrained(model_name)

        # classification head; its input width comes from the encoder's
        # config instead of a hard-coded 768, so checkpoints with a different
        # hidden size work without editing this class
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_class)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Run the encoder and return the classification head's output.

        Args:
            input_ids: Token id tensor, forwarded to the encoder.
            attention_mask: Optional mask, forwarded to the encoder unchanged.
            token_type_ids: Optional segment ids, forwarded unchanged.

        Returns:
            The result of applying ``self.classifier`` to the encoder's
            ``pooler_output``.
        """
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.classifier(encoded.pooler_output)

In [30]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint and build the
# classifier with an 11-way output head.
tok = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = SentimentAnalyzer(n_class=11)

In [19]:
# Tokenize a single sentence into PyTorch tensors (return_tensors='pt'),
# truncating anything beyond 256 tokens, then show the resulting token ids.
sample_text = "This is a sample sentence for testing."
inputs = tok(
    sample_text, return_tensors='pt',
    padding=True, truncation=True, max_length=256,
)

print(inputs['input_ids'])

tensor([[ 101, 2023, 2003, 1037, 7099, 6251, 2005, 5604, 1012,  102]])


In [29]:
# Smoke-test the forward pass: feed the tokenizer output to the model as
# keyword arguments and display what the classification head returns.
logits = model(**inputs)
logits

tensor([[-0.2120,  0.3233]], grad_fn=<AddmmBackward0>)

In [None]:
# One timing figure per Llama instruct checkpoint, keyed by model id.
# NOTE(review): the unit is not stated here — presumably seconds; confirm.
# NOTE(review): the 0 for the 70B model looks like a placeholder for a run
# that has not been measured yet — confirm before using it in comparisons.
times = dict([
    ("meta-llama/Llama-3.2-1B-Instruct", 25.817),
    ("meta-llama/Llama-3.2-3B-Instruct", 69.560),
    ("meta-llama/Llama-3.1-8B-Instruct", 120.867),
    ("meta-llama/Llama-3.1-70B-Instruct", 0),
])