In [None]:
# Install third-party dependencies for this notebook (each line shells out to pip).
# NOTE(review): scikit-learn and matplotlib are imported further down but are not
# installed here - presumably they are preinstalled in the target runtime; confirm.
# NOTE(review): no versions are pinned, so results depend on whatever pip resolves today.
!pip install openai
!pip install transformers
!pip install accelerate
!pip install numpy
!pip install pandas
!pip install datasets
!pip install torch
!pip install sentencepiece #for T5
!pip install kaggle

In [None]:
from openai import OpenAI
def get_label_categories(columns, output_column, max_attempts=5):
    """Ask an OpenAI chat model for a question and two labels for the target column.

    The model is prompted to answer in the form
    ``Question, positive_label, negative_label``. The reply is split from the
    right so that commas inside the question itself do not break parsing.

    Args:
        columns: Iterable of column-name strings describing the dataset.
        output_column: Name of the column that should be predicted.
        max_attempts: Maximum number of API calls before giving up.

    Returns:
        A ``(question, positive_label, negative_label)`` tuple of strings with
        surrounding whitespace removed.

    Raises:
        KeyError: If the ``openai_key`` environment variable is not set.
        RuntimeError: If no reply could be parsed into three non-empty fields
            within ``max_attempts`` calls.
    """
    # Imported locally: the notebook only imports `os` in a later cell, so a
    # module-level lookup would depend on cell execution order.
    import os

    client = OpenAI(api_key=os.environ["openai_key"])

    prompt = (
        f"We need to solve a classifcation problem, "
        f"We have a database with the following columns {', '.join(columns)}, where {output_column} is the outcome column we want to predict."
        "Please generate carefully the best question and labels to use on the column we want to predict .\n"
        "IMPORTANT: Give only the question and the labels seperated by comma\n"
        "use this format: \n"
        "Question, positive_label, negative_label"
    )

    for _ in range(max_attempts):
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": prompt},
            ],
        )
        reply = response.choices[0].message.content or ""
        # The two labels are the last two comma-separated fields; everything
        # before them (commas included) is the question.
        fields = [field.strip() for field in reply.replace('\n', '').rsplit(',', 2)]
        if len(fields) == 3 and all(fields):
            return fields[0], fields[1], fields[2]

    # Bounded retries: an unparsable reply must not turn into an endless
    # (and billable) request loop.
    raise RuntimeError(
        f"Could not parse a 'Question, positive_label, negative_label' reply "
        f"after {max_attempts} attempts"
    )

In [None]:
import pandas as pd
import torch

def get_string_data(data_frame, question, label_column_name, positive_label=None, negative_label=None):
    """Convert every row of a DataFrame into a text prompt and a textual label.

    Each prompt is ``"<col>: <value>, <col>: <value>, ... . <question>"`` built
    from all columns except the label column. The label column is cast to
    ``int``; a value of 1 maps to the positive label, anything else to the
    negative label.

    Args:
        data_frame: pandas DataFrame holding the feature columns and the label column.
        question: Text appended to every row description (the task prompt).
        label_column_name: Name of the column holding the 0/1 outcome.
        positive_label: Text used for rows whose label is 1. Defaults to the
            notebook-level ``positiveLabel`` global.
        negative_label: Text used for all other rows. Defaults to the
            notebook-level ``negativeLabel`` global.

    Returns:
        dict with two parallel lists: ``'texts'`` (prompts) and ``'labels'``.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if positive_label is None:
        positive_label = positiveLabel
    if negative_label is None:
        negative_label = negativeLabel

    texts = []
    labels = []
    for _, row in data_frame.iterrows():
        row_string = ', '.join(
            f'{column}: {value}' for column, value in row.items() if column != label_column_name
        )
        # The prompt must contain the question; previously the combined message
        # was built but only the bare row description was stored.
        texts.append(f"{row_string}. {question}")
        labels.append(positive_label if int(row[label_column_name]) == 1 else negative_label)

    return {
        'texts': texts,
        'labels': labels,
    }


In [None]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import TrainingArguments, AutoTokenizer, Trainer, DistilBertForSequenceClassification
from transformers import TrainerCallback, EarlyStoppingCallback
from transformers import T5Tokenizer, T5ForConditionalGeneration


def processDataSet(datasetPath, question, label_column_name, max_rows=5000, train_frac=0.8, seed=25):
  """Load a CSV, split it into train/test and tokenize both splits.

  Relies on the notebook-level ``tokenizer``, ``max_input_length`` and
  ``max_target_length`` globals and on ``get_string_data``.

  Args:
    datasetPath: Path of the CSV file to read.
    question: Prompt text forwarded to ``get_string_data``.
    label_column_name: Name of the 0/1 outcome column.
    max_rows: Only the first ``max_rows`` rows of the file are used.
    train_frac: Fraction of the (capped) rows sampled for training.
    seed: Random state used for the train/test sampling.

  Returns:
    ``(tokenized_datasets, test_data)``: a ``DatasetDict`` with tokenized
    ``train`` and ``test`` splits, and the raw test DataFrame.
  """
  data = pd.read_csv(datasetPath)
  if len(data) > max_rows:
      data = data.head(max_rows)
  train_data = data.sample(frac=train_frac, random_state=seed)
  # Everything that was not sampled for training becomes the test split.
  test_data = data.drop(train_data.index)

  dataset = DatasetDict(
      train=Dataset.from_dict(get_string_data(train_data, question, label_column_name)),
      test=Dataset.from_dict(get_string_data(test_data, question, label_column_name)),
  )

  def preprocess_data(examples):
    """Tokenize a batch of prompts and attach the tokenized labels."""
    model_inputs = tokenizer(examples['texts'], max_length=max_input_length, truncation=True)

    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["labels"], max_length=max_target_length,
                       truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

  tokenized_datasets = dataset.map(preprocess_data, batched=True)
  return tokenized_datasets, test_data

In [None]:
from transformers import TrainingArguments, AutoTokenizer, Trainer, DistilBertForSequenceClassification
from transformers import TrainerCallback, EarlyStoppingCallback, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainer
import sentencepiece
import numpy as np
import json

def get_trainer(tokenized_datasets):
  """Build a ``Seq2SeqTrainer`` that fine-tunes the global ``model`` on the train split.

  Relies on the notebook-level ``model`` and ``tokenizer`` globals. Training
  runs without evaluation and without intermediate checkpoints; every log
  record is appended to ``<output_dir>/log.jsonl``.

  Args:
    tokenized_datasets: Mapping with a tokenized ``'train'`` split.

  Returns:
    The configured ``Seq2SeqTrainer``.
  """
  # Imported locally: the notebook only imports `os` in a later cell.
  import os

  data_collator = DataCollatorForSeq2Seq(tokenizer)

  # The evaluation strategy is left at its default ("no") rather than passed
  # explicitly: the keyword was renamed (`evaluation_strategy` -> `eval_strategy`)
  # in newer transformers releases, so omitting it keeps the cell portable.
  arguments = Seq2SeqTrainingArguments(
      output_dir="sample_hf_trainer",
      per_device_train_batch_size=16,
      per_device_eval_batch_size=16,
      num_train_epochs=20,
      save_strategy='no',
      do_eval=False,
      learning_rate=2e-5,
      seed=224,
  )

  def compute_metrics(eval_pred):
      """Exact-match accuracy between decoded predictions and decoded labels."""
      predictions, labels = eval_pred
      if isinstance(predictions, tuple):
          predictions = predictions[0]
      predictions = np.asarray(predictions)
      if predictions.ndim == 3:
          # Logits of shape (batch, seq, vocab): reduce to token ids.
          predictions = predictions.argmax(axis=-1)
      # -100 marks padded/ignored positions and is not a valid token id.
      pad_id = tokenizer.pad_token_id
      predictions = np.where(predictions == -100, pad_id, predictions)
      labels = np.asarray(labels)
      labels = np.where(labels == -100, pad_id, labels)
      decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
      decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
      matches = [
          prediction.strip() == label.strip()
          for prediction, label in zip(decoded_predictions, decoded_labels)
      ]
      return {"accuracy": float(np.mean(matches)) if matches else 0.0}

  class LoggingCallback(TrainerCallback):
      """Append each log record emitted during training to a JSON-lines file."""

      def __init__(self, log_path):
          self.log_path = log_path

      def on_log(self, args, state, control, logs=None, **kwargs):
          if logs is None or not state.is_local_process_zero:
              return
          # Work on a filtered copy so the dict shared with other callbacks is not mutated.
          record = {key: value for key, value in logs.items() if key != "total_flos"}
          log_dir = os.path.dirname(self.log_path)
          if log_dir:
              os.makedirs(log_dir, exist_ok=True)
          with open(self.log_path, "a") as f:
              f.write(json.dumps(record) + "\n")

  # NOTE(review): recent transformers releases deprecate `tokenizer=` in favour of
  # `processing_class=` - confirm against the installed version.
  trainer = Seq2SeqTrainer(
      model=model,
      args=arguments,
      train_dataset=tokenized_datasets['train'],
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      data_collator=data_collator
  )

  trainer.add_callback(LoggingCallback("sample_hf_trainer/log.jsonl"))
  return trainer

In [None]:
from torch.nn.functional import softmax


def run_test_data(test_data, question, label_column_name):
  """Generate a prediction for every test row and report exact-match accuracy.

  Relies on the notebook-level ``model``, ``tokenizer``, ``max_input_length``,
  ``positiveLabel`` globals, and appends to the global ``y_true`` / ``y_score``
  lists (1 = positive label, 0 = anything else) that ``calculate_statistics`` reads.

  Args:
    test_data: DataFrame with the held-out rows.
    question: Prompt text forwarded to ``get_string_data``.
    label_column_name: Name of the 0/1 outcome column.

  Returns:
    dict with ``total``, ``correct``, ``accuracy`` (percentage) and the
    confusion counts ``true_positive``, ``false_positive``, ``true_negative``,
    ``false_negative``. The confusion counts use the same 1/0 encoding as
    ``y_true`` / ``y_score``, so an output that matches neither label counts as
    a negative prediction; ``correct`` counts exact string matches only.
  """
  test_data_set = get_string_data(test_data, question, label_column_name)
  correct = 0
  true_positive = 0
  false_positive = 0
  true_negative = 0
  false_negative = 0

  # After training the model may still be in training mode; switch to eval mode
  # so dropout does not perturb the predictions.
  model.eval()
  for text, label in zip(test_data_set['texts'], test_data_set['labels']):
    # Follow the model's device instead of assuming a GPU is present.
    inputs = tokenizer([text], max_length=max_input_length, truncation=True, return_tensors="pt").to(model.device)
    # NOTE(review): do_sample=True makes decoding stochastic; consider
    # do_sample=False if evaluation should be fully deterministic.
    output = model.generate(**inputs, num_beams=16, do_sample=True, min_length=0, max_length=64)

    label_string = label.strip()
    output_string = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
    labelInt = 1 if label_string == positiveLabel else 0
    outputInt = 1 if output_string == positiveLabel else 0
    y_true.append(labelInt)
    y_score.append(outputInt)

    if output_string == label_string:
      correct += 1
    if outputInt == 1 and labelInt == 1:
      true_positive += 1
    elif outputInt == 1:
      false_positive += 1
    elif labelInt == 1:
      false_negative += 1
    else:
      true_negative += 1

  length = len(test_data_set['texts'])
  # Guard against an empty test split instead of dividing by zero.
  percentage = ((correct / length) * 100) if length else 0.0
  print(f"predicted {correct} out of {length}. percentage {percentage}")
  return {
      "total": length,
      "correct": correct,
      "accuracy": percentage,
      "true_positive": true_positive,
      "false_positive": false_positive,
      "true_negative": true_negative,
      "false_negative": false_negative,
  }

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

def calculate_statistics():
  """Print precision, recall and F1 for the global ``y_true`` / ``y_score`` lists."""
  scorers = (
      ('Precision', precision_score),
      ('Recall', recall_score),
      ('F1 Score', f1_score),
  )
  # All metrics are computed before anything is printed.
  results = [(title, scorer(y_true, y_score)) for title, scorer in scorers]

  for title, value in results:
    print(f'{title}: {value}')

In [None]:
import zipfile

def get_data_set(info):
  owner = info["owner"]
  dataset = info["dataset"]
  csvFileName = info["csvFileName"]

  datasetFullName = f"{owner}/{dataset}"
  mainFolder = "datasets"

  if owner == "competitions":
    !kaggle competitions download -c {dataset}
    mainFolder = "."
  else:
    !kaggle datasets download -d {datasetFullName}


  datasetFolder = f"/content/kaggle/{mainFolder}/{owner}/{dataset}"
  csv_file_path = f"{datasetFolder}/{csvFileName}"

  with zipfile.ZipFile(f"{datasetFolder}/{dataset}.zip", "r") as zip_ref:
    zip_ref.extract(f"{csvFileName}", datasetFolder)  # Extract to a specific directory
  return csv_file_path


In [None]:
def set_dataset_params(datasetInfo):
  """Build the prompt suffix for a dataset and measure its longest label.

  Reads the notebook-level ``positiveLabel`` / ``negativeLabel`` globals.

  Args:
    datasetInfo: dict providing the ``"question"`` text.

  Returns:
    ``(inputSuffix, maxLength)``: the prompt suffix string and the character
    length of the longer of the two labels.
  """
  queryText = datasetInfo["question"]
  promptSuffix = f"\nPlease answer the following question.\n{queryText} {positiveLabel} or {negativeLabel}?"
  longestLabel = max(len(labelText) for labelText in (positiveLabel, negativeLabel))
  return promptSuffix, longestLabel


In [None]:
def preprocessing(dataset, method, columns=None):
  """Fill in the label texts (and, for ``llm-based``, the question) of a dataset entry.

  The ``dataset`` dict is modified in place.

  Args:
    dataset: Dataset description dict. For ``"llm-based"`` it must contain
      ``"labelColumnName"``.
    method: One of ``"true-false"``, ``"yes-no"`` or ``"llm-based"``.
    columns: Column names passed to the language model for ``"llm-based"``.
      When omitted, they are read from the header of the dataset's CSV file,
      which is fetched through ``get_data_set``.

  Raises:
    AssertionError: If ``method`` is not one of the supported values.
  """
  assert method in ["true-false", "yes-no", "llm-based"]

  if method == "true-false":
    dataset["positiveLabel"] = "true"
    dataset["negativeLabel"] = "false"
  elif method == "yes-no":
    dataset["positiveLabel"] = "yes"
    dataset["negativeLabel"] = "no"
  elif method == "llm-based":
    if columns is None:
      # Only the header row is needed to learn the column names.
      columns = pd.read_csv(get_data_set(dataset), nrows=0).columns
    # get_label_categories needs the column names and the target column;
    # passing the whole dataset dict as a single argument raised a TypeError.
    question, pos_label, neg_label = get_label_categories(
        [str(column) for column in columns], dataset["labelColumnName"]
    )
    dataset["question"] = question
    dataset["positiveLabel"] = pos_label
    dataset["negativeLabel"] = neg_label

In [None]:
import os

# upload kaggle.json with credentials to /content

os.environ["KAGGLE_CONFIG_DIR"] = "/content"

!kaggle config set -n path -v "/content/kaggle"

model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

max_input_length = 512
max_target_length = 64

datasets= [
    { # 0 means healthy
      "owner": "uciml",
      "dataset": "pima-indians-diabetes-database",
      "csvFileName": "diabetes.csv",
      "labelColumnName": "Outcome",
      "question": "Does the patient have diabetes?",
      "positiveLabel": "Sick",
      "negativeLabel": "Healthy",
    },
    { # 1 means survived
      "owner": "competitions",
      "dataset": "titanic",
      "csvFileName": "train.csv",
      "labelColumnName": "Survived",
      "question": "Did the person survive?",
      "positiveLabel": "Alive",
      "negativeLabel": "Dead",
    },
    { # 1 means heart disease
      "owner": "fedesoriano",
      "dataset": "heart-failure-prediction",
      "csvFileName": "heart.csv",
      "labelColumnName": "HeartDisease",
      "question": "Does the person have heart disease?",
      "positiveLabel": "Sick",
      "negativeLabel": "Healthy",
    },
    { # 1 means heart disease
      "owner": "mastmustu",
      "dataset": "income",
      "csvFileName": "train.csv",
      "labelColumnName": "income_>50K",
      "question": "Is the person income greater than 50K?",
      "positiveLabel": "Rich",
      "negativeLabel": "Poor",
    },
]

for dataset in datasets:
  print(f"running on dataset {dataset['dataset']}")

  y_true = [] # true labels
  y_score = [] # predicted scores
  positiveLabel = dataset["positiveLabel"] if dataset.get("positiveLabel") else "Yes"
  negativeLabel = dataset["negativeLabel"] if dataset.get("negativeLabel") else "No"
  test_data = None
  csv_file_path = get_data_set(dataset)
  question, maxLength = set_dataset_params(dataset)
  tokenized_datasets, test_data = processDataSet(csv_file_path, question, dataset['labelColumnName'])
  trainer = get_trainer(tokenized_datasets)
  trainer.train()
  trainer.save_model()
  run_test_data(test_data, question, dataset['labelColumnName'])
  calculate_statistics()