In [6]:
import pandas as pd
import numpy as np
import sqlite3
import torch
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import pipeline
from transformers import AutoModelForCausalLM # Zero-shot LLaMA-2-7B
from transformers import TrainingArguments
from transformers import Trainer

from google.colab import drive

from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import json


# Modify and push files
from huggingface_hub import login, logout
from huggingface_hub import HfApi

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Mount Google Drive inside the Colab runtime at /content/drive.
# Later cells read and write CSVs through the relative path ./drive/MyDrive/...,
# which presumably resolves under this mount when the working directory is /content.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
# Load the labelled catalyst table and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv call).
catalysts = (
    pd.read_csv('./drive/MyDrive/Colab Notebooks/final_cats.csv')
    .drop(columns='Unnamed: 0')
)
# The companion sec.csv load is currently disabled:
# sec = pd.read_csv('./drive/MyDrive/Colab Notebooks/sec.csv').drop(labels='Unnamed: 0', axis=1)

# Shifting catalyst labels
Catalyst label map (each raw label is shifted by +1):
- -1 -> 0: BEARISH
- 0 -> 1: NEUTRAL
- 1 -> 2: BULLISH

In [66]:
# Shift the raw labels {-1, 0, 1} to class ids {0, 1, 2}.
# Validate first: this cell mutates `catalysts`, so re-running it without
# reloading the data would otherwise silently shift the labels a second time.
raw_labels = catalysts['label'].astype(int)
if not raw_labels.isin([-1, 0, 1]).all():
    raise ValueError(
        "Expected raw labels in {-1, 0, 1}; reload `catalysts` before re-running this cell."
    )
catalysts['label'] = raw_labels + 1
features = catalysts.iloc[:, :-1]
labels = catalysts.iloc[:, -1:]

# Split row indices rather than feature values; the feature rows are looked up
# from `catalysts` by index in a later cell.
X_train_index, X_test_index, y_train, y_test = train_test_split(
    features.index.tolist(), labels, test_size=0.2, random_state=42, stratify=labels
)

# Balance the training classes by re-drawing EXISTING minority-class rows with
# replacement. SMOTE is deliberately not used here: it creates synthetic samples
# by interpolating between neighbours, and interpolating row indices yields
# indices of unrelated rows (possibly test-split rows or rows of another class).
train_rows = np.asarray(X_train_index)
train_classes = y_train.iloc[:, 0].to_numpy()
class_sizes = pd.Series(train_classes).value_counts().sort_index()
largest_class = class_sizes.max()
rng = np.random.default_rng(42)

# Start with every original training row, then top up each smaller class.
keep = [np.arange(train_rows.shape[0])]
for cls, size in class_sizes.items():
    shortfall = largest_class - size
    if shortfall > 0:
        candidates = np.flatnonzero(train_classes == cls)
        keep.append(rng.choice(candidates, size=shortfall, replace=True))
positions = np.concatenate(keep)

# Keep the shapes the following cells expect: (n, 1) index arrays and a label
# frame whose index is 0..n-1, aligned row-for-row with X_train_index.
X_train_index = train_rows[positions].reshape(-1, 1)
X_test_index = np.reshape(X_test_index, (-1, 1))
y_train = y_train.iloc[positions].reset_index(drop=True)

print(f"Train Size = {X_train_index.shape[0]}")
print(f"Test Size = {X_test_index.shape[0]}")

Train Size = 6450
Test Size = 785


In [67]:
# Class distribution of the resampled training labels; after balancing,
# every class should show the same count.
y_train.value_counts()

label
0        2150
1        2150
2        2150
dtype: int64

In [68]:
# Look up the feature columns (everything except the last column) for the
# selected row positions of each split, renumbering rows from 0.
X_train, X_test = (
    catalysts.iloc[:, :-1].iloc[row_index.ravel()].reset_index(drop=True)
    for row_index in (X_train_index, X_test_index)
)

In [73]:
# Join features and labels column-wise. concat(axis=1) aligns on index labels
# and pads mismatches with NaN, so both sides are forced onto a 0..n-1 index and
# the row counts are checked up front instead of trusting upstream cells.
assert len(X_train) == len(y_train), "train features/labels differ in length"
assert len(X_test) == len(y_test), "test features/labels differ in length"
train = pd.concat([X_train, y_train.reset_index(drop=True)], axis=1).reset_index(drop=True)
test = pd.concat([X_test, y_test.reset_index(drop=True)], axis=1).reset_index(drop=True)

In [74]:
# Preview the first rows of the assembled training frame (features + label).
train.head()

Unnamed: 0,ticker,disease,stage,date,catalyst,label
0,QSAM,Bone cancer,Phase 1,8/16/23,Initial Phase 1 data reported that treatment w...,2
1,BIIB,Alport Syndrome,CRL,2/25/22,"CRL issued February 25, 2022.",1
2,BHC,Acne,Approved,12/19/19,"FDA Approval announced December 19, 2019.",2
3,VERA,IgA nephropathy (IgAN),Phase 2a,11/5/22,Phase 2a final data presented at the American ...,2
4,CNTX,Endometrial Cancer,Phase 2,11/9/22,Phase 2 interim data demonstrated preliminary ...,2


In [75]:
# Preview the first rows of the assembled test frame (features + label).
test.head()

Unnamed: 0,ticker,disease,stage,date,catalyst,label
0,CANF,"Advanced liver cancer, hepatocellular carcinoma",Phase 2,12/20/21,Phase 2 data reported a complete response with...,1
1,PTCT,Spinal Muscular Atrophy (SMA) Type 1,Approved,8/7/20,"FDA Approval announced August 7, 2020.",2
2,LGND,Chronic obstructive pulmonary disease (COPD),Phase 3,12/20/22,ENHANCE-1 Phase 3 topline data met its primary...,1
3,ATXS,"Hereditary angioedema (HAE), healthy volunteers",Phase 1a,2/24/23,Phase 1a trial data reported a half-life of 11...,1
4,MRTX,Renal cell carcinoma (RCC),Phase 1/2,2/15/20,Phase 2 updated data presented at ASCO GU 2020...,1


In [76]:
# Persist both splits to Drive without writing the row index as a column.
for split_frame, csv_path in (
    (train, "./drive/MyDrive/Colab Notebooks/train.csv"),
    (test, "./drive/MyDrive/Colab Notebooks/test.csv"),
):
    split_frame.to_csv(csv_path, index=False)

In [77]:
# (rows, columns) of the training frame that was written to train.csv.
train.shape

(6450, 6)

In [78]:
# (rows, columns) of the test frame that was written to test.csv.
test.shape

(785, 6)