In [1]:
import pandas as pd
import numpy as np
import sqlite3
import re
from sklearn.preprocessing import LabelEncoder
from huggingface_hub import login, logout
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import warnings
warnings.filterwarnings('ignore')

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

from bs4 import BeautifulSoup
import torch

import re, glob, gc

import logging
logging.getLogger("pytorch_pretrained_bert.tokenization").setLevel(logging.ERROR)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/MicheleOrlandi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2023-11-06 12:59:03.292872: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
engine = sqlite3.connect('./src/data.db')
catalyst = pd.read_sql('SELECT * FROM catalysts', engine)

In [3]:
catalyst

Unnamed: 0,ticker,disease,stage,date,catalyst
0,SNSE,Various cancers,Phase 1,2023-11-04 00:00:00,Phase 1 safety data reported that a total of 1...
1,REGN,Various cancers,Phase 1,2023-11-04 00:00:00,Phase 1 safety data reported that a total of 1...
2,AUPH,Lupus Nephritis,Phase 3,2023-11-03 00:00:00,Phase 3 data from ASN reported that treated pa...
3,IPHA,Mycosis Fungoides,Phase 2,2023-11-03 00:00:00,Phase 2 data from ASH abstract reported that t...
4,HOWL,Solid Tumors,Phase 1b,2023-11-03 00:00:00,Phase 1/1b preliminary data provided compellin...
...,...,...,...,...,...
3906,SPPI,Colorectal cancer,Approved,2011-04-29 00:00:00,"Approved April 29, 2011."
3907,ASRT,Colorectal cancer,Approved,2011-04-29 00:00:00,"Approved April 29, 2011."
3908,ASRT,Postherpetic neuralgia - shingles,Approved,2011-01-28 00:00:00,"Approved January 28, 2011."
3909,JAZZ,Fibromyalgia,CRL,2010-10-11 00:00:00,"CRL received October 11, 2010."


In [4]:
catalyst.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3911 entries, 0 to 3910
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ticker    3911 non-null   object
 1   disease   3911 non-null   object
 2   stage     3911 non-null   object
 3   date      3911 non-null   object
 4   catalyst  3911 non-null   object
dtypes: object(5)
memory usage: 152.9+ KB


In [5]:
catalyst['stage'].unique().size

15

In [6]:
catalyst['stage'].unique()

array(['Approved', 'CRL', 'Phase 3', 'PDUFA', 'Phase 2', 'Phase 2b',
       'Phase 1b', 'Phase 2a', 'Phase 1', 'Phase 1/2', 'Phase 2/3',
       'NDA Filing', 'BLA Filing', 'Phase 1a', 'PDUFA priority review'],
      dtype=object)

In [56]:
catalyst['catalyst'][:5]

0       Approved September 4, 2009.
1    CRL received October 11, 2010.
2        Approved January 28, 2011.
3          Approved April 29, 2011.
4          Approved April 29, 2011.
Name: catalyst, dtype: object

In [46]:
re.findall(pattern="\b*approve this\w*|\b*show\w*|\b*meet\w*", string="I will approve this message because it is approving my show or meeting with meets! showing")


['approve this', 'show', 'meeting', 'meets', 'showing']

In [54]:
def special_encode(x):
    tmp_x = x.lower()
    # BULLISH
    if re.findall(pattern="\b*approv\w*|\b*show\w*|\b*meet\w*", string=x):
        return 1
    # BEARISH
    elif re.findall(pattern="\b*halt\w*|\b*fail\w*|\b*did not reach \w*|\b*no differentiation from placebo \w*|\b*no diff\w*", string=x):
        return -1
    # NEUTRAL
    else:
        return 0


In [55]:
catalyst['catalyst'].apply(special_encode).value_counts()


catalyst
 0    2908
 1     986
-1      17
Name: count, dtype: int64

In [17]:
## Download pretrained finBERT model
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/TinySapBERT-from-TinyPubMedBERT-v1.0")
model = AutoModelForSequenceClassification.from_pretrained("dmis-lab/TinySapBERT-from-TinyPubMedBERT-v1.0")
classifier = pipeline('feature-extraction', model=model, tokenizer=tokenizer, device=torch.device('cpu'), framework='pt', truncation=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/TinySapBERT-from-TinyPubMedBERT-v1.0 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
score = classifier(catalyst['catalyst'][0])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [20]:
catalyst['catalyst'][0]

'Phase 1 safety data reported that a total of 11 adverse events (including one serious adverse event not considered related to SNS-101) was reported in five patients, with no dose-limiting toxicities observed, noted November 4, 2023.'

In [19]:
score

[[0.1612229198217392, -0.09487667679786682]]

In [13]:
classifier.__dir__()

['task',
 'model',
 'tokenizer',
 'feature_extractor',
 'image_processor',
 'modelcard',
 'framework',
 'device',
 'torch_dtype',
 'binary_output',
 'call_count',
 '_batch_size',
 '_num_workers',
 '_preprocess_params',
 '_forward_params',
 '_postprocess_params',
 '__module__',
 '__doc__',
 'return_all_scores',
 'function_to_apply',
 '__init__',
 '_sanitize_parameters',
 '__call__',
 'preprocess',
 '_forward',
 'postprocess',
 '__abstractmethods__',
 '_abc_impl',
 'default_input_names',
 'save_pretrained',
 'transform',
 'predict',
 'device_placement',
 'ensure_tensor_on_device',
 '_ensure_tensor_on_device',
 'check_model_type',
 'get_inference_context',
 'forward',
 'get_iterator',
 'run_multi',
 'run_single',
 'iterate',
 '__dict__',
 '__weakref__',
 '__slots__',
 '__repr__',
 '__hash__',
 '__str__',
 '__getattribute__',
 '__setattr__',
 '__delattr__',
 '__lt__',
 '__le__',
 '__eq__',
 '__ne__',
 '__gt__',
 '__ge__',
 '__new__',
 '__reduce_ex__',
 '__reduce__',
 '__subclasshook__',
 '

In [9]:
df = catalyst.copy(deep=True)
df.head()

Unnamed: 0,ticker,disease,stage,date,catalyst,encoded_stage,label
0,SPPI,Non-Hodgkin’s lymphoma,Approved,2009-09-04 00:00:00,"Approved September 4, 2009.",0,LABEL_0
1,JAZZ,Fibromyalgia,CRL,2010-10-11 00:00:00,"CRL received October 11, 2010.",2,LABEL_2
2,ASRT,Postherpetic neuralgia - shingles,Approved,2011-01-28 00:00:00,"Approved January 28, 2011.",0,LABEL_0
3,ASRT,Colorectal cancer,Approved,2011-04-29 00:00:00,"Approved April 29, 2011.",0,LABEL_0
4,SPPI,Colorectal cancer,Approved,2011-04-29 00:00:00,"Approved April 29, 2011.",0,LABEL_0


In [11]:
train_size = 0.8
row_len = int(train_size * catalyst.shape[0])
train = catalyst.iloc[:row_len, :]
test = catalyst.iloc[row_len:, :]
print(train.shape)
print(test.shape)
print(catalyst.shape)


(3128, 7)
(783, 7)
(3911, 7)


In [12]:
train.to_csv("train.csv", index=0)
test.to_csv("test.csv", index=0)
catalyst.to_csv("catalyst.csv", index=0)


In [15]:
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
dataset = load_dataset("roymgabriel/BioPharma")
dataset


Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/606k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ticker', 'disease', 'stage', 'date', 'catalyst', 'encoded_stage', 'label'],
        num_rows: 3128
    })
    test: Dataset({
        features: ['ticker', 'disease', 'stage', 'date', 'catalyst', 'encoded_stage', 'label'],
        num_rows: 783
    })
})

In [18]:
logout()
