In [1]:
import pandas as pd
import numpy as np
import sqlite3
import re
from sklearn.preprocessing import LabelEncoder
from huggingface_hub import login, logout
from datasets import load_dataset



In [2]:
engine = sqlite3.connect('./src/data.db')
catalyst = pd.read_sql('SELECT * FROM catalysts', engine)


In [3]:
catalyst = catalyst.dropna(axis=0).sort_values(by=['date']).reset_index(drop=True)
catalyst


Unnamed: 0,ticker,disease,stage,date,catalyst
0,SPPI,Non-Hodgkin’s lymphoma,Approved,2009-09-04 00:00:00,"Approved September 4, 2009."
1,JAZZ,Fibromyalgia,CRL,2010-10-11 00:00:00,"CRL received October 11, 2010."
2,ASRT,Postherpetic neuralgia - shingles,Approved,2011-01-28 00:00:00,"Approved January 28, 2011."
3,ASRT,Colorectal cancer,Approved,2011-04-29 00:00:00,"Approved April 29, 2011."
4,SPPI,Colorectal cancer,Approved,2011-04-29 00:00:00,"Approved April 29, 2011."
...,...,...,...,...,...
3906,FLGT,Various cancers,Phase 1/2,2023-11-03 00:00:00,Phase 1/2 data reported that FID-007 may have ...
3907,ELTX,Solid Tumors,Phase 1/2,2023-11-03 00:00:00,Phase 1/2 data presented at SITC reported that...
3908,CUE,Wilms' Tumor (WT1)-expressing cancers,Phase 1,2023-11-03 00:00:00,Phase 1 data presented at SITC demonstrated a ...
3909,REGN,Various cancers,Phase 1,2023-11-04 00:00:00,Phase 1 safety data reported that a total of 1...


In [4]:
catalyst.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3911 entries, 0 to 3910
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ticker    3911 non-null   object
 1   disease   3911 non-null   object
 2   stage     3911 non-null   object
 3   date      3911 non-null   object
 4   catalyst  3911 non-null   object
dtypes: object(5)
memory usage: 152.9+ KB


In [5]:
catalyst['stage'].unique().size


15

In [6]:
catalyst['stage'].unique()


array(['Approved', 'CRL', 'Phase 3', 'PDUFA', 'Phase 2', 'Phase 2b',
       'Phase 1b', 'Phase 2a', 'Phase 1', 'Phase 1/2', 'Phase 2/3',
       'NDA Filing', 'BLA Filing', 'Phase 1a', 'PDUFA priority review'],
      dtype=object)

In [56]:
catalyst['catalyst'][:5]


0       Approved September 4, 2009.
1    CRL received October 11, 2010.
2        Approved January 28, 2011.
3          Approved April 29, 2011.
4          Approved April 29, 2011.
Name: catalyst, dtype: object

In [46]:
re.findall(pattern="\b*approve this\w*|\b*show\w*|\b*meet\w*", string="I will approve this message because it is approving my show or meeting with meets! showing")


['approve this', 'show', 'meeting', 'meets', 'showing']

In [54]:
def special_encode(x):
    tmp_x = x.lower()
    # BULLISH
    if re.findall(pattern="\b*approv\w*|\b*show\w*|\b*meet\w*", string=x):
        return 1
    # BEARISH
    elif re.findall(pattern="\b*halt\w*|\b*fail\w*|\b*did not reach \w*|\b*no differentiation from placebo \w*|\b*no diff\w*", string=x):
        return -1
    # NEUTRAL
    else:
        return 0


In [55]:
catalyst['catalyst'].apply(special_encode).value_counts()


catalyst
 0    2908
 1     986
-1      17
Name: count, dtype: int64

In [58]:
# tmp = [f"LABEL_{i}" for i in range(catalyst['stage'].unique().size)]
# np.c_[catalyst['stage'].unique(), tmp]


In [57]:

# # Assuming 'catalyst' is your DataFrame and 'stage' is the column to be encoded

# label_encoder = LabelEncoder()

# # Fit and transform the 'stage' column to encode the stages
# catalyst['encoded_stage'] = label_encoder.fit_transform(catalyst['stage'])

# # To map the encoded labels back to 'LABEL_i'
# stage_mapping = dict(zip(label_encoder.classes_, [f"LABEL_{i}" for i in range(label_encoder.classes_.size)]))
# catalyst['label'] = catalyst['stage'].map(stage_mapping)

# # Display the DataFrame with encoded stages and their corresponding labels
# print(catalyst[['stage', 'encoded_stage', 'label']])


In [9]:
df = catalyst.copy(deep=True)
df.head()


Unnamed: 0,ticker,disease,stage,date,catalyst,encoded_stage,label
0,SPPI,Non-Hodgkin’s lymphoma,Approved,2009-09-04 00:00:00,"Approved September 4, 2009.",0,LABEL_0
1,JAZZ,Fibromyalgia,CRL,2010-10-11 00:00:00,"CRL received October 11, 2010.",2,LABEL_2
2,ASRT,Postherpetic neuralgia - shingles,Approved,2011-01-28 00:00:00,"Approved January 28, 2011.",0,LABEL_0
3,ASRT,Colorectal cancer,Approved,2011-04-29 00:00:00,"Approved April 29, 2011.",0,LABEL_0
4,SPPI,Colorectal cancer,Approved,2011-04-29 00:00:00,"Approved April 29, 2011.",0,LABEL_0


In [11]:
train_size = 0.8
row_len = int(train_size * catalyst.shape[0])
train = catalyst.iloc[:row_len, :]
test = catalyst.iloc[row_len:, :]
print(train.shape)
print(test.shape)
print(catalyst.shape)


(3128, 7)
(783, 7)
(3911, 7)


In [12]:
train.to_csv("train.csv", index=0)
test.to_csv("test.csv", index=0)
catalyst.to_csv("catalyst.csv", index=0)


In [15]:
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
dataset = load_dataset("roymgabriel/BioPharma")
dataset


Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/606k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ticker', 'disease', 'stage', 'date', 'catalyst', 'encoded_stage', 'label'],
        num_rows: 3128
    })
    test: Dataset({
        features: ['ticker', 'disease', 'stage', 'date', 'catalyst', 'encoded_stage', 'label'],
        num_rows: 783
    })
})

In [18]:
logout()
