In [None]:
import re
import nltk
import string
import numpy as np
import pandas as pd
nltk.download('stopwords')
stopwords_lst = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
data = pd.read_csv("/content/drive/MyDrive/Lights-on-heights/data/Drugs Master List.zip",compression='zip', header=0, sep=',', quotechar='"')

In [None]:
data.head(3)

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity,rx_otc,pregnancy_category,csa,alcohol,related_drugs,medical_condition_description,rating,no_of_reviews,drug_link,medical_condition_url
0,doxycycline,Acne,"(hives, difficult breathing, swelling in your ...",doxycycline,"Miscellaneous antimalarials, Tetracyclines","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",87%,Rx,D,N,X,amoxicillin: https://www.drugs.com/amoxicillin...,Acne Other names: Acne Vulgaris; Blackheads; B...,6.8,760.0,https://www.drugs.com/doxycycline.html,https://www.drugs.com/condition/acne.html
1,spironolactone,Acne,hives ; difficulty breathing; swelling of your...,spironolactone,"Aldosterone receptor antagonists, Potassium-sp...","Aldactone, CaroSpir",82%,Rx,C,N,X,amlodipine: https://www.drugs.com/amlodipine.h...,Acne Other names: Acne Vulgaris; Blackheads; B...,7.2,449.0,https://www.drugs.com/spironolactone.html,https://www.drugs.com/condition/acne.html
2,minocycline,Acne,"skin rash, fever, swollen glands, flu-like sym...",minocycline,Tetracyclines,"Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",48%,Rx,D,N,,amoxicillin: https://www.drugs.com/amoxicillin...,Acne Other names: Acne Vulgaris; Blackheads; B...,5.7,482.0,https://www.drugs.com/minocycline.html,https://www.drugs.com/condition/acne.html


In [None]:
data.shape

(999, 17)

# AFTER INSPECTING THE DATASET, THE PRE-PROCESSING PHASE WILL BE SPLIT INTO TWO

###### Phase 1 entails pre-processing the dataset for a pre-trained model.
###### Phase 2 entails structring the dataset to be used in fine tuning the embedding layer of the pre-trained model. Hence, making it domain specific.

## PHASE 1

#### RENAME COLUMN

In [None]:
data.rename(columns ={"activity":"activity(%)"}, inplace=True)

In [None]:
data["activity(%)"] = data["activity(%)"].apply(lambda x: x.replace("%",""))

#### HANDLE COLUMN DATATYPE

In [None]:
data.dtypes

drug_name                         object
medical_condition                 object
side_effects                      object
generic_name                      object
drug_classes                      object
brand_names                       object
activity(%)                       object
rx_otc                            object
pregnancy_category                object
csa                               object
alcohol                           object
related_drugs                     object
medical_condition_description     object
rating                           float64
no_of_reviews                    float64
drug_link                         object
medical_condition_url             object
dtype: object

In [None]:
numerical_columns = ["activity(%)","rating","no_of_reviews"]
categorical_columns = [i for i in list(data.columns) if i not in  numerical_columns]

In [None]:
print(f'Numerical Columns: {numerical_columns} \nCategorical Columns: {categorical_columns}')

Numerical Columns: ['activity(%)', 'rating', 'no_of_reviews'] 
Categorical Columns: ['drug_name', 'medical_condition', 'side_effects', 'generic_name', 'drug_classes', 'brand_names', 'rx_otc', 'pregnancy_category', 'csa', 'alcohol', 'related_drugs', 'medical_condition_description', 'drug_link', 'medical_condition_url']


In [None]:
def handle_column_datatype(df, cols, dtype):
  for data_column in cols:
    if dtype == float:
      df[data_column] = df[data_column].astype("float")
    elif dtype == str:
       df[data_column] = df[data_column].astype("category")
  return df

In [None]:
# Handle the numerical columns
data = handle_column_datatype(data, numerical_columns,float)

In [None]:
# Handle the categorical columns
data = handle_column_datatype(data, categorical_columns, str)

In [None]:
data.dtypes

drug_name                        category
medical_condition                category
side_effects                     category
generic_name                     category
drug_classes                     category
brand_names                      category
activity(%)                       float64
rx_otc                           category
pregnancy_category               category
csa                              category
alcohol                          category
related_drugs                    category
medical_condition_description    category
rating                            float64
no_of_reviews                     float64
drug_link                        category
medical_condition_url            category
dtype: object

#### HANDLE NULL VALUES

In [None]:
data.isna().sum()

drug_name                          0
medical_condition                  0
side_effects                      28
generic_name                      12
drug_classes                      21
brand_names                      386
activity(%)                        0
rx_otc                             0
pregnancy_category                72
csa                                0
alcohol                          495
related_drugs                    508
medical_condition_description      0
rating                           492
no_of_reviews                    492
drug_link                          0
medical_condition_url              0
dtype: int64

In [None]:
missn_categorical_columns = ["side_effects","generic_name","drug_classes","brand_names","pregnancy_category","alcohol","related_drugs"]
missn_numerical_columns = ["rating","no_of_reviews"]

###### NOTE: Health data is very sensitivity data, hence not having the right professional expertise when handling null values present in the data could result in dire problems. Hence, the safest and simplest approach is to replace null values with a tag, a tag that indicates a missing value. This [paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5474942/) talks about the differnt ways of handling missing data in health data, to which the method I described above is among and performs relatively well.

In [None]:
mssn_str_tag = "MSSN"
mssn_float_val = -1.0

In [None]:
def handle_null_values(df, columns, dtype, fill_value):
  for column in columns:
    if dtype ==  float:
      df[column].fillna(fill_value,inplace=True)
    elif dtype == str:
      df[column] = df[column].cat.add_categories(fill_value).fillna(fill_value)
  return df

In [None]:
data = handle_null_values(data, missn_numerical_columns, float, mssn_float_val)

In [None]:
data = handle_null_values(data, missn_categorical_columns, str,  mssn_str_tag)

In [None]:
data.head(3)

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity(%),rx_otc,pregnancy_category,csa,alcohol,related_drugs,medical_condition_description,rating,no_of_reviews,drug_link,medical_condition_url
0,doxycycline,Acne,"(hives, difficult breathing, swelling in your ...",doxycycline,"Miscellaneous antimalarials, Tetracyclines","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",87.0,Rx,D,N,X,amoxicillin: https://www.drugs.com/amoxicillin...,Acne Other names: Acne Vulgaris; Blackheads; B...,6.8,760.0,https://www.drugs.com/doxycycline.html,https://www.drugs.com/condition/acne.html
1,spironolactone,Acne,hives ; difficulty breathing; swelling of your...,spironolactone,"Aldosterone receptor antagonists, Potassium-sp...","Aldactone, CaroSpir",82.0,Rx,C,N,X,amlodipine: https://www.drugs.com/amlodipine.h...,Acne Other names: Acne Vulgaris; Blackheads; B...,7.2,449.0,https://www.drugs.com/spironolactone.html,https://www.drugs.com/condition/acne.html
2,minocycline,Acne,"skin rash, fever, swollen glands, flu-like sym...",minocycline,Tetracyclines,"Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",48.0,Rx,D,N,MSSN,amoxicillin: https://www.drugs.com/amoxicillin...,Acne Other names: Acne Vulgaris; Blackheads; B...,5.7,482.0,https://www.drugs.com/minocycline.html,https://www.drugs.com/condition/acne.html


#### Handle Duplicates

In [None]:
data.duplicated().sum()

0

#### HANDLE RELATED DRUGS COLUMN

###### This column contains string data and a url in one, I split this into two, the related_drug column and the related_drug_url column. I created a new dataframe for these columns. In the new dataframe, the related_drug column will the foreign key that will connect to the original dataframe.

In [None]:
def _seperated_drug_name_frm_url(drugs):
  split_tag = "https"
  batch_related_drugs = []
  batch_related_drugs_url = []
  for drug in drugs:
    drug_info = drug.split(split_tag)
    drug_name = drug_info[0].replace(":","")
    drug_name = drug_name.strip()

    drug_url = split_tag + drug_info[1]
    drug_url =  drug_url.strip()

    batch_related_drugs.append(drug_name)
    batch_related_drugs_url.append(drug_url)
  return batch_related_drugs, batch_related_drugs_url 

In [None]:
def handle_related_drugs_column(df):
  related_drugs = []
  related_drugs_url = []
  old_related_drugs = []
  for drug in df["related_drugs"].to_list():
    if drug == mssn_str_tag:
      old_related_drugs.append(mssn_str_tag)
    else:
      drugs = drug.split("|")
      drug_names, drug_urls = _seperated_drug_name_frm_url(drugs)
      old_related_drugs.append(", ".join(drug_names))
      related_drugs.extend(drug_names)
      related_drugs_url.extend(drug_urls)
  new_dict = {"related_drugs" : related_drugs, "related_drugs_url":related_drugs_url}
  df["related_drugs"] = old_related_drugs
  return df, pd.DataFrame(new_dict)

In [None]:
data, related_drugs_df = handle_related_drugs_column(data)

In [None]:
related_drugs_df.head(3)

Unnamed: 0,related_drugs,related_drugs_url
0,amoxicillin,https://www.drugs.com/amoxicillin.html
1,prednisone,https://www.drugs.com/prednisone.html
2,albuterol,https://www.drugs.com/albuterol.html


In [None]:
# HANDLE DUPLICATES FOR THE RELATED DRUGS DATAFRAME
related_drugs_df.duplicated().sum()

4003

In [None]:
related_drugs_df.drop_duplicates(keep='last', inplace=True)

In [None]:
related_drugs_df.duplicated().sum()

0

In [None]:
related_drugs_df.shape

(353, 2)

In [None]:
data.head()

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity(%),rx_otc,pregnancy_category,csa,alcohol,related_drugs,medical_condition_description,rating,no_of_reviews,drug_link,medical_condition_url
0,doxycycline,Acne,"(hives, difficult breathing, swelling in your ...",doxycycline,"Miscellaneous antimalarials, Tetracyclines","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",87.0,Rx,D,N,X,"amoxicillin, prednisone, albuterol, ciprofloxa...",Acne Other names: Acne Vulgaris; Blackheads; B...,6.8,760.0,https://www.drugs.com/doxycycline.html,https://www.drugs.com/condition/acne.html
1,spironolactone,Acne,hives ; difficulty breathing; swelling of your...,spironolactone,"Aldosterone receptor antagonists, Potassium-sp...","Aldactone, CaroSpir",82.0,Rx,C,N,X,"amlodipine, lisinopril, losartan, metoprolol, ...",Acne Other names: Acne Vulgaris; Blackheads; B...,7.2,449.0,https://www.drugs.com/spironolactone.html,https://www.drugs.com/condition/acne.html
2,minocycline,Acne,"skin rash, fever, swollen glands, flu-like sym...",minocycline,Tetracyclines,"Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",48.0,Rx,D,N,MSSN,"amoxicillin, prednisone, doxycycline, ciproflo...",Acne Other names: Acne Vulgaris; Blackheads; B...,5.7,482.0,https://www.drugs.com/minocycline.html,https://www.drugs.com/condition/acne.html
3,Accutane,Acne,problems with your vision or hearing; muscle o...,isotretinoin (oral),"Miscellaneous antineoplastics, Miscellaneous u...",MSSN,41.0,Rx,X,N,X,"doxycycline, clindamycin topical, erythromycin...",Acne Other names: Acne Vulgaris; Blackheads; B...,7.9,623.0,https://www.drugs.com/accutane.html,https://www.drugs.com/condition/acne.html
4,clindamycin,Acne,hives ; difficult breathing; swelling of your ...,clindamycin topical,"Topical acne agents, Vaginal anti-infectives","Cleocin T, Clindacin ETZ, Clindacin P, Clindag...",39.0,Rx,B,N,MSSN,"doxycycline, metronidazole, metronidazole topi...",Acne Other names: Acne Vulgaris; Blackheads; B...,7.4,146.0,https://www.drugs.com/mtm/clindamycin-topical....,https://www.drugs.com/condition/acne.html


In [None]:
data["related_drugs"][0]

'amoxicillin, prednisone, albuterol, ciprofloxacin, azithromycin, cephalexin, metronidazole, metronidazole topical, clindamycin topical, clindamycin'

### CREATE DRUG INFORMATION COLUMN

###### This section contains the features that would be used to create an elaborate embedding matrix. The search engine entails users asking about drugs and their information, hence the semantic algorithm should be able to understand the relationship between the various types of names of drugs that could be used to referrence a specific drug, and also the medical condition that drug is used for. An example is, if a user asks about 'isotretinoin', the algorithm should know that 'isotrerinoin' could also be referred to as 'Accutane'. The alogrithm should also be able to tell that the drug is used for treating acne.

###### INTUITION: To achieve this I combined the "drug_name", "drug_classes", "brand_names", "medical_condition", and "related_drugs" columns together forming a single document. When represented in a vector space, you should be able to see that drugs associated with a specific medical conditions will be clustered together. These visualizations can be found in the notebooks used in developing the embedding matrix.

In [None]:
def create_document_info_column(df, new_df=False):
  drug_information = []
  for i in range(df.shape[0]):
    # combine all the names together to form a single string or document.
    if df["drug_name"][i] == df["generic_name"][i]:
      doc = [df["drug_name"][i], df["drug_classes"][i], df["brand_names"][i],df["medical_condition"][i], df["related_drugs"][i]]
    else:
      doc = [df["drug_name"][i], df["generic_name"][i], df["drug_classes"][i], df["brand_names"][i],df["medical_condition"][i], df["related_drugs"][i]]
    doc_string = ", ".join(doc)
    drug_information.append(doc_string)
  df["drug_information"] = drug_information

  if new_df:
    temp = {"drug_information": drug_information,
            "drug_name": df["drug_name"].to_list()}
    new_dataframe = pd.DataFrame(temp)
    return df, new_dataframe

  return df, _

In [None]:
data, _ = create_document_info_column(data, new_df=False)

In [None]:
data["drug_information"][0]

'doxycycline, Miscellaneous antimalarials, Tetracyclines, Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alodox, Avidoxy, Doryx, Mondoxyne NL, Monodox, Morgidox, Okebo, Oracea, Oraxyl, Periostat Targadox, Vibramycin calcium, Vibramycin Hyclate, Vibramycin monohydrate, Vibra-Tabs, Acne, amoxicillin, prednisone, albuterol, ciprofloxacin, azithromycin, cephalexin, metronidazole, metronidazole topical, clindamycin topical, clindamycin'

### FOCUSING ON THE IMPORTANT FEATURES

In [None]:
features = ["drug_information", "medical_condition", "drug_name"]

In [None]:
sub_data = data[features]

In [None]:
sub_data.head()

Unnamed: 0,drug_information,medical_condition,drug_name
0,"doxycycline, Miscellaneous antimalarials, Tetr...",Acne,doxycycline
1,"spironolactone, Aldosterone receptor antagonis...",Acne,spironolactone
2,"minocycline, Tetracyclines, Dynacin, Minocin, ...",Acne,minocycline
3,"Accutane, isotretinoin (oral), Miscellaneous a...",Acne,Accutane
4,"clindamycin, clindamycin topical, Topical acne...",Acne,clindamycin


###### PREPROCESS THE FEATURES

In [None]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def pre_proceess_document(df):
  string.punctation = string.punctuation + "——"
  df = re.sub('\w*\d\w*', '', str(df)) # remove numbers
  df = re.sub(' +', ' ', str(df)) # remove extra whitespaces
  df = re.sub('[%s]' % re.escape(string.punctuation), ' ' , str(df)) # remove punctuations
  df = re.sub('’s', '',str(df)) # remove 's from words in sentences
  df = df.lower() # convert to lower case
  df = df.replace("mssn","") # remove the mssn tag, I did this because there are a lot of missing variables in this dataset, and since I am merging various columns together, information from the others will be sufficient.
  df = df.strip()
  return df


In [None]:
sub_data["drug_information"] = sub_data["drug_information"].apply(pre_proceess_document)
sub_data["medical_condition"] = sub_data["medical_condition"].apply(pre_proceess_document)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data["drug_information"] = sub_data["drug_information"].apply(pre_proceess_document)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data["medical_condition"] = sub_data["medical_condition"].apply(pre_proceess_document)


In [None]:
# REMOVE STOPWORDS
sub_data["drug_information"] = sub_data["drug_information"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_lst)]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data["drug_information"] = sub_data["drug_information"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_lst)]))


In [None]:
sub_data.head()

Unnamed: 0,drug_information,medical_condition,drug_name
0,doxycycline miscellaneous antimalarials tetrac...,acne,doxycycline
1,spironolactone aldosterone receptor antagonist...,acne,spironolactone
2,minocycline tetracyclines dynacin minocin mino...,acne,minocycline
3,accutane isotretinoin oral miscellaneous antin...,acne,Accutane
4,clindamycin clindamycin topical topical acne a...,acne,clindamycin


In [None]:
sub_data['drug_information'][0]

'doxycycline miscellaneous antimalarials tetracyclines acticlate adoxa ck adoxa pak adoxa tt alodox avidoxy doryx mondoxyne nl monodox morgidox okebo oracea oraxyl periostat targadox vibramycin calcium vibramycin hyclate vibramycin monohydrate vibra tabs acne amoxicillin prednisone albuterol ciprofloxacin azithromycin cephalexin metronidazole metronidazole topical clindamycin topical clindamycin'

## PHASE 2

###### In this phase, I made use of the medical_condition column as a target/label. This format is needed to fine-tune the model, making it more domain specific. Based on my analysis on the data, the drugs can be segemented into 13 sections, where each sect belongs to a medical conditon the drug is designed to cure.

##### RENAME SPECIFIC VALUES IN THE MEDICAL CONDITION COLUMN

In [None]:
med_conditions = list(sub_data["medical_condition"].unique())

In [None]:
print(med_conditions)

['acne', 'adhd', 'aids hiv', 'allergies', 'alzheimer s', 'angina', 'anxiety', 'asthma', 'bipolar disorder', 'bronchitis', 'cancer', 'cholesterol', 'colds   flu']


In [None]:
print(len(med_conditions))

13


In [None]:
rename_conditions = {'aids hiv':'aids','alzheimer s':'alzheimer','bipolar disorder': 'bipolar','colds   flu':'cold' }

In [None]:
old = list(rename_conditions.keys())
new = list(rename_conditions.values())

In [None]:
# this is done for easy interpretation 
sub_data["medical_condition"] = sub_data["medical_condition"].replace(old, new)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data["medical_condition"] = sub_data["medical_condition"].replace(old, new)


In [None]:
print(list(sub_data["medical_condition"].unique()))

['acne', 'adhd', 'aids', 'allergies', 'alzheimer', 'angina', 'anxiety', 'asthma', 'bipolar', 'bronchitis', 'cancer', 'cholesterol', 'cold']


##### RENAME THE MEDICAL CONDITION COLUMN NAME TO LABEL



In [None]:
sub_data.rename(columns ={"medical_condition":"label"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data.rename(columns ={"medical_condition":"label"}, inplace=True)


In [None]:
sub_data.head()

Unnamed: 0,drug_information,label,drug_name
0,doxycycline miscellaneous antimalarials tetrac...,acne,doxycycline
1,spironolactone aldosterone receptor antagonist...,acne,spironolactone
2,minocycline tetracyclines dynacin minocin mino...,acne,minocycline
3,accutane isotretinoin oral miscellaneous antin...,acne,Accutane
4,clindamycin clindamycin topical topical acne a...,acne,clindamycin


##### CREATE A SEPERATE DATAFRAME RELATING TO DRUG INFORMATION COLUMN USING THE DRUG NAME AS A FOREIGN KEY

In [None]:
prod_feature_store = sub_data[["drug_information","drug_name"]]

In [None]:
prod_feature_store.head()

Unnamed: 0,drug_information,drug_name
0,doxycycline miscellaneous antimalarials tetrac...,doxycycline
1,spironolactone aldosterone receptor antagonist...,spironolactone
2,minocycline tetracyclines dynacin minocin mino...,minocycline
3,accutane isotretinoin oral miscellaneous antin...,Accutane
4,clindamycin clindamycin topical topical acne a...,clindamycin


##### ADDING MORE INFORMATION FOR THE FINE-TUNING PROCESS

##### THE MEDICAL CONDITION DESCRIPTION CONTAINS INFORMATION ABOUT THE MEDICAL CONDITION, WHICH WOULD ALSO BE USED AS A FEATURE TO PROVIDE MORE INFORMATION WHEN FINE-TUNING THE EMBEDDING LAYER.

In [None]:
desc_data = data[["medical_condition", "medical_condition_description"]]

In [None]:
desc_data.shape

(999, 2)

In [None]:
desc_data.head()

Unnamed: 0,medical_condition,medical_condition_description
0,Acne,Acne Other names: Acne Vulgaris; Blackheads; B...
1,Acne,Acne Other names: Acne Vulgaris; Blackheads; B...
2,Acne,Acne Other names: Acne Vulgaris; Blackheads; B...
3,Acne,Acne Other names: Acne Vulgaris; Blackheads; B...
4,Acne,Acne Other names: Acne Vulgaris; Blackheads; B...


In [None]:
# DROP DUPLICATES
desc_data.duplicated().sum()

986

In [None]:
desc_data.drop_duplicates(keep='last',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  desc_data.drop_duplicates(keep='last',inplace=True)


In [None]:
desc_data

Unnamed: 0,medical_condition,medical_condition_description
237,Acne,Acne Other names: Acne Vulgaris; Blackheads; B...
292,ADHD,ADHD (Attention Deficit Hyperactivity Disorder...
401,AIDS/HIV,HIV Infection Other names: Acquired Immune Def...
415,Allergies,Allergies Other names: Allergy; Dust Mite Alle...
442,Alzheimer's,Alzheimer's Disease Other names: Presenile Dem...
513,Angina,Angina Other names: Accelerating Angina; Angin...
558,Anxiety,Anxiety Other names: Anxiety States; Nerves; N...
603,Asthma,Asthma Other names: Bronchial Asthma; Exercise...
650,Bipolar Disorder,Bipolar Disorder Other names: Bipolar Affectiv...
721,Bronchitis,Bronchitis Bronchitis is a type of infection t...


In [None]:
# RESET INDEX
desc_data.reset_index(inplace=True)
desc_data.drop("index", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  desc_data.drop("index", axis=1, inplace=True)


In [None]:
# CLEAN UP MEDICAL CONDITION COLUMN
desc_data["medical_condition"] = desc_data["medical_condition"].apply(pre_proceess_document)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  desc_data["medical_condition"] = desc_data["medical_condition"].apply(pre_proceess_document)


In [None]:
# CLEAN UP MEDICAL CONDITION DESCRIPTION COLUMN
desc_data["medical_condition_description"] = desc_data["medical_condition_description"].apply(pre_proceess_document)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  desc_data["medical_condition_description"] = desc_data["medical_condition_description"].apply(pre_proceess_document)


In [None]:
# UPDATE MEDICAL CONDITION COLUMN VARIABLES
desc_data["medical_condition"] = desc_data["medical_condition"].replace(old, new)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  desc_data["medical_condition"] = desc_data["medical_condition"].replace(old, new)


In [None]:
# CHANGE MEDICAL CONDITION COLUMN NAME TO LABEL
desc_data.rename(columns ={"medical_condition":"label"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  desc_data.rename(columns ={"medical_condition":"label"}, inplace=True)


In [None]:
# GET THE FEATURES FROM THE SUB DATA DATAFRAME THE WILL BE USED TO FINE TUNE THE EMBEDDING LAYER
fine_tune = sub_data[["drug_information", "label"]]

In [None]:
fine_tune.head()

Unnamed: 0,drug_information,label
0,doxycycline miscellaneous antimalarials tetrac...,acne
1,spironolactone aldosterone receptor antagonist...,acne
2,minocycline tetracyclines dynacin minocin mino...,acne
3,accutane isotretinoin oral miscellaneous antin...,acne
4,clindamycin clindamycin topical topical acne a...,acne


In [None]:
fine_tune["drug_information"][0]

'doxycycline miscellaneous antimalarials tetracyclines acticlate adoxa ck adoxa pak adoxa tt alodox avidoxy doryx mondoxyne nl monodox morgidox okebo oracea oraxyl periostat targadox vibramycin calcium vibramycin hyclate vibramycin monohydrate vibra tabs acne amoxicillin prednisone albuterol ciprofloxacin azithromycin cephalexin metronidazole metronidazole topical clindamycin topical clindamycin'

In [None]:
fine_tune["label"][0]

'acne'

In [None]:
desc_data.head(3)

Unnamed: 0,label,medical_condition_description
0,acne,acne other names acne vulgaris blackheads b...
1,adhd,adhd attention deficit hyperactivity disorder...
2,aids,hiv infection other names acquired immune def...


In [None]:
# RENAME THE COLUMN NAME OF THE DESC_DATA TO RESEMBLE THAT OF THE FINE_TUNE DATAFRAME, THIS IS DONE TO ENBALE EASY CONCATENATION OF THE FRAMES
desc_data.rename(columns ={"medical_condition_description":"drug_information"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  desc_data.rename(columns ={"medical_condition_description":"drug_information"}, inplace=True)


In [None]:
# CONCATENATE THE TWO DATAFRAMES TOGETHER
complete_data = [fine_tune, desc_data]

In [None]:
complete_finetuned_data = pd.concat(complete_data)

In [None]:
complete_finetuned_data.head()

Unnamed: 0,drug_information,label
0,doxycycline miscellaneous antimalarials tetrac...,acne
1,spironolactone aldosterone receptor antagonist...,acne
2,minocycline tetracyclines dynacin minocin mino...,acne
3,accutane isotretinoin oral miscellaneous antin...,acne
4,clindamycin clindamycin topical topical acne a...,acne


In [None]:
complete_finetuned_data.shape

(1012, 2)

##### SPLIT DATA INTO TRAIN AND VALIDATION

In [None]:
from sklearn.model_selection import train_test_split
train, val = train_test_split(complete_finetuned_data, test_size=0.02, random_state=42)

In [None]:
train.shape

(991, 2)

In [None]:
val.shape

(21, 2)

In [None]:
val.head()

Unnamed: 0,drug_information,label
631,ziprasidone ziprasidone oral injection atypica...,bipolar
634,fluoxetine olanzapine fluoxetine olanzapine ps...,bipolar
963,chlorpheniramine codeine pseudoephedrine chlor...,cold
625,aripiprazole aripiprazole oral atypical antips...,bipolar
365,aptivus tipranavir protease inhibitors aids hiv,aids


In [None]:
train.to_json("/content/drive/MyDrive/Lights-on-heights/data/labeled_data_train.json", orient="records")
val.to_json("/content/drive/MyDrive/Lights-on-heights/data/labeled_data_val.json", orient="records")

In [None]:
complete_finetuned_data.to_json("/content/drive/MyDrive/Lights-on-heights/data/finetune_features.json", orient="records")

In [None]:
sub_data.to_json("/content/drive/MyDrive/Lights-on-heights/data/feature_store.json")

# SAVE TO PARQUET FORMAT - DEPLOYMENT

###### THE "mild_cleaning" FUNCTION BELOW IS USED TO LIGHTLY PRE-PROCESS CERTAIN COLUMNS IN THE MAIN DATAFRAME. THE RESEASON FOR NOT HEAVILY PRE-PROCESSING THE MAIN DATAFRAME IS BECAUSE, VALUES FROM THE DATAFRAME WILL BE RETURNED  BACK TO THE USER WHEN THEY REQUEST FOR SPECFIC INFORMATION, THESE VALUES NEED TO BE HUMAN READABLE.

In [1]:
def mild_cleaning(strr):
  strr = strr.replace("/",",")
  strr = strr.replace("|",",")
  list_strr = strr.split(",")
  list_strr = [text.strip() for text in list_strr]
  strr = ", ".join(list_strr)
  strr = re.sub(' +', ' ', strr) # remove extra whitespaces
  return strr

In [None]:
data.head(3)

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity(%),rx_otc,pregnancy_category,csa,alcohol,related_drugs,medical_condition_description,rating,no_of_reviews,drug_link,medical_condition_url,drug_information
0,doxycycline,Acne,"(hives, difficult breathing, swelling in your ...",doxycycline,"Miscellaneous antimalarials, Tetracyclines","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",87.0,Rx,D,N,X,"amoxicillin, prednisone, albuterol, ciprofloxa...",Acne Other names: Acne Vulgaris; Blackheads; B...,6.8,760.0,https://www.drugs.com/doxycycline.html,https://www.drugs.com/condition/acne.html,"doxycycline, Miscellaneous antimalarials, Tetr..."
1,spironolactone,Acne,hives ; difficulty breathing; swelling of your...,spironolactone,"Aldosterone receptor antagonists, Potassium-sp...","Aldactone, CaroSpir",82.0,Rx,C,N,X,"amlodipine, lisinopril, losartan, metoprolol, ...",Acne Other names: Acne Vulgaris; Blackheads; B...,7.2,449.0,https://www.drugs.com/spironolactone.html,https://www.drugs.com/condition/acne.html,"spironolactone, Aldosterone receptor antagonis..."
2,minocycline,Acne,"skin rash, fever, swollen glands, flu-like sym...",minocycline,Tetracyclines,"Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",48.0,Rx,D,N,MSSN,"amoxicillin, prednisone, doxycycline, ciproflo...",Acne Other names: Acne Vulgaris; Blackheads; B...,5.7,482.0,https://www.drugs.com/minocycline.html,https://www.drugs.com/condition/acne.html,"minocycline, Tetracyclines, Dynacin, Minocin, ..."


In [None]:
data.drop("drug_information", axis=1, inplace=True)

In [None]:
data.head(3)

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity(%),rx_otc,pregnancy_category,csa,alcohol,related_drugs,medical_condition_description,rating,no_of_reviews,drug_link,medical_condition_url
0,doxycycline,Acne,"(hives, difficult breathing, swelling in your ...",doxycycline,"Miscellaneous antimalarials, Tetracyclines","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",87.0,Rx,D,N,X,"amoxicillin, prednisone, albuterol, ciprofloxa...",Acne Other names: Acne Vulgaris; Blackheads; B...,6.8,760.0,https://www.drugs.com/doxycycline.html,https://www.drugs.com/condition/acne.html
1,spironolactone,Acne,hives ; difficulty breathing; swelling of your...,spironolactone,"Aldosterone receptor antagonists, Potassium-sp...","Aldactone, CaroSpir",82.0,Rx,C,N,X,"amlodipine, lisinopril, losartan, metoprolol, ...",Acne Other names: Acne Vulgaris; Blackheads; B...,7.2,449.0,https://www.drugs.com/spironolactone.html,https://www.drugs.com/condition/acne.html
2,minocycline,Acne,"skin rash, fever, swollen glands, flu-like sym...",minocycline,Tetracyclines,"Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",48.0,Rx,D,N,MSSN,"amoxicillin, prednisone, doxycycline, ciproflo...",Acne Other names: Acne Vulgaris; Blackheads; B...,5.7,482.0,https://www.drugs.com/minocycline.html,https://www.drugs.com/condition/acne.html


#### RECALL THAT THE DRUG NAME AND  RELATED DRUGS COLUMN SERVE AS FOREIGN KEYS BETWEEN DATAFRAMES, HENCE IF THESE COLUMNS ARE MILDLY CLEANED, IT MUST BE DONE ACROSS DATAFRAMES FOR CONSISTENCY.

In [None]:
# CLEAN UP THE DRUG NAME, GENERIC NAME, DRUG CLASSES, BRAND NAMES, AND  RELATED DRUGS COLUMNS
data["drug_name"] = data["drug_name"].apply(mild_cleaning)
data["generic_name"] = data["generic_name"].apply(mild_cleaning)
data["drug_classes"] = data["drug_classes"].apply(mild_cleaning)
data["brand_names"] = data["brand_names"].apply(mild_cleaning)
data["related_drugs"] = data["related_drugs"].apply(mild_cleaning)

In [None]:
data.head()

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity(%),rx_otc,pregnancy_category,csa,alcohol,related_drugs,medical_condition_description,rating,no_of_reviews,drug_link,medical_condition_url
0,doxycycline,Acne,"(hives, difficult breathing, swelling in your ...",doxycycline,"Miscellaneous antimalarials, Tetracyclines","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",87.0,Rx,D,N,X,"amoxicillin, prednisone, albuterol, ciprofloxa...",Acne Other names: Acne Vulgaris; Blackheads; B...,6.8,760.0,https://www.drugs.com/doxycycline.html,https://www.drugs.com/condition/acne.html
1,spironolactone,Acne,hives ; difficulty breathing; swelling of your...,spironolactone,"Aldosterone receptor antagonists, Potassium-sp...","Aldactone, CaroSpir",82.0,Rx,C,N,X,"amlodipine, lisinopril, losartan, metoprolol, ...",Acne Other names: Acne Vulgaris; Blackheads; B...,7.2,449.0,https://www.drugs.com/spironolactone.html,https://www.drugs.com/condition/acne.html
2,minocycline,Acne,"skin rash, fever, swollen glands, flu-like sym...",minocycline,Tetracyclines,"Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",48.0,Rx,D,N,MSSN,"amoxicillin, prednisone, doxycycline, ciproflo...",Acne Other names: Acne Vulgaris; Blackheads; B...,5.7,482.0,https://www.drugs.com/minocycline.html,https://www.drugs.com/condition/acne.html
3,Accutane,Acne,problems with your vision or hearing; muscle o...,isotretinoin (oral),"Miscellaneous antineoplastics, Miscellaneous u...",MSSN,41.0,Rx,X,N,X,"doxycycline, clindamycin topical, erythromycin...",Acne Other names: Acne Vulgaris; Blackheads; B...,7.9,623.0,https://www.drugs.com/accutane.html,https://www.drugs.com/condition/acne.html
4,clindamycin,Acne,hives ; difficult breathing; swelling of your ...,clindamycin topical,"Topical acne agents, Vaginal anti-infectives","Cleocin T, Clindacin ETZ, Clindacin P, Clindag...",39.0,Rx,B,N,MSSN,"doxycycline, metronidazole, metronidazole topi...",Acne Other names: Acne Vulgaris; Blackheads; B...,7.4,146.0,https://www.drugs.com/mtm/clindamycin-topical....,https://www.drugs.com/condition/acne.html


In [None]:
data.to_parquet('/content/drive/MyDrive/Lights-on-heights/data/db.parquet')

In [None]:
related_drugs_df.head()

Unnamed: 0,related_drugs,related_drugs_url
41,isotretinoin,https://www.drugs.com/mtm/isotretinoin.html
53,Lasix,https://www.drugs.com/lasix.html
54,bumetanide,https://www.drugs.com/mtm/bumetanide-oral-inje...
182,erythromycin,https://www.drugs.com/erythromycin.html
184,ofloxacin,https://www.drugs.com/mtm/ofloxacin.html


In [None]:
related_drugs_df["related_drugs"] = related_drugs_df["related_drugs"].apply(mild_cleaning)

In [None]:
related_drugs_df.head()

Unnamed: 0,related_drugs,related_drugs_url
41,isotretinoin,https://www.drugs.com/mtm/isotretinoin.html
53,Lasix,https://www.drugs.com/lasix.html
54,bumetanide,https://www.drugs.com/mtm/bumetanide-oral-inje...
182,erythromycin,https://www.drugs.com/erythromycin.html
184,ofloxacin,https://www.drugs.com/mtm/ofloxacin.html


In [None]:
related_drugs_df.to_parquet('/content/drive/MyDrive/Lights-on-heights/data/related_db.parquet')

In [None]:
prod_feature_store.head(3)

Unnamed: 0,drug_information,drug_name
0,doxycycline miscellaneous antimalarials tetrac...,doxycycline
1,spironolactone aldosterone receptor antagonist...,spironolactone
2,minocycline tetracyclines dynacin minocin mino...,minocycline


In [None]:
prod_feature_store["drug_name"] = prod_feature_store["drug_name"].apply(mild_cleaning)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prod_feature_store["drug_name"] = prod_feature_store["drug_name"].apply(mild_cleaning)


In [None]:
prod_feature_store.to_parquet('/content/drive/MyDrive/Lights-on-heights/data/prod_feature_db.parquet')

In [None]:
sub_data.head()

Unnamed: 0,drug_information,label,drug_name
0,doxycycline miscellaneous antimalarials tetrac...,acne,doxycycline
1,spironolactone aldosterone receptor antagonist...,acne,spironolactone
2,minocycline tetracyclines dynacin minocin mino...,acne,minocycline
3,accutane isotretinoin oral miscellaneous antin...,acne,Accutane
4,clindamycin clindamycin topical topical acne a...,acne,clindamycin


In [None]:
sub_data["drug_name"] = sub_data["drug_name"].apply(mild_cleaning)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data["drug_name"] = sub_data["drug_name"].apply(mild_cleaning)


In [None]:
sub_data.to_csv("/content/drive/MyDrive/Lights-on-heights/data/feature_store.csv")