In [188]:
import numpy as np
import pandas as pd
# Allow up to 500 rows to be shown before pandas truncates a displayed frame.
pd.options.display.max_rows = 500

Drug stems copied from: https://druginfo.nlm.nih.gov/drugportal/jsp/drugportal/DrugNameGenericStems.jsp

In [189]:
# Read the stem table with every column as text, then discard the "Examples" column.
stem = (
    pd.read_excel("drug_stems.xlsx", dtype=str)
    .drop(columns="Examples")
)

Cleaning steps:
- create a new row for each stem where multiple stems are listed (denoted by commas or parentheses in the Stem column)
- remove words in parentheses such as "also"
- create a column that indicates whether the stem is a prefix, a suffix, or a middle part of the name, as denoted by the placement of the hyphen
- replace definitions that reference another stem with that stem's definition
- split each definition by semicolons, parentheses, and commas and keep the first item

## Cleaning Stem column

In [190]:
# Normalise the Stem column so that every row holds exactly one stem.
#
# Every str.replace call states `regex=` explicitly: the default changed from
# True to False in pandas 2.0, so relying on it makes the escaped patterns
# below silently stop matching on newer pandas.

#replacing '(also' with a comma so the alternative stem becomes a comma-separated item
#(applied to the whole column: rows without '(also' are unaffected, and missing
# values pass through instead of raising as the `"also" in x` test would)
stem["Stem"] = stem["Stem"].str.replace(r"\(also", ",", regex=True)
#creating another column with the word without parentheses i.e. (a)tadine -> atadine
stem["Stem2"] = stem["Stem"].str.replace(r"[()]", "", regex=True)
#taking out the parenthesised part from the original column (a)tadine -> tadine
stem["Stem"] = stem["Stem"].str.replace(r"\(.*\)", "", regex=True)
stem["Stem"] = stem["Stem"] + ", " + stem["Stem2"]
#split Stem by comma and create one row per item
stem = stem.assign(Stem=stem["Stem"].str.split(",")).explode("Stem")
stem["Stem"] = stem["Stem"].str.strip()
#remove the ')' left behind where '(also' was replaced (literal match, not a regex)
stem["Stem"] = stem["Stem"].str.replace(")", "", regex=False)
#remove duplicate rows, then the helper column
stem = stem.drop_duplicates().drop("Stem2", axis=1)

  
  after removing the cwd from sys.path.
  
  if sys.path[0] == '':


In [191]:
#creating the position column from where the hyphens sit in the stem:
#  "-x-" -> middle, "x-" -> prefix, anything else -> suffix
# The vectorised .str tests with na=False also cope with empty or missing
# stems, for which indexing x[0] / x[-1] would raise.
starts_with_hyphen = stem["Stem"].str.startswith("-", na=False)
ends_with_hyphen = stem["Stem"].str.endswith("-", na=False)
stem["pos"] = np.select(
    [starts_with_hyphen & ends_with_hyphen, ends_with_hyphen],
    ["middle", "prefix"],
    default="suffix",
)
# Fixed seed so the preview is identical on every run; the size is capped so
# the cell also works when the frame has fewer than 50 rows.
stem[["Stem", "pos"]].sample(min(50, len(stem)), random_state=0)

Unnamed: 0,Stem,pos
313,peg-,prefix
51,-bactam,suffix
239,-locib,suffix
256,-micin,suffix
245,-axo-,middle
147,-estr-,middle
464,-vir-,middle
27,-arot-,middle
386,-spodar,suffix
473,-virsen,suffix


In [192]:
# Remove every hyphen from the stems (plain-text match, not a regular expression).
stem = stem.assign(Stem=stem["Stem"].str.replace("-", "", regex=False))

## Cleaning Definitions

In [193]:
stem["Definition"] = stem["Definition"].str.lower()
stem_def = dict(zip(stem["Stem"], stem["Definition"]))
#replace all definitions that refer to another stem's definition ("see -xyz") by that other definition
# The token following the whole word "see" is the referenced stem; stripping
# its non-word characters (hyphens, brackets, punctuation) yields the lookup
# key. regex=True is explicit because the str.replace default became literal
# matching in pandas 2.0, which would leave the hyphens in and miss every key.
referenced_stem = (
    stem["Definition"]
    .str.extract(r"\bsee\s+(\S+)", expand=False)
    .str.replace(r"\W", "", regex=True)
)
# Rows without a reference, or whose reference is not a key of stem_def, map to NaN.
referenced_def = referenced_stem.map(stem_def)
# np.where works positionally: rows with a resolved reference take the other
# definition, every other row keeps its own text (instead of raising KeyError),
# and no index alignment is needed on the repeated labels left by explode.
stem["Definition"] = np.where(
    referenced_def.notna(), referenced_def, stem["Definition"]
)

  


In [198]:
#split definition by semicolon, parentheses, comma, "analogue", "derivative" and take the first split
# The separators are applied in this order; each pass keeps only the text
# before the first occurrence. "(" is passed as a plain one-character separator
# instead of the non-raw "\(" (an invalid escape sequence in a Python string
# literal), and `.str[0]` lets missing definitions pass through instead of raising.
for separator in (";", "(", ",", "analogue", "derivative"):
    stem["Definition"] = stem["Definition"].str.split(separator).str[0]
# drop the filler words, then trim surrounding whitespace
stem["Definition"] = (
    stem["Definition"]
    .str.replace("substances", "", regex=False)
    .str.replace("type", "", regex=False)
    .str.strip()
)

In [203]:
# Write the cleaned table to CSV, leaving the index out of the file.
stem.to_csv(path_or_buf="drug_stems_cleaned.csv", index=False)