In [1]:
import collections
import pandas as pd
import numpy as np
import nferx_py.utils as utils
from tqdm import tqdm_notebook as tqdm

In [2]:
#number of sentences per extracted entity
N = 1

### Fetch sentences from Mongo

In [3]:
collection = utils.get_mongo_collection_object('drug_extraction_media_corpus',
                                               'modified_drug_extracted_sentences',
                                               username='krao', 
                                               password= 'QacGDvm9RMxp')
res = []
count = 0 
for n0 in tqdm(collection.find({}, 
                             {'url': 1,'source':1,'drugs':1, 'company_name':1, 'date_generated_on': 1}),  total = 53162):
    if 'drugs' in n0: 
        for n1 in n0['drugs']: 
            if 'extracted_sentences' in n1: 
                for n2 in n1['extracted_sentences']: 
                    count = 0
                    if (n2['source'] == 'website_sentence'): 
                        count+=1
                        if count<=N:
                            res.append({
                                        'drug_token': n1['token'],
                                        'drug': n1['matched_from'],
                                        'bert_classification': n1['bert_classification'],
                                        'sentence': n2['sentence'],
                                        'fda_approved': n1['fda_approved'], 
                                        'date_epoch': int(n0['date_generated_on']['epoch']),
                                        'date': n0['date_generated_on']['human_readable'],
                                        'source': n0['source'] if 'source' in n0 else None, 
                                        'mean': n1['mean'] if 'mean' in n1 else None,
                                        'company_name': n0['company_name'] if 'company_name' in n0 else None,
                                        'url': n0['url'] if 'url' in n0 else None
                                        }) 


HBox(children=(FloatProgress(value=0.0, max=53162.0), HTML(value='')))




In [5]:
df = pd.DataFrame(res)
df.head(5)

Unnamed: 0,drug_token,drug,bert_classification,sentence,fda_approved,date_epoch,date,source,mean,company_name,url
0,clarit,Clarit,drug,\nPractical PAT Implementation and Regulatory ...,0,1597708800,"August 18, 2020",technologynetworks,0.3555,,https://www.technologynetworks.com/drug-discov...
1,diffraction,Diffraction,not_drug,E\nv\ne\nn\nt\n \nI\nn\nf\no\nr\nm\na\nt\ni\no...,0,1597708800,"August 18, 2020",technologynetworks,0.005553,,https://www.technologynetworks.com/genomics/ev...
2,morphogenesis,Morphogenesis,not_drug,biology of cellular interactions euroconferen...,0,1597708800,"August 18, 2020",technologynetworks,0.0,,https://www.technologynetworks.com/genomics/ev...
3,panalytical,Panalytical,not_drug,event information\nmalvern panalytical is hos...,0,1597708800,"August 18, 2020",technologynetworks,0.0,,https://www.technologynetworks.com/drug-discov...
4,isothermal,Isothermal,not_drug,information\nmalvern panalytical is hosting a...,0,1597708800,"August 18, 2020",technologynetworks,0.002629,,https://www.technologynetworks.com/drug-discov...


In [6]:
#filter by date. only look for sentences from after September 15 2020
df = df[df.date_epoch > 1600216444]
#weird chars in sentence
df['sentence'] = [n.replace('\n', '') for n in df.sentence]

In [14]:
#distribution of drug vs not_drug
df[['bert_classification', 'sentence']].groupby('bert_classification').count().reset_index().sort_values(by = 'sentence', ascending = False)

Unnamed: 0,bert_classification,sentence
1,not_drug,387849
0,drug,58573


In [15]:
#distribution of sources
df[['company_name', 'sentence']].groupby('company_name').count().reset_index().sort_values(by = 'sentence', ascending = False)

Unnamed: 0,company_name,sentence
11,biospace,226907
75,statnews,76248
23,fiercebiotech,54842
20,endpts,48285
57,medpagetoday,11442
...,...,...
66,pierre-fabre,1
50,kyowakirin,1
63,otsuka-us,1
62,ono-pharma,1


In [16]:
#sampling code
def drop_level(df):
    '''helper function to handle errors'''
    to_drop = []
    if 'level_0' in df.columns:
        to_drop.append('level_0')
    if 'level_1' in df.columns:
        to_drop.append('level_1')
    return df.drop(to_drop, axis = 1)

def sentences_per_company(df, N):
    variables = ['company_name']

    counts = df[variables+["sentence"]].groupby(by = variables).count().reset_index()
    counts['sentence_count'] = counts.sentence
    counts = counts.drop('sentence', axis = 1)

    df1 = df.merge(counts, on= variables)
    df2 = df1[df1.sentence_count<N]
    df3 = df1[df1.sentence_count>=N]
    if len(df3)!=0:
        df4 = df3.groupby(by = variables, as_index = False).apply(pd.DataFrame.sample, n=N).reset_index()
        df5 = pd.concat([df2, df4]).reset_index().drop(['index', 'sentence_count'], axis = 1)   
    else: 
        df5 = df2.reset_index().drop(['index', 'sentence_count'], axis = 1)  
    return drop_level(df5)
def sentences_per_bert_classification(df, N):
    variables = ['bert_classification']

    counts = df[variables+["sentence"]].groupby(by = variables).count().reset_index()
    counts['sentence_count'] = counts.sentence
    counts = counts.drop('sentence', axis = 1)

    df1 = df.merge(counts, on= variables)
    df2 = df1[df1.sentence_count<N]
    df3 = df1[df1.sentence_count>=N]
    if len(df3)!=0:
        df4 = df3.groupby(by = variables, as_index = False).apply(pd.DataFrame.sample, n=N).reset_index()
        df5 = pd.concat([df2, df4]).reset_index().drop(['index', 'sentence_count'], axis = 1)   
    else: 
        df5 = df2.reset_index().drop(['index', 'sentence_count'], axis = 1)  
    return drop_level(df5)

In [17]:
#record drugs per sentence, to collapse df on sentence
drugs_per_sentence = {} 
for i in tqdm(df.index): 
    drug = df.drug[i]
    sentence = df.sentence[i]
    bert_class = df.bert_classification[i]
    if sentence in drugs_per_sentence: 
        drugs_per_sentence[sentence].append((drug, bert_class))
    else: 
        drugs_per_sentence[sentence] = [(drug, bert_class)]

HBox(children=(FloatProgress(value=0.0, max=446422.0), HTML(value='')))




In [18]:
drugs_per_sentence[list(drugs_per_sentence.keys())[0]]

[('UBX1325', 'drug'), ('UBX1967', 'drug')]

In [19]:
#collapse df on sentence
keep = df[['sentence']].drop_duplicates().index
df1 = df[df.index.isin(keep)]
df1['drugs'] = [drugs_per_sentence[n] for n in df1.sentence]
len(df1)

258488

In [20]:
#sample 10k sentences per company
df2 = sentences_per_company(df1, 10000)
len(df2)

65378

In [21]:
#check drug-not_drug dist again
df2[['bert_classification', 'sentence']].groupby('bert_classification').count().reset_index().sort_values(by = 'sentence', ascending = False)

Unnamed: 0,bert_classification,sentence
1,not_drug,54357
0,drug,11021


In [22]:
#fix ratio at 3:1
df3 = pd.concat([df2[df2.bert_classification == 'drug'].sample(n = 7500), 
                 df2[df2.bert_classification == 'not_drug'].sample(n = 2500)])
len(df3)

10000

In [23]:
df3.to_csv('sentences_dump.csv', index = False)