In [1]:
from operator import index
import os
from pathlib import Path
import pandas as pd
from collections import Counter

import spacy
from spacy import displacy
from spacy.tokens import DocBin
import json
from datetime import datetime
from tqdm import tqdm
import re
import html
import nltk
# WordNet lemmatizer; in the visible cells it is only referenced from commented-out code.
lemma = nltk.wordnet.WordNetLemmatizer()
import hunspell
# Hunspell object built from the system en_US dictionary/affix files (absolute, machine-specific paths).
hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')
from krovetzstemmer import Stemmer as krovetzStemmer
# Krovetz stemmer instance; not called in the visible cells.
krovetz = krovetzStemmer()
# Small English spaCy pipeline; `spacylemma` below runs text through it to obtain lemmas.
spacynlp = spacy.load("en_core_web_sm")
def spacylemma(x):
    """Return `x` with every token replaced by its spaCy lemma, joined by single spaces."""
    return " ".join([token.lemma_ for token in spacynlp(x)])


# Translation table mapping each separator symbol to a space. Built once instead of
# on every call. The backslash is written as "\\" because "\|" is not a valid escape
# sequence; the resulting character set is unchanged (it still contains both "\" and "|").
_SYMBOL_TABLE = {ord(c): " " for c in "!@#$%^&*()[]{};:,./<>?\\|`~-=_+–"}


def symbol_free(x):
    """Lower-case `x`, turn the listed symbols into spaces and collapse whitespace runs."""
    return " ".join(x.lower().translate(_SYMBOL_TABLE).split())

# specify the GPU to use
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Directory prefix for the NER CSV files read and written by later cells.
inputPath = './NER/'
# NOTE(review): outPath is not referenced in the visible cells.
outPath = './NER_AnnoSuite/'
# Location of the trained spaCy pipeline that is loaded as `nlp_output`.
modelPath = "./NER_AnnoSuite/model_expand_all/model-best"

In [2]:
# One colour per entity category; the category is the letter before the "-" in a label.
_category_colors = {
    "D": "lightblue",
    "B": "#b9db57",
    "M": "#57db94",
    "C": "#5784db",
    "T": "#c957db",
    "P": "lightyellow",
    "R": "#62d835",
    "L": "#ff2600",
}

# Every entity label, in the order they are listed for displaCy.
_entity_labels = [
    "D-spi", "D-rac", "D-gen", "D-sxo", "D-soc", "D-age",
    "B-ces", "B-tme", "B-use", "B-int", "B-pcp", "B-hlt", "B-exp", "B-prv", "B-stm",
    "M-mth", "M-sts", "M-dat",
    "C-con", "C-chm", "C-flv", "C-dgn", "C-oth",
    "T-etc", "T-com", "T-oth", "T-mkt",
    "P-reg", "P-bod", "P-lic", "P-mkt", "P-red", "P-trt", "P-lbl",
    "R-rel",
    "L-loc",
]

# label -> colour, derived from the category letter of each label.
colors = {label: _category_colors[label.split("-")[0]] for label in _entity_labels}

# Options passed to displacy.render: which labels to show and how to colour them.
options = {
    "ents": list(_entity_labels),
    "colors": colors,
}

In [3]:
def EntityPredictDisplay(test_txt, display=True):
    """Run the trained NER pipeline on a text and optionally render its entities.

    Parameters
    ----------
    test_txt : str
        Text to annotate.
    display : bool, default True
        When true, the recognised entities are rendered with displaCy
        (style ``"ent"``) using the notebook-level ``options``.

    Returns
    -------
    The document object returned by ``nlp_output(test_txt)``.

    Notes
    -----
    Depends on two notebook-level names: ``nlp_output`` (the loaded pipeline)
    and ``options`` (entity labels and colours). The function previously
    carried its own copy of the label/colour tables, identical to the
    notebook-level ones; the shared ``options`` is used instead so the two
    cannot drift apart.
    """
    # pass the text through the trained pipeline
    doc = nlp_output(test_txt)

    # visualize the identified entities
    if display:
        displacy.render(doc, style="ent", options=options)
    return doc

In [4]:
# Load the trained pipeline from modelPath; EntityPredictDisplay reads `nlp_output` as a global.
nlp_output = spacy.load(modelPath)



In [5]:
# Load the test dataset, replacing missing cells with empty strings, and report its row count.
# NOTE(review): df_test is not referenced again in the visible cells.
df_test = pd.read_csv(inputPath + "testDataset.csv", encoding="utf8").fillna("")
print(len(df_test))

300


In [6]:
# Load the article table into `df`. The commented-out lines below are an alternative
# that read a Google Sheets CSV export instead of the local file.
# sheet_id = "1L30o1enMWRW---ILpZ5_4UzdygQb6tbgFUVobAJ0TI0"
# sheet_name = "Sheet1"
# url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
# df = pd.read_csv(url)
df = pd.read_csv('/home/nbook/Projects/Tobacco/LGBTQ/lgbtq_articles_clean.csv')
# df

In [7]:
# Annotate and display one article, selected by its row label in `df`.
num = 18
record = df.loc[num]

# Only title and abstract are used for this preview (highlights are left out).
txt = record["Title"] + '. ' + record["Abstract"]
# "+" becomes a space, then runs of whitespace are collapsed to single spaces.
txt = " ".join(txt.replace("+", " ").split())

ner = EntityPredictDisplay(txt)

In [48]:
# Render the entity view of `ner` with jupyter=False and keep the result in `a`;
# the next cell writes `a` to an HTML file.
a = displacy.render(ner, style="ent", jupyter=False, options=options)

In [52]:
# Save the rendered markup. The encoding is explicit so that non-ASCII characters
# in the article text are written the same way regardless of the platform default.
with open('test.html', 'w', encoding='utf-8') as f:
    f.write(a)

In [57]:
# Annotate every article, save one coloured HTML rendering per article, and collect
# one output row per recognised entity.
_ent_columns = ['PaperIdx', 'Conference', 'Year', 'Title', 'PaperType', 'Link', 'AuthorNames', 'AuthorAffiliation', 'Text', 'Entity', 'Start', 'End', 'Label']
_meta_columns = ['Conference', 'Year', 'Title', 'PaperType', 'Link', 'AuthorNames', 'AuthorAffiliation']
_html_dir = './NER/LGBTQColoredText/'
os.makedirs(_html_dir, exist_ok=True)  # open() does not create missing directories

# Rows are gathered in a list and turned into a frame once; appending to a
# DataFrame row by row re-allocates on every entity.
_ent_records = []
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    # Missing cells are read as NaN. They are treated as empty text so that the
    # literal string "nan" is never fed to the model or stored in 'Text'.
    title, abstract, highlights = (
        "" if pd.isna(row[col]) else str(row[col])
        for col in ("Title", "Abstract", "Highlights")
    )
    # Abstract and highlights are separated by a space so their boundary words are
    # not glued together; the whitespace collapse below removes any surplus space.
    txt = title + '. ' + abstract + ' ' + highlights
    txt = txt.replace("+", " ")
    txt = " ".join(txt.split())  # to remove extra space (2 or more spaces)
    ner = EntityPredictDisplay(txt, display=False)
    html_out = displacy.render(ner, style="ent", jupyter=False, options=options)
    # File name: row label plus the first five space-separated words of the title,
    # with "/" replaced because it is a path separator.
    html_name = f'[{idx}] {" ".join(title.replace("/", " or ").split(" ")[:5])}'
    with open(f'{_html_dir}{html_name}.html', 'w', encoding='utf-8') as f:
        f.write(html_out)
    for ent in ner.ents:
        _ent_records.append({
            'PaperIdx': idx,
            **{col: row[col] for col in _meta_columns},
            'Text': txt, 'Entity': ent.text, 'Start': ent.start_char, 'End': ent.end_char, 'Label': ent.label_,
        })

# `columns=` fixes the column order and keeps the columns even when no entity was found.
df_ents = pd.DataFrame(_ent_records, columns=_ent_columns)

100%|██████████| 33/33 [00:05<00:00,  5.77it/s]


In [7]:
# Persist the per-entity table (one row per recognised entity) without the index column.
df_ents.to_csv('/home/nbook/Projects/Tobacco/LGBTQ/lgbtq_articles_clean_ents.csv', index=False)

In [34]:
# Export, for every article, the entities found in "<Abstract> <Highlights>".
# The rows are written through pandas so that the entity list, whose string form
# contains commas and quotes, is quoted as a single CSV field. Writing the line by
# hand produced a file whose rows could not be split back into two columns.
# NOTE(review): the first column is headed 'index' but holds the article's Link value;
# the header is kept as it was.
# NOTE(review): missing Abstract/Highlights cells are formatted as the text "nan" here — confirm
# whether the input is guaranteed to be complete.
_ner_rows = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    # display=False: rendering every document inside a batch export only floods the output.
    doc = EntityPredictDisplay(f"{row['Abstract']} {row['Highlights']}", display=False)
    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
    _ner_rows.append({'index': str(row['Link']), 'named_entities': str(ents)})

pd.DataFrame(_ner_rows, columns=['index', 'named_entities']).to_csv(
    inputPath + 'LGBTQDatasetNER.csv', index=False
)

100%|██████████| 68/68 [00:04<00:00, 15.96it/s]


In [7]:
# Build the list of known entities in normalised form (symbols stripped, lower-cased,
# lemmatised), for comparison against the entities predicted by the model.
entities_clean = pd.read_csv('/home/nbook/Projects/Tobacco/TobaccoResearch_Meng/NER/TobaccoNamedEntitiesV3.csv')

# Each cell of the `entities` column holds a comma-separated list of surface forms.
# Empty cells are skipped (they are read as NaN, which has no .split).
_surface_forms = set()
for e in entities_clean.entities.dropna().values:
    _surface_forms.update(symbol_free(i) for i in e.split(','))

# Duplicates are removed before lemmatising so the spaCy pipeline runs once per
# distinct form. Sorting makes the list order reproducible between runs.
lookup_set = sorted({spacylemma(i) for i in _surface_forms})

In [8]:
# Rebind `df` to the saved per-entity table (earlier cells used `df` for the article table).
df = pd.read_csv('/home/nbook/Projects/Tobacco/LGBTQ/lgbtq_articles_clean_ents.csv')
df

Unnamed: 0,PaperIdx,Conference,Year,Title,PaperType,Link,AuthorNames,AuthorAffiliation,Text,Entity,Start,End,Label,NewDiscover,BaseEntity
0,0,Tobacco Control,2015,Disadvantaged youth and smoking in mature toba...,Review Article,http://dx.doi.org/10.1136/tobaccocontrol-2014-...,Marita Hefler; Simon Chapman;,"A27-School of Public Health, University of Syd...",Disadvantaged youth and smoking in mature toba...,youth,14,19,D-age,True,youth
1,0,Tobacco Control,2015,Disadvantaged youth and smoking in mature toba...,Review Article,http://dx.doi.org/10.1136/tobaccocontrol-2014-...,Marita Hefler; Simon Chapman;,"A27-School of Public Health, University of Syd...",Disadvantaged youth and smoking in mature toba...,tobacco control,42,57,P-reg,True,tobacco control
2,0,Tobacco Control,2015,Disadvantaged youth and smoking in mature toba...,Review Article,http://dx.doi.org/10.1136/tobaccocontrol-2014-...,Marita Hefler; Simon Chapman;,"A27-School of Public Health, University of Syd...",Disadvantaged youth and smoking in mature toba...,systematic review,70,87,M-mth,True,systematic review
3,0,Tobacco Control,2015,Disadvantaged youth and smoking in mature toba...,Review Article,http://dx.doi.org/10.1136/tobaccocontrol-2014-...,Marita Hefler; Simon Chapman;,"A27-School of Public Health, University of Syd...",Disadvantaged youth and smoking in mature toba...,synthesis,92,101,M-mth,True,synthesis
4,0,Tobacco Control,2015,Disadvantaged youth and smoking in mature toba...,Review Article,http://dx.doi.org/10.1136/tobaccocontrol-2014-...,Marita Hefler; Simon Chapman;,"A27-School of Public Health, University of Syd...",Disadvantaged youth and smoking in mature toba...,qualitative,105,116,M-mth,True,qualitative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,32,Tobacco Prevention & Cessation,2020,Comparing smoking behavior between female-to-m...,RESEARCH PAPER,http://dx.doi.org/10.18332/tpc/114513,Irene Tamí-Maury; Anushree Sharma; Minxing Che...,"Department of Epidemiology, Human Genetics and...",Comparing smoking behavior between female-to-m...,transgender,69,80,D-gen,True,transgender
1967,32,Tobacco Prevention & Cessation,2020,Comparing smoking behavior between female-to-m...,RESEARCH PAPER,http://dx.doi.org/10.18332/tpc/114513,Irene Tamí-Maury; Anushree Sharma; Minxing Che...,"Department of Epidemiology, Human Genetics and...",Comparing smoking behavior between female-to-m...,adults,81,87,D-age,True,adult
1968,32,Tobacco Prevention & Cessation,2020,Comparing smoking behavior between female-to-m...,RESEARCH PAPER,http://dx.doi.org/10.18332/tpc/114513,Irene Tamí-Maury; Anushree Sharma; Minxing Che...,"Department of Epidemiology, Human Genetics and...",Comparing smoking behavior between female-to-m...,transgender,107,118,D-gen,True,transgender
1969,32,Tobacco Prevention & Cessation,2020,Comparing smoking behavior between female-to-m...,RESEARCH PAPER,http://dx.doi.org/10.18332/tpc/114513,Irene Tamí-Maury; Anushree Sharma; Minxing Che...,"Department of Epidemiology, Human Genetics and...",Comparing smoking behavior between female-to-m...,2015,204,208,B-tme,False,2015


In [9]:
# Normalise every entity once (the spaCy lemmatiser is the expensive step; it was
# previously run twice per row) and derive both columns from that single result.
_base_entities = df['Entity'].map(lambda ent: spacylemma(symbol_free(ent)))

# True when the normalised entity IS present in the lookup list.
# NOTE(review): the column name suggests "newly discovered", i.e. NOT in the lookup list —
# confirm the intended polarity. The existing behaviour is kept unchanged here.
df['NewDiscover'] = _base_entities.isin(set(lookup_set))
df['BaseEntity'] = _base_entities

In [11]:
# Force the label 'D-sxo' for these normalised acronyms and their "non ..." forms,
# replacing whatever label was stored for them.
df.loc[df.BaseEntity.isin(['lgbt', 'lgb', 'glbt', 'lgbti', 'lgbtq', 'non lgb', 'non lgbt', 'non lgbtq']), ['Label']] = 'D-sxo'

In [12]:
# Write the updated table back to the same CSV it was loaded from (overwrites the file).
df.to_csv('/home/nbook/Projects/Tobacco/LGBTQ/lgbtq_articles_clean_ents.csv', index=False)

In [3]:
# Quick check of the normalisation helpers on a plural form (result shown below).
spacylemma(symbol_free('women'))

'woman'

# Evaluation

In [None]:
# Load the cleaned article table for evaluation.
# NOTE(review): test_text is not referenced by the visible cells.
test_text = pd.read_csv('/home/nbook/Projects/Tobacco/LGBTQ/lgbtq_articles_clean.csv')

In [8]:
from spacy.scorer import Scorer
from spacy.training import Example  # was missing: evaluate() raised NameError on first use


def evaluate(ner_model, samples, verbose=False):
    """Score a pipeline's entity predictions against annotated samples.

    Parameters
    ----------
    ner_model
        Callable pipeline; it is applied to each sample's text and passed to ``Scorer``.
    samples
        Iterable of mappings with a ``'text'`` entry and an ``'entities'`` entry.
        The entities are handed to ``Example.from_dict`` unchanged.
    verbose : bool, default False
        When true, print each predicted document next to its annotated entities.

    Returns
    -------
    The value returned by ``Scorer.score`` for the collected examples.
    """
    scorer = Scorer(ner_model)
    examples = []
    for sample in samples:
        pred = ner_model(sample['text'])
        if verbose:
            print(pred, sample['entities'])
        # The predicted document is paired with the annotated entities as the reference.
        examples.append(Example.from_dict(pred, {'entities': sample['entities']}))
    return scorer.score(examples)

# NOTE(review): the required `samples` argument is not passed, so this call raises TypeError
# as written; supply the annotated samples (mappings with 'text' and 'entities').
evaluate(nlp_output, )

# -------------------

In [None]:
# Rebind `df` to the per-paper entity counts and put BaseEntity into the same
# normalised form (symbols stripped, lower-cased, lemmatised) used for lookups below.
df = pd.read_csv('/home/nbook/Projects/Tobacco/LGBTQ/ERCountPerPaper.csv')
df['BaseEntity'] = df['BaseEntity'].map(lambda value: spacylemma(symbol_free(value)))
df

In [32]:
# Load the cleaned entity catalogue and forward-fill blank cells from the row above.
# The filled frame is assigned back: previously the result of the fill was only
# displayed and then discarded, so `df2` itself still contained the blanks.
# `.ffill()` replaces `fillna(method='ffill')`, whose `method` argument is deprecated.
df2 = pd.read_csv('/home/nbook/Projects/Tobacco/LGBTQ/LGBTQEntitiesClean.csv')
df2 = df2.ffill()
df2

Unnamed: 0,MajorCategory,SubCategory,NamedEntityClean,NamedEntity,FrequencyByAbstract
0,"Tobacco Use Behaviors, Prevalence, and Outcomes",Tobacco Cessation,Quitting Behavior,"Abstinence, Cessation, Quit, Smoke Cessation",
1,"Tobacco Use Behaviors, Prevalence, and Outcomes",Tobacco Cessation,Quit Attempt,Quit Attempt,
2,"Tobacco Use Behaviors, Prevalence, and Outcomes",Tobacco Cessation,Quit Rate,Quit Rate,
3,"Tobacco Use Behaviors, Prevalence, and Outcomes",Tobacco Cessation,Relapse,Relapse,
4,"Tobacco Use Behaviors, Prevalence, and Outcomes",Tobacco Cessation,Advice to Quit,Advice to Quit,
...,...,...,...,...,...
176,Tobacco Characteristics,Chemical,Additive,Additive,
177,Tobacco Characteristics,Flavor,Alcohol,Alcohol,
178,Tobacco Characteristics,Flavor,Menthol,Menthol,
179,Tobacco Characteristics,Flavor,Non-menthol,Non-menthol,


In [33]:
# For each catalogue row, add up the 'Frequency' of every alias listed in its
# 'NamedEntity' cell (aliases are separated by ", ").
for idx, row in df2.iterrows():
    entities = row['NamedEntity'].split(', ')
    total = 0
    for e in entities:
        # Aliases are matched on the normalised form stored in df.BaseEntity.
        base_e = spacylemma(symbol_free(e))
        lookup = df.loc[df.BaseEntity == base_e, 'Frequency'].values
        # Exactly one count row must exist per alias; the message names the alias
        # so a failure can be traced without re-running the loop by hand.
        assert len(lookup) == 1, (
            f"expected exactly one Frequency row for {base_e!r} "
            f"(alias {e!r}, df2 row {idx}), found {len(lookup)}"
        )
        total += lookup[0]
    df2.loc[idx, 'FrequencyByAbstract'] = total

In [35]:
# Store the summed frequencies as integers.
df2['FrequencyByAbstract'] = df2['FrequencyByAbstract'].astype(int)

In [37]:
# Write the catalogue back to the same CSV it was loaded from (overwrites the file).
df2.to_csv('/home/nbook/Projects/Tobacco/LGBTQ/LGBTQEntitiesClean.csv', index=False)

# --------------------------

In [47]:
# Translates a 'MajorCategory' value into the short category name that is compared
# against `raw.MainCat` further down.
name_map = dict([
    ('Tobacco Use Behaviors, Prevalence, and Outcomes', 'Behavioral'),
    ('Population Characteristics', 'Demographic'),
    ('Geographic Locations', 'Location'),
    ('Method and Inference', 'Method'),
    ('Tobacco Products', 'Product'),
    ('Relation Statement', 'Relation'),
    ('Tobacco Characteristics', 'Tobacco'),
    ('Policy', 'Policy'),
])

In [70]:
# `clean`: category table; blank cells are forward-filled from the row above.
# `.ffill()` replaces `fillna(method='ffill')`, whose `method` argument is deprecated.
# Its NamedEntity column is left as-is here and normalised per alias in the loop below.
clean = pd.read_csv('/home/nbook/Projects/Tobacco/LGBTQ/ERCountPerPaper.csv')
clean = clean.ffill()

# `raw`: extracted entities; BaseEntity is normalised (symbols stripped, lower-cased,
# lemmatised) so it can be compared with the normalised aliases from `clean`.
raw = pd.read_csv('/home/nbook/Projects/Tobacco/LGBTQ/LGBTQEntities.csv')
raw['BaseEntity'] = raw['BaseEntity'].apply(lambda x: spacylemma(symbol_free(x)))

In [71]:
# For every row of `clean`, count the distinct papers (PaperIdx) in `raw` that
# mention any of the row's aliases.
for idx, row in clean.iterrows():
    entities = [spacylemma(symbol_free(x)) for x in row.NamedEntity.split(', ')]
    raw_main_cat = name_map[row['MajorCategory']]
    papers = set()
    for e in entities:
        same_entity = raw.BaseEntity == e
        # Prefer matches inside the row's own category ...
        hits = raw.loc[same_entity & (raw.MainCat == raw_main_cat), 'PaperIdx'].to_list()
        if not hits:
            # ... and fall back to matches in any category when there are none.
            hits = raw.loc[same_entity, 'PaperIdx'].to_list()
        papers.update(hits)
    clean.loc[idx, 'NewFrequency'] = len(papers)
clean['NewFrequency'] = clean['NewFrequency'].astype(int)

In [72]:
# Save the table, including the NewFrequency column, to a separate file.
clean.to_csv('/home/nbook/Projects/Tobacco/LGBTQ/ERCountPerPaperMergedNE.csv', index=False)

In [65]:
# Pull out the row at position 59 for inspection; the output below shows NewFrequency 0 for it.
row = clean.iloc[59]
row

MajorCategory          Population Characteristics
SubCategory                            Age Groups
NamedEntityClean                        ≥16 Years
NamedEntity                             ≥16 Years
FrequencyByAbstract                             1
NewFrequency                                    0
Name: 59, dtype: object

In [69]:
# Short category name for the inspected row, as compared against raw.MainCat.
raw_main_cat = name_map[row['MajorCategory']]
raw_main_cat

'Demographic'

In [67]:
# Normalised aliases of the inspected row, computed the same way as in the NewFrequency loop.
entities = [spacylemma(symbol_free(x)) for x in row.NamedEntity.split(', ')]
entities

['≥16 year']

In [68]:
# Category-restricted lookup for the first alias; the empty list below shows there is no match in `raw`.
raw.loc[(raw.BaseEntity == entities[0]) & (raw.MainCat == raw_main_cat), 'PaperIdx'].to_list()

[]