<a href="https://colab.research.google.com/github/jnrkufuor/apollo/blob/Ryan/NER_RH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import Packages

In [1]:
# install flair
!pip install flair

# load basic packages
import pandas as pd
from itertools import combinations, product
import string
import re
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

# load Flair and NLTK
import torch
from flair.data import Sentence
from flair.models import SequenceTagger
from nltk import tokenize

import nltk
# Fetch NLTK's 'punkt' resource; tokenize.sent_tokenize is used further down
# to split article text into sentences.
nltk.download('punkt')

# is cuda available?
# Left as the last expression so the answer is shown as the cell output.
torch.cuda.is_available()

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/f0/3a/1b46a0220d6176b22bcb9336619d1731301bc2c75fa926a9ef953e6e4d58/flair-0.8.0.post1-py3-none-any.whl (284kB)
[K     |█▏                              | 10kB 24.5MB/s eta 0:00:01[K     |██▎                             | 20kB 31.1MB/s eta 0:00:01[K     |███▌                            | 30kB 23.8MB/s eta 0:00:01[K     |████▋                           | 40kB 21.4MB/s eta 0:00:01[K     |█████▊                          | 51kB 22.4MB/s eta 0:00:01[K     |███████                         | 61kB 16.3MB/s eta 0:00:01[K     |████████                        | 71kB 17.1MB/s eta 0:00:01[K     |█████████▏                      | 81kB 17.5MB/s eta 0:00:01[K     |██████████▍                     | 92kB 15.6MB/s eta 0:00:01[K     |███████████▌                    | 102kB 16.8MB/s eta 0:00:01[K     |████████████▋                   | 112kB 16.8MB/s eta 0:00:01[K     |█████████████▉                  | 122kB 16

True

## 2. Load Flair NER model

In [2]:
# Load the Flair NER model registered under the key 'ner'.
# `tagger` is a notebook-level global: get_ner_data() calls tagger.predict()
# on it, so this cell has to run before that function is used.
tagger = SequenceTagger.load('ner')

2021-03-24 15:40:13,174 --------------------------------------------------------------------------------
2021-03-24 15:40:13,178 The model key 'ner' now maps to 'https://huggingface.co/flair/ner-english' on the HuggingFace ModelHub
2021-03-24 15:40:13,182  - The most current version of the model is automatically downloaded from there.
2021-03-24 15:40:13,186  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/ner/en-ner-conll03-v0.4.pt)
2021-03-24 15:40:13,195 --------------------------------------------------------------------------------


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=432176557.0, style=ProgressStyle(descri…


2021-03-24 15:40:22,597 loading file /root/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4


## 3. Load Data

In [3]:
# Mount Google Drive inside the Colab runtime and read the article table from it.
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/My Drive/file_name.csv')
# The drive.mount(...) and pd.read_csv(...) lines may have to be changed
# depending on where your data lives and what the file is called.
# Later cells read the `content`, `title` and `date` columns of this frame.
df

Mounted at /content/drive


Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,12/31/2016,2016,12,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,6/19/2017,2017,6,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,1/6/2017,2017,1,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,4/10/2017,2017,4,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,1/2/2017,2017,1,,"SEOUL, South Korea — North Korea’s leader, ..."
...,...,...,...,...,...,...,...,...,...,...
994,988,18405,Letters From Jacqueline Kennedy to the Man She...,New York Times,Steven Erlanger,2/9/2017,2017,2,,"LONDON — In November 1967, four years after..."
995,989,18406,Why Do Cooks Love the Instant Pot? I Bought On...,New York Times,Melissa Clark,2/1/2017,2017,2,,People have fallen in love with their Instant ...
996,990,18407,Downside of Being a Global Hub: Invasive Speci...,New York Times,Sarah Maslin Nir,2/15/2017,2017,2,,"Some are disarmingly named, like the cutesy Ch..."
997,991,18408,"Where Have You Gone, Angelina Jolie? Celebriti...",New York Times,Ruth La Ferla,2/8/2017,2017,2,,"Was it just a year ago that Katie Holmes, Jenn..."


## 4. Remove pronouns

In [4]:

# Capitalised subject pronouns and the suffixes they may carry (curly apostrophe,
# straight apostrophe, or no apostrophe at all). Every pronoun/suffix pairing is
# listed so that such strings can be filtered out of the entity list later.
pronouns = ['I', 'You', 'It', 'He', 'She', 'We', 'They']
suffixes = ["", "’m", "’re", "’s", "’ve", "’d", "'m", "'re", "'s", "'ve", "'d", "m", "re", "s", "ve", "d"]

# One row per (pronoun, suffix) pair, pronoun-major order.
df_contraptions = pd.DataFrame(list(product(pronouns, suffixes)), columns=['pronoun', 'suffix'])

# Glue the two parts together into the final surface form.
df_contraptions['contraption'] = df_contraptions['pronoun'] + df_contraptions['suffix']

# Plain array of the combined strings.
contraptions = df_contraptions['contraption'].values


## 4. Define NER function

In [5]:
# define function

def get_ner_data(df_row):
    '''
    Extract organisation entities from one article and pair them up.

    Parameters
    ----------
    df_row : object with ``content``, ``title`` and ``date`` attributes
        One article, e.g. a namedtuple produced by ``DataFrame.itertuples``.
        A non-string ``content`` (such as NaN) is treated as empty text.

    Returns
    -------
    (df_ner, df_links) : tuple of pandas.DataFrame
        - df_ner: the unique entities tagged ``ORG`` (columns ``entity`` and
          ``type``), punctuation stripped, sorted by entity.
        - df_links: every unordered pair of those entities (columns ``from``
          and ``to``) plus the article's ``title`` and ``date``.

    Notes
    -----
    Uses the notebook-level globals ``tagger`` (the loaded Flair model) and
    ``df_contraptions`` (pronoun strings to exclude).
    '''
    paragraph = df_row.content

    # A missing article body would otherwise make re.sub raise TypeError;
    # treat it as empty text so the function returns empty frames instead.
    if not isinstance(paragraph, str):
        paragraph = ''

    # remove newlines and odd characters (applied in this order)
    for pattern, replacement in (('\r', ''), ('\n', ' '), ("’s", ''), ("“", ''), ("”", '')):
        paragraph = re.sub(pattern, replacement, paragraph)

    # tokenise sentences
    sentences = [Sentence(sent) for sent in tokenize.sent_tokenize(paragraph)]

    # predict named entities for the whole article in one batched call
    # rather than invoking the tagger once per sentence
    if sentences:
        tagger.predict(sentences)

    # collect entities and types
    entities = []
    types = []
    for sentence in sentences:
        for entity in sentence.to_dict(tag_type='ner')['entities']:
            entities.append(entity['text'])
            # Characters 1-3 of the printed label list are taken as the tag.
            # NOTE(review): this presumably relies on the repr looking like
            # "[ORG (0.99)]" in the installed flair version - confirm before
            # upgrading flair.
            types.append(str(entity['labels'])[1:4])

    # create dataframe of entities (nodes); only organisations are kept
    df_ner = pd.DataFrame(data={'entity': entities, 'type': types})
    df_ner = df_ner[df_ner['type'].isin(['ORG'])]

    # Force a boolean mask: on an empty frame `map` returns a non-boolean
    # Series, which pandas would read as a list of column labels instead of
    # a row filter (dropping every column).
    is_text = df_ner['entity'].map(lambda x: isinstance(x, str)).astype(bool)
    df_ner = df_ner[is_text]
    df_ner = df_ner[~df_ner['entity'].isin(df_contraptions['contraption'].values)]

    # strip punctuation; `assign` returns a new frame, so no write happens
    # on a filtered view
    strip_punctuation = str.maketrans('', '', string.punctuation)
    df_ner = df_ner.assign(entity=df_ner['entity'].map(lambda x: x.translate(strip_punctuation)))
    df_ner = df_ner.drop_duplicates().sort_values('entity')

    # get entity combinations
    combs = list(combinations(df_ner['entity'], 2))

    # create dataframe of relationships (edges)
    df_links = pd.DataFrame(data=combs, columns=['from', 'to'])

    # Adding information to links for data tracking and visualization -
    # use one section OR the other depending on data source.

    # Kaggle datasets: they do not have URLs since they are old now, so the
    # title is used instead.
    df_links['title'] = df_row.title
    df_links['date'] = df_row.date

    # Our own datasets pulled using Goose (they have URLs and datetimes):
    #df_links['url']=df_row.link
    #df_links['date']=(df_row.datetime)[0:10]
    #df_links['time']=(df_row.datetime)[10:25]

    return df_ner, df_links

## 5. Apply function

In [None]:
# Number of articles per media outlet, largest count first.
df_domain = (
    df.groupby('media')['content']
      .count()
      .reset_index(name='count')
      .sort_values('count', ascending=False)
)

# Positional rows 1..20 of that ranking (the row at position 0 is left out).
dfd_small = df_domain.iloc[1:21]

dfd_small

# g2 = sns.barplot(data=dfd_small,
#              x='count',
#              y='Domain',
#              dodge=False,
#              orient='h',
#              hue='count',
#              palette='viridis')

# g2.set_yticks([])
# g2.set_title('Number of articles from each provider')
# g2.set_xlabel('Count')
# g2.set_ylabel('')
# g2.set_xlim(0, max(dfd_small['count'])+150)
# g2.legend_.remove()
# g2.tick_params(labelsize=5)

# for i in dfd_small.index:
#             g2.text(df_domain.iloc[i]['count']+5, i+0.25, df_domain.iloc[i]['Domain'], fontsize=8)

# sns.despine()
# g2.get_figure().savefig('domain_plot.png', dpi=1000)

Unnamed: 0,media,count
233,SoccerNurds,23
325,Yahoo Finance,22
183,NeighborWebSJ,22
251,The Bisouv Network,14
21,Bloomberg,8
236,State Reviewer,8
172,Murphy's Hockey Law,7
253,The Courier,7
250,The Baxter Report,7
200,Patch.com,6


In [6]:
# Run the NER extraction over the first 20 articles and stack the results.
articles = df.iloc[0:20, :]

# Per-article frames are collected in lists and concatenated once at the end;
# growing a DataFrame inside the loop copies all previous rows every time.
ner_frames = []
link_frames = []

for row in tqdm(articles.itertuples(index=False), total=len(articles)):
    try:
        df_ner_temp, df_links_temp = get_ner_data(row)
    except Exception as err:
        # Keep going on a bad article, but say which one failed and why
        # (KeyboardInterrupt is deliberately not caught).
        tqdm.write(f"Skipping article {getattr(row, 'title', '?')!r}: {err!r}")
        continue

    ner_frames.append(df_ner_temp)
    link_frames.append(df_links_temp)

# pd.concat refuses an empty list, so fall back to an empty frame.
df_ner = pd.concat(ner_frames) if ner_frames else pd.DataFrame()
df_links = pd.concat(link_frames) if link_frames else pd.DataFrame()



20it [01:43,  5.19s/it]


In [7]:
# praph=df['content'].iloc[1]
# praph = re.sub('\r', '', praph)
# praph = re.sub('\n', ' ', praph)
# praph = re.sub("’s", '', praph)
# praph = re.sub("“", '', praph)
# praph = re.sub("”", '', praph)

    
# # tokenise sentences
# sentences = tokenize.sent_tokenize(praph)
# sentences = [Sentence(sent) for sent in sentences]

# ## predict named entities
# for sent in sentences:
#     tagger.predict(sent)
    
# # # collect sentence NER's to list of dictionaries
# sent_dicts = [sentence.to_dict(tag_type='ner') for sentence in sentences]



# # # collect entities and types
# entities = []
# types = []
# for sent_dict in sent_dicts:
#     entities.extend([entity['text'] for entity in sent_dict['entities']])
#     types.extend([str(entity['labels'])[1:4] for entity in sent_dict['entities']])

# types


Unnamed: 0,from,to,title,date
0,Congress,Congressional,House Republicans Fret About Winning Their Hea...,12/31/2016
1,Congress,Constitution,House Republicans Fret About Winning Their Hea...,12/31/2016
2,Congress,District of Columbia Circuit,House Republicans Fret About Winning Their Hea...,12/31/2016
3,Congress,House,House Republicans Fret About Winning Their Hea...,12/31/2016
4,Congress,Justice Department,House Republicans Fret About Winning Their Hea...,12/31/2016
...,...,...,...,...
8,Observatory Group,University of Hong Kong,"Modi’s Cash Ban Brings Pain, but Corruption-We...",1/2/2017
9,Observer Research Foundation,University of Hong Kong,"Modi’s Cash Ban Brings Pain, but Corruption-We...",1/2/2017
0,ISIL,ISIS,Suicide Bombing in Baghdad Kills at Least 36 -...,1/3/2017
1,ISIL,United States State Department,Suicide Bombing in Baghdad Kills at Least 36 -...,1/3/2017


## 6. Remove plurals and possessives

In [None]:
def remove_s(entity, entity_series):
  if (entity[-1] == 's') & (entity[:-1] in entity_series):
    return entity[:-1]
  else:
    return entity



In [None]:
# Take the known entity names once, as a set: the membership test inside
# remove_s then costs O(1) per row instead of scanning the whole column.
known_entities = set(df_ner['entity'])

# Normalise both ends of every link, and keep the cleaned name next to the
# original one in the entity table.
df_links['to'] = df_links['to'].map(lambda x: remove_s(x, known_entities))
df_links['from'] = df_links['from'].map(lambda x: remove_s(x, known_entities))
df_ner['entity_cl'] = df_ner['entity'].map(lambda x: remove_s(x, known_entities))



In [None]:
# Sanity check: show any link whose 'to' entity still contains the text 'They'.
# An empty table means no such entity is left in the links.
df_links[df_links['to'].str.contains('They')]

Unnamed: 0,from,to,url,date,time


## 7. Export Data

In [None]:
# Write the entity (node) and link (edge) tables to your Google Drive account.
# Each table gets its own file: writing both to one path would leave only the
# links on disk, and reusing the input file's name would overwrite the
# source data.
df_ner.to_csv('/content/drive/My Drive/ner_entities.csv', index=False)
df_links.to_csv('/content/drive/My Drive/ner_links.csv', index=False)

# Change the two file names above if you want to keep the results of
# several runs side by side.