In [203]:
import nltk as nltk
import nltk.corpus  
from nltk.text import Text
import pandas as pd
import re
import sys

Load data

In [204]:
# Location of the pickled news dataset, relative to the working directory.
path = "Downloads/news_cat.pkl"

# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
# Read the frame, keep the rows whose `language` is 'english', then drop the
# `crawled` and `language` columns.
df = (
    pd.read_pickle(path)
    .loc[lambda frame: frame.language == 'english']
    .drop(columns=['crawled', 'language'])
)
df.head()

Unnamed: 0,text,title
0,by Abhishek K Global Telehandler Market 2023 D...,Global Telehandler Market 2023 Demand by Segme...
1,favorite this post 2014 Caterpillar 314E LCR h...,2014 Caterpillar 314E LCR
2,By: MAX NISEN The Amazon health care threat ha...,"Amazon, Berkshire, JPMorgan health announcemen..."
3,QR Code Link to This Post MONTHLY PUBLIC AUCTI...,2005 Caterpillar CB534D Tandem Vibratory Rolle...
4,QR Code Link to This Post 2007 CATERPILLAR D4G...,2007 CATERPILLAR D4G LGP CAB SCREEN/SWEEPS - O...


In [205]:
# Flatten the frame into one list of cell values, walking row by row and,
# within each row, column by column.
row_count, column_count = df.shape
article = [
    df.iloc[row, column]
    for row in range(row_count)
    for column in range(column_count)
]

# Part A: Basic NER: tagging words (tokens) as PERSON, ORGANIZATION, and GPE

In [206]:
# Run NLTK's named-entity chunker over every item in `article` and collect
# each recognised chunk's text together with its label.
entities, labels = [], []

for text in article:
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    chunk_tree = nltk.ne_chunk(tagged_tokens, binary = False)
    for node in chunk_tree:
        # Only nodes that expose `label` are kept; everything else is skipped.
        if not hasattr(node, 'label'):
            continue
        # Join the first element of each leaf with spaces so that a
        # multi-token entity becomes a single string.
        entities.append(' '.join(leaf[0] for leaf in node))
        labels.append(node.label())

# Pair every entity string with its label.
entities_all = list(zip(entities, labels))

In [208]:
# Build the frame with its column names up front: assigning `.columns` after
# construction raises a length-mismatch ValueError when `entities_all` is empty.
entities_df = pd.DataFrame(entities_all, columns=["Entities", "Labels"])

# NOTE(review): despite the `persons_` prefix, this frame holds the rows
# labelled ORGANIZATION; the name is kept so that other cells keep working.
persons_df = entities_df.loc[entities_df["Labels"].isin(['ORGANIZATION'])]

# Count rows per entity; after count() the "Labels" column holds those counts.
counts_df = persons_df.groupby('Entities').count()
# Most frequently mentioned entities first.
counts_df = counts_df.sort_values(by=['Labels'], ascending = False)
counts_df.head(40)

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Caterpillar,99
Caterpillar Inc.,79
NYSE,63
CAT,55
Cat,36
Company,23
SEC,23
JPMorgan,22
Transportation,20
Exchange Commission,20


Clean up the list
The list produced above requires further manual cleanup. We will make the following assumptions:

1) We will assume that CAT and Cat both refer to Caterpillar Inc.'s stock ticker, CAT. Hence, we will remove these two counts together with the counts for Caterpillar Inc. and Caterpillar.
2) We will remove the counts of "Company", "Transportation", "Securities", "Energy", "Financial Products", "LLC", "Rating", "Bank", etc.
3) We will assume that "NOT" refers to the stock of Noront Resources Ltd. Hence, we keep it in the list.


In [209]:
# Index labels to hide from the ranking shown below.
excluded_entities = [
    'Caterpillar', 'Caterpillar Inc.', 'CAT', 'Cat', 'Company',
    'Transportation', 'Securities', 'Energy', 'Financial Products',
    'LLC', 'Rating', 'Bank', 'CFO Bradley', 'Countries', 'Dealer Mustang',
    'Vision',
]

# True for every row whose index label is NOT in the exclusion list.
keep_mask = ~counts_df.index.isin(excluded_entities)
counts_df.loc[keep_mask].head(20)

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
NYSE,63
SEC,23
JPMorgan,22
Exchange Commission,20
Resource Industries,20
Vista Partners,20
Construction Industries,19
NOT,16
Lincolnian Online,12
Ratings,11


# Part B: Alternative NER, separating by sentences first, then by tokens

In [211]:
# Split every item in `article` into sentences, run NLTK's named-entity
# chunker on each sentence, and keep only the nodes that expose `label`.
labelled_nodes = [
    node
    for text in article
    for sentence in nltk.sent_tokenize(text)
    for node in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)), binary = False)
    if hasattr(node, 'label')
]

# Join the first element of each leaf with spaces so that a multi-token
# entity becomes a single string.
entities = [' '.join(leaf[0] for leaf in node) for node in labelled_nodes]
labels = [node.label() for node in labelled_nodes]

# Pair every entity string with its label.
entities_all_ss = list(zip(entities, labels))

In [213]:
# Build the frame with its column names up front: assigning `.columns` after
# construction raises a length-mismatch ValueError when `entities_all_ss` is empty.
entities_df_ss = pd.DataFrame(entities_all_ss, columns=["Entities", "Labels"])

# NOTE(review): despite the `persons_` prefix, this frame holds the rows
# labelled ORGANIZATION; the name is kept so that other cells keep working.
persons_df_ss = entities_df_ss.loc[entities_df_ss["Labels"].isin(['ORGANIZATION'])]

# Count rows per entity and name the count column "Mentions". The rename is
# chained rather than done in place, so the frame is built in one expression.
counts_df_ss = (
    persons_df_ss
    .groupby('Entities')
    .count()
    .rename(columns={"Labels": "Mentions"})
)
# Show the most frequently mentioned entities (display only; not stored).
counts_df_ss.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Caterpillar Inc.,96
Caterpillar,87
NYSE,63
CAT,55
Cat,36
Company,27
SEC,23
JPMorgan,22
Exchange Commission,20
Transportation,20


This list is very similar to the previous one; hence, we are going to proceed with the final list from Part A.

In addition, we took a look at the index of the counts dataframe and noticed that the NER detection did a reasonable, but not good enough, job. For instance, AMZN, Amazon, and Amazon.com are listed as three different entities, as are Berkshire Hathaway and Berkshire. There are still many cases where NER did not recognize mentions of the same entity as one. Hence, if we wanted a more comprehensive and accurate count, we might consider another NER tool (e.g., Stanford NER) or manually clean up the list.