<a href="https://colab.research.google.com/github/iamab3/Natural-Language-Processing_Text-Pre-processing/blob/main/Parts_of_Speech_and_Named_Entities_in_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [None]:
# Standard library
import re

# Third-party
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on, in the same order as
# before: tokenizer data, WordNet and the stop-word lists.
for nltk_resource in ('punkt_tab', 'wordnet', 'stopwords'):
  nltk.download(nltk_resource)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Loading the Data

In [None]:
# Read the BBC news export into a DataFrame.
# NOTE(review): '/content/...' looks like the Colab working directory, so the
# path presumably only resolves there — confirm before running elsewhere.
bbc_news_path = '/content/bbc_news.csv'
bbc_df = pd.read_csv(bbc_news_path)

In [None]:
# Preview the first rows of the loaded frame to see which columns were read.
bbc_df.head()

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [None]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
bbc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [None]:
# Pull the headline column out into its own single-column DataFrame;
# the cleaning steps below add their results to it as new columns.
Titles = bbc_df['title'].to_frame()
Titles.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


# Cleaning the Data


In [None]:
# Lower-case every headline and keep the result next to the original text
lowercased_titles = Titles['title'].str.lower()
Titles['lowercase'] = lowercased_titles
Titles.head()

Unnamed: 0,title,lowercase
0,Can I refuse to work?,can i refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...


In [None]:
# Removing stop words
en_stopwords = stopwords.words('english')
# Testing membership against the list scans it for every word; a frozenset
# gives the same answers with constant-time lookups. The list is kept under
# its original name.
en_stopwords_lookup = frozenset(en_stopwords)

# Words are split on whitespace only, so punctuation is still attached at this
# stage and each word is compared to the stop-word list exactly as written.
Titles['no_stopwords'] = Titles['lowercase'].apply(
  lambda title: ' '.join(word for word in title.split() if word not in en_stopwords_lookup)
)
Titles.head()

Unnamed: 0,title,lowercase,no_stopwords
0,Can I refuse to work?,can i refuse to work?,refuse work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds


In [None]:
# Removing punctuation: delete every character that is neither a word
# character nor whitespace.
# The transform only needs one column, so it is applied to that column
# directly instead of visiting the whole frame row by row (axis=1).
punctuation_pattern = re.compile(r'[^\w\s]')
Titles['no_stopwords_no_punc'] = Titles['no_stopwords'].apply(
  lambda text: punctuation_pattern.sub('', text)
)
Titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punc
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds


In [None]:
# Tokenizing: raw tokens come from the original headline, clean tokens from
# the lower-cased text with stop words and punctuation removed.
# word_tokenize is applied to the relevant column directly rather than to
# every row of the frame (axis=1).
Titles['raw_tokens'] = Titles['title'].apply(word_tokenize)
Titles['clean_tokens'] = Titles['no_stopwords_no_punc'].apply(word_tokenize)
Titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punc,raw_tokens,clean_tokens
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[Can, I, refuse, to, work, ?]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[Rationing, energy, is, nothing, new, for, off...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...","[platinum, jubilee, 70, years, queen, 70, seco..."


In [None]:
# Lemmatizing every clean token. lemmatize() is called without a
# part-of-speech argument, so the lemmatizer's default is used.
# The token-list column is mapped directly instead of applying over whole
# rows (axis=1).
lemmatizer = WordNetLemmatizer()
Titles['clean_lemmatized'] = Titles['clean_tokens'].apply(
  lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)
Titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punc,raw_tokens,clean_tokens,lemmatized,clean_lemmatized
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[Can, I, refuse, to, work, ?]","[refuse, work]","[refuse, work]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...","[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[Rationing, energy, is, nothing, new, for, off...","[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...","[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...","[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]","[platinum, jubilee, 70, year, queen, 70, second]"


In [None]:
# Creating flat lists of tokens across all titles.
# A nested comprehension visits each token once, whereas sum(lists, [])
# builds a new, ever-longer list for every row it adds.
raw_tokens_list = [tok for row_tokens in Titles['raw_tokens'] for tok in row_tokens]
clean_tokens_list = [tok for row_tokens in Titles['clean_lemmatized'] for tok in row_tokens]

# Parts of Speech (POS) Tagging

In [None]:
# Loading the model
# Loads the spaCy pipeline named 'en_core_web_sm'; presumably the package must
# already be installed in the runtime for this call to succeed.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Tagging the raw tokens: the tokens of all headlines are joined into one
# space-separated string and passed through the pipeline as a single document.
# NOTE(review): headline boundaries are not preserved in that string, so tags
# and entities are predicted with neighbouring headlines as context — consider
# processing each title separately if tagging quality matters.
all_raw_text = ' '.join(raw_tokens_list)
raw_tokens_doc = nlp(all_raw_text)

In [None]:
# Storing token and pos_tag in a pandas dataframe
# Starts as an empty frame with just the two column names; it is populated in
# the following cell.
pos_df = pd.DataFrame(columns=['token','pos_tag'])

In [None]:
# Getting the tokens and associated tags.
# All (text, tag) pairs are collected first and the frame is built once:
# concatenating inside a loop copies the whole frame for every token, and
# re-running such a cell appends the same rows a second time.
pos_records = [(token.text, token.pos_) for token in raw_tokens_doc]
pos_df = pd.DataFrame(pos_records, columns=['token','pos_tag'])

In [None]:
# Token frequency count: one row per (token, pos_tag) pair, most frequent first
pos_pair_sizes = pos_df.groupby(['token', 'pos_tag']).size()
pos_df_counts = pos_pair_sizes.reset_index(name='counts').sort_values(by='counts', ascending=False)
pos_df_counts.head(10)

Unnamed: 0,token,pos_tag,counts
95,:,PUNCT,543
8,',PUNCT,300
2897,in,ADP,187
4082,to,PART,175
3268,of,ADP,172
22,-,PUNCT,166
4043,the,DET,163
1856,and,CCONJ,147
15,'s,PART,143
97,?,PUNCT,130


In [None]:
# Ten most frequent tokens tagged NOUN (the counts frame is already sorted
# by descending frequency)
is_noun = pos_df_counts['pos_tag'] == 'NOUN'
nouns = pos_df_counts[is_noun].head(10)
nouns

Unnamed: 0,token,pos_tag,counts
4267,war,NOUN,35
3552,record,NOUN,15
4356,year,NOUN,14
4316,win,NOUN,14
3416,police,NOUN,14
3061,living,NOUN,13
4009,tax,NOUN,13
3368,people,NOUN,12
2326,day,NOUN,12
4357,years,NOUN,11


In [None]:
# Ten most frequent tokens tagged VERB (the counts frame is already sorted
# by descending frequency)
is_verb = pos_df_counts['pos_tag'] == 'VERB'
verbs = pos_df_counts[is_verb].head(10)
verbs

Unnamed: 0,token,pos_tag,counts
3687,says,VERB,30
9,',VERB,14
2670,found,VERB,13
4317,win,VERB,12
4324,wins,VERB,10
2713,get,VERB,9
2388,dies,VERB,9
3108,make,VERB,8
2982,killed,VERB,8
3686,say,VERB,8


In [None]:
# Ten most frequent tokens tagged ADJ (the counts frame is already sorted
# by descending frequency)
is_adjective = pos_df_counts['pos_tag'] == 'ADJ'
Adjec = pos_df_counts[is_adjective].head(10)
Adjec

Unnamed: 0,token,pos_tag,counts
3244,new,ADJ,28
1400,Russian,ADJ,21
2606,final,ADJ,16
19,-,ADJ,14
2625,first,ADJ,12
3199,more,ADJ,10
2835,high,ADJ,9
1994,big,ADJ,9
3000,last,ADJ,8
3304,other,ADJ,8


# Named Entity Recognition (NER)

In [None]:
# Initialising pandas df to store tokens and ner tag
# Empty frame with just the two column names; it is populated in the
# following cell.
ner_df = pd.DataFrame(columns=['token','ner_tag'])

In [None]:
# One row per named entity in the document: the entity text and its label.
# Entity labels are strings, so no missing-value filter is needed.
# The frame is built in one step rather than concatenated inside a loop,
# which copies the whole frame per entity and duplicates rows when the cell
# is re-run.
ner_records = [(ent.text, ent.label_) for ent in raw_tokens_doc.ents]
ner_df = pd.DataFrame(ner_records, columns=['token','ner_tag'])

In [None]:
# Preview the first extracted entities and their labels.
ner_df.head()

Unnamed: 0,token,ner_tag
0,Liz Truss,PERSON
1,UK,GPE
2,Rationing,PRODUCT
3,superyachts,CARDINAL
4,Russian,NORP


In [None]:
# Frequency for ner tags: one row per (entity text, label) pair, most frequent first
ner_pair_sizes = ner_df.groupby(['token', 'ner_tag']).size()
ner_df_counts = ner_pair_sizes.reset_index(name='counts').sort_values(by='counts', ascending=False)
ner_df_counts

Unnamed: 0,token,ner_tag,counts
965,Ukraine,GPE,47
955,UK,GPE,36
329,England,GPE,32
819,Russian,NORP,20
957,US,GPE,19
...,...,...,...
1102,this year,DATE,1
1101,this weekend,DATE,1
7,'zero,DATE,1
6,'World Cup,EVENT,1


In [None]:
# Ten most frequent entities labelled PERSON (the counts frame is already
# sorted by descending frequency)
is_person = ner_df_counts['ner_tag'] == 'PERSON'
person = ner_df_counts[is_person].head(10)
person

Unnamed: 0,token,ner_tag,counts
257,Covid,PERSON,9
757,Putin,PERSON,8
760,Queen,PERSON,8
563,Liz Truss,PERSON,6
169,Boris Johnson,PERSON,6
788,Rishi Sunak,PERSON,5
515,Jurgen Klopp,PERSON,4
762,Quiz,PERSON,4
581,Macron,PERSON,4
325,Emma Raducanu,PERSON,4
