In [68]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
bbc_data = pd.read_csv("bbc_news.csv") # Read the csv file 

In [10]:
bbc_data.head() # Check out what the data looks like

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [12]:
bbc_data.info() # Some metadata about the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [14]:
 titles = pd.DataFrame(bbc_data["title"])  # Lets work with the titles data, read it into a dataframe using pandas

In [16]:
titles.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


In [None]:
# Clean up the data doing the following:
# Lowercase
# Remove stop words
# Remove punctuations
# Tokenize
# Lemmatizate
# Now check the data again

In [18]:
titles['lowercase'] = titles['title'].str.lower()

In [20]:
en_stopwords = stopwords.words('english')
titles['no_stopwords']=titles['lowercase'].apply(lambda x : ' '.join( word for word in x.split() if word not in en_stopwords))

In [24]:
print(titles['no_stopwords'].head())

0                                         refuse work?
1    'liz truss brief?' world reacts uk political t...
2      rationing energy nothing new off-grid community
3        hunt superyachts sanctioned russian oligarchs
4          platinum jubilee: 70 years queen 70 seconds
Name: no_stopwords, dtype: object


In [30]:
titles['no_stopwords_no_punct']=titles.apply(lambda x: re.sub(r"^\w\s",'', x['no_stopwords']), axis=1)

In [36]:
titles['tokens_raw'] = titles.apply(lambda x : word_tokenize(x['title']), axis=1)
titles['tokens_clean'] = titles.apply( lambda x : word_tokenize(x['no_stopwords_no_punct']), axis=1)

In [48]:
lemmatizer = WordNetLemmatizer()
titles['tokens_clean_lemmatized']= titles['tokens_clean'].apply(lambda tokens : [lemmatizer.lemmatize(token) for  token in tokens])

In [50]:
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,tokens_raw,tokens_clean,tokens_clean_lemmatized
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work?,"[Can, I, refuse, to, work, ?]","[refuse, work, ?]","[refuse, work, ?]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,'liz truss brief?' world reacts uk political t...,"['Liz, Truss, the, Brief, ?, ', World, reacts,...","['liz, truss, brief, ?, ', world, reacts, uk, ...","['liz, truss, brief, ?, ', world, reacts, uk, ..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new off-grid community,"[Rationing, energy, is, nothing, new, for, off...","[rationing, energy, nothing, new, off-grid, co...","[rationing, energy, nothing, new, off-grid, co..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...","[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee: 70 years queen 70 seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...","[platinum, jubilee, :, 70, years, queen, 70, s...","[platinum, jubilee, :, 70, year, queen, 70, se..."


In [56]:
#create list for just our tokens
tokens_raw_list = sum(titles['tokens_raw'], [])
tokens_clean_list = sum(titles['tokens_clean_lemmatized'],[])

# POS Tagging

In [64]:
nlp = spacy.load("en_core_web_sm") # load the model

In [66]:
spacy_doc = nlp(' '.join(tokens_raw_list)) # add the tokens to be processed to the model

In [70]:
pos_df = pd.DataFrame(columns= ['token', 'pos_tag']) # create a data frame with 2 columns

In [90]:
# iterate over every token in spacy doc and conctenate values into the dataframe
for token in spacy_doc:
    pos_df = pd.concat([pos_df, pd.DataFrame.from_records([{'token':token.text, 'pos_tag':token.pos_}])],ignore_index=True)

In [None]:
# Count the number of times a token appears in the document and its associated pos tag

In [126]:
# After the groupby operation is performed to aggregate rows by token and pos-tag, calculate the size and then reset index expands all
# the rows back and takes out the hierarchial structure
pos_df_counts = pos_df.groupby(['token','pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)
pos_df_counts.head(10)

Unnamed: 0,token,pos_tag,counts
95,:,PUNCT,1086
8,',PUNCT,600
2897,in,ADP,374
4082,to,PART,350
3268,of,ADP,344
22,-,PUNCT,332
4043,the,DET,326
1856,and,CCONJ,294
15,'s,PART,286
97,?,PUNCT,260


In [104]:
print(pos_df_counts)

token      pos_tag
:          PUNCT      1086
'          PUNCT       600
in         ADP         374
to         PART        350
of         ADP         344
                      ... 
crumbling  VERB          2
crunch     PROPN         2
Jarrod     PROPN         2
Japanese   ADJ           2
!          PUNCT         2
Length: 4368, dtype: int64


In [None]:
# How to look for most common noun tags

In [130]:
nouns = pos_df_counts[pos_df_counts.pos_tag == "NOUN"][0:10]
nouns

Unnamed: 0,token,pos_tag,counts
4267,war,NOUN,70
3552,record,NOUN,30
3416,police,NOUN,28
4356,year,NOUN,28
4316,win,NOUN,28
3061,living,NOUN,26
4009,tax,NOUN,26
2326,day,NOUN,24
3368,people,NOUN,24
2031,boss,NOUN,22


In [134]:
verbs = pos_df_counts[pos_df_counts.pos_tag=='VERB'][0:10]
verbs

Unnamed: 0,token,pos_tag,counts
3687,says,VERB,60
9,',VERB,28
2670,found,VERB,26
4317,win,VERB,24
4324,wins,VERB,20
2713,get,VERB,18
2388,dies,VERB,18
3990,take,VERB,16
2982,killed,VERB,16
3686,say,VERB,16


In [138]:
adjectives = pos_df_counts[pos_df_counts.pos_tag=='ADJ'][0:10]
adjectives

Unnamed: 0,token,pos_tag,counts
3244,new,ADJ,56
1400,Russian,ADJ,42
2606,final,ADJ,32
19,-,ADJ,28
2625,first,ADJ,24
3199,more,ADJ,20
1994,big,ADJ,18
2835,high,ADJ,18
3000,last,ADJ,16
3304,other,ADJ,16


# NER

In [148]:
ner_df = pd.DataFrame(columns=['token', 'ner_tag'])
for token in spacy_doc.ents:
    if pd.isna(token.label_) is False:
        ner_df = pd.concat([ner_df, pd.DataFrame.from_records([{'token': token.text, 'ner_tag' : token.label_}])], ignore_index=True)
    

In [150]:
ner_df.head()

Unnamed: 0,token,ner_tag
0,Liz Truss,PERSON
1,UK,GPE
2,Rationing,PRODUCT
3,superyachts,CARDINAL
4,Russian,NORP


In [152]:
ner_df_counts = ner_df.groupby(['token', 'ner_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

In [154]:
ner_df_counts.head()

Unnamed: 0,token,ner_tag,counts
965,Ukraine,GPE,47
955,UK,GPE,36
329,England,GPE,32
819,Russian,NORP,20
957,US,GPE,19


In [160]:
people = ner_df_counts[ner_df_counts.ner_tag=='PERSON'][:10]
people

Unnamed: 0,token,ner_tag,counts
257,Covid,PERSON,9
760,Queen,PERSON,8
757,Putin,PERSON,8
169,Boris Johnson,PERSON,6
563,Liz Truss,PERSON,6
788,Rishi Sunak,PERSON,5
581,Macron,PERSON,4
762,Quiz,PERSON,4
515,Jurgen Klopp,PERSON,4
325,Emma Raducanu,PERSON,4
