In [19]:
import nltk
from nltk import word_tokenize
from nltk import ngrams
from nltk.stem import PorterStemmer as ps, WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import spacy

In [2]:
# Load the BBC news dataset. The path is relative, so the CSV must sit in the
# notebook's working directory.
# NOTE(review): the frame contains an "Unnamed: 0" column (see the info() output),
# presumably an index saved by an earlier to_csv() call — confirm whether it is needed.
BBC_data = pd.read_csv('bbc_news.csv')

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
BBC_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [4]:
# Print the column dtypes and non-null counts to check for missing values.
BBC_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [12]:
# Pull out the headline column as a Series; the rest of the notebook works on it.
titles = BBC_data.loc[:, "title"]
titles.head(n=5)

0                                Can I refuse to work?
1    'Liz Truss the Brief?' World reacts to UK poli...
2    Rationing energy is nothing new for off-grid c...
3    The hunt for superyachts of sanctioned Russian...
4    Platinum Jubilee: 70 years of the Queen in 70 ...
Name: title, dtype: object

In [26]:
# Clean, tokenize and lemmatize the headlines.
#
# Pipeline: lowercase -> drop stop words -> strip punctuation -> tokenize
#           -> drop stop words again -> lemmatize.

sw = stopwords.words('english')
# A set gives constant-time membership tests; `sw` itself is a list and would be
# scanned linearly for every single word.
sw_lookup = frozenset(sw)

titles_lower = BBC_data["title"].str.lower()

# First pass: filter while apostrophes are still attached, so contractions can
# match their entries in the stop-word list.
titles_nostopwords = titles_lower.str.split().apply(
    lambda words: ' '.join(w for w in words if w not in sw_lookup)
)

# Remove everything that is neither a word character nor whitespace.
titles_nopunctuation = titles_nostopwords.str.replace(
    r'[^\w\s]',
    '',
    regex=True
)

# Second pass: a stop word with punctuation glued to it (e.g. "it?") does not
# match during the first pass and only becomes recognisable once the
# punctuation is gone, so filter the tokens once more.
titles_tokenized = titles_nopunctuation.apply(word_tokenize).apply(
    lambda tokens: [tok for tok in tokens if tok not in sw_lookup]
)

lemmatizer = WordNetLemmatizer()

# Keep the original headline next to its cleaned token list.
# Note: this rebinds `titles` from a Series to a one-column DataFrame copy.
titles = BBC_data[["title"]].copy()
titles["Tokens_Clean_Lemmatized"] = titles_tokenized.apply(
    lambda tokens: [lemmatizer.lemmatize(tok) for tok in tokens]
)

# NOTE(review): `title` holds plain strings, so explode() returns the headlines
# unchanged (one row per headline) rather than individual tokens. Confirm
# whether later cells expect tokenized input here.
tokens_raw = titles["title"].explode()

# One row per cleaned token. explode() turns an empty token list into NaN, which
# would break a later ' '.join(...), so those rows are dropped.
tokens_clean = titles["Tokens_Clean_Lemmatized"].explode().dropna()

tokens_raw


0                                  Can I refuse to work?
1      'Liz Truss the Brief?' World reacts to UK poli...
2      Rationing energy is nothing new for off-grid c...
3      The hunt for superyachts of sanctioned Russian...
4      Platinum Jubilee: 70 years of the Queen in 70 ...
                             ...                        
995    Dominic Raab: Third senior civil servant gives...
996                  Highlights: Radacanu beats Uytvanck
997      In pictures: Mountain bikers descend snowy peak
998    Companies must help cut living costs, says new...
999       Beware online car sale scams, consumers warned
Name: title, Length: 1000, dtype: object

In [44]:
# Part-of-speech tag the cleaned tokens and count the most frequent nouns.

nlp = spacy.load('en_core_web_sm')

# All cleaned tokens are joined into one document for tagging; NaN entries
# (produced by explode() for headlines with no tokens left) are skipped because
# str.join() only accepts strings.
spacy_doc = nlp(' '.join(tokens_clean.dropna()))

# Build the frame once from a list of records. Growing it with pd.concat inside
# the loop copies the whole frame on every iteration (quadratic time).
pos_df = pd.DataFrame(
    [{'token': token.text, 'pos_tag': token.pos_} for token in spacy_doc],
    columns=['token', 'pos_tag'],
)

# Frequency of each token tagged as a noun, most frequent first.
pos_df_counts = (
    pos_df
    .query('pos_tag == "NOUN"')
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

pos_df_counts.head(10)

Unnamed: 0,token,pos_tag,count
1348,war,NOUN,34
1384,world,NOUN,30
724,man,NOUN,22
304,day,NOUN,21
1389,year,NOUN,20
410,energy,NOUN,17
976,record,NOUN,17
1378,woman,NOUN,16
405,election,NOUN,16
901,police,NOUN,16
