In [2]:
#!pip install nltk

In [3]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import os, requests, sys

import nltk as nltk
import nltk.corpus  
from nltk.text import Text
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [26]:
%%time

df = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
df.shape

CPU times: user 11.3 s, sys: 6.55 s, total: 17.9 s
Wall time: 18.3 s


(200332, 5)

In [27]:
df.head()

Unnamed: 0,url,date,language,title,text
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...
1,http://newsparliament.com/2020/02/27/children-...,2020-02-27,en,Children With Autism Saw Their Learning and So...,\nChildren With Autism Saw Their Learning and ...
2,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce..."
3,http://www.homeoffice.consumerelectronicsnet.c...,2021-03-10,en,Strategy Analytics: 71% of Smartphones Sold Gl...,\n\nStrategy Analytics: 71% of Smartphones Sol...
4,http://www.itbusinessnet.com/2020/10/olympus-t...,2020-10-20,en,Olympus to Support Endoscopic AI Diagnosis Edu...,\n\nOlympus to Support Endoscopic AI Diagnosis...


In [28]:
df['language'].value_counts()

en    200332
Name: language, dtype: int64

In [29]:
df.drop(columns = ['language'], inplace = True)

### Check duplicate rows

In [31]:
df[df.duplicated(subset = ['title', 'text'], keep = False)].sort_values(by = 'title')

Unnamed: 0,url,date,title,text
5528,https://www.firstmonday.org/ojs/index.php/fm/a...,2021-01-20,\t\tChief information officers’ perceptions a...,\n\n\t\tChief information officers’ perception...
21392,https://firstmonday.org/ojs/index.php/fm/artic...,2021-01-16,\t\tChief information officers’ perceptions a...,\n\n\t\tChief information officers’ perception...
148412,https://www.washingtonian.com/2020/03/27/jason...,2020-03-28,Jason Reynolds Is the Bard of Black Y...,\n Jason Reynolds Is the Bard of Black ...
169488,https://www.washingtonian.com/2020/03/27/jason...,2020-03-27,Jason Reynolds Is the Bard of Black Y...,\n Jason Reynolds Is the Bard of Black ...
163013,https://www.archdaily.com/936533/6-visions-of-...,2020-03-30,6 Visions of How Artificial Intelligence will...,\n\n6 Visions of How Artificial Intelligence w...
...,...,...,...,...
112876,https://www.nme.com/news/gaming-news/escape-fr...,2022-05-11,‘Escape From Tarkov’ patch updates AI behaviou...,\n\n‘Escape From Tarkov’ patch updates AI beha...
155434,https://www.thedailybeast.com/the-andy-warhol-...,2022-02-23,‘The Andy Warhol Diaries’ Trailer: Ryan Murphy...,\n‘The Andy Warhol Diaries’ Trailer: Ryan Murp...
111969,https://www.thedailybeast.com/the-andy-warhol-...,2022-02-23,‘The Andy Warhol Diaries’ Trailer: Ryan Murphy...,\n‘The Andy Warhol Diaries’ Trailer: Ryan Murp...
14425,https://www.afaqs.com/people-spotting/advertis...,2021-05-14,“49 per cent of the surveyed pool have a relia...,\n\n“49 per cent of the surveyed pool have a r...


In [32]:
df.drop_duplicates(subset = ['title', 'text'], inplace = True)

In [33]:
df.shape

(198564, 4)

### Extract the main text body

In [44]:
def text_body(text):
    
    # split the paragraphs into lists of sentences 
    paragraphs = text.split('\n')
    
    # discard sentences after these key words, paragraphs ahead of these words can be viewed as the main text body
    exclude = ['Articles:', 'You May Also Like', 'Related Stories', 'Related Articles', 
               'Contact Us:', 'Exclusive:', 'More Stories', 'About Us:', 'PRESS RELEASE:',
               'Top 10 News', 'More Articles', 'Terms of Use', 'Privacy Policy']
    idx = 999999999

    for term in exclude:

        for index, sentence in enumerate(paragraphs):
            if term in sentence and index < idx:
                idx = index
    
    try:
        paragraphs = paragraphs[:idx]
    except:
        pass 
    
    # discard sentences that have less than 15 words
    sent = [sentence for sentence in paragraphs if len(sentence.strip().split(' ')) > 15]
    
    res = '\n'.join(sent)
    
    # remove \n and \t
    res = res.replace('\n', ' ').replace('\t', '')
    
    ### eliminate urls
    url_pattern1 = 'https://[\w\./]+' #starts with https:
    url_pattern2 = 'www.[\w\./]+' #starts with www.
    res = re.sub(url_pattern1, '', res)
    res = re.sub(url_pattern2, '', res)
    
    ### remove all right reserved
    res = re.sub('All Right Reserved', '', res)
    
    return res

In [45]:
%%time
df['main_text'] = df['text'].apply(text_body)

CPU times: user 1min 38s, sys: 1.01 s, total: 1min 39s
Wall time: 1min 39s


### Discard irrelevant news articles

In [46]:
def relevant(text):
    
    relevant = 0
    
    topics = ['Data Science', ' DS ', 'Machine Learning', 'Artificial Intelligence', ' AI ', 'data science', 'machine learning']
    
    for topic in topics:
        
        if topic in text: relevant = 1
        
    return relevant

In [47]:
df['relevant'] = df['main_text'].apply(relevant)

In [48]:
filtered = df[df['relevant'] == 1].copy().reset_index(drop = True)
filtered.shape

(127739, 6)

### Tokenize the text and clean the tokens

In [49]:
stopwords = set(nltk.corpus.stopwords.words('english'))
lemma = WordNetLemmatizer()

In [50]:
%%time
filtered['Tokens'] = filtered['main_text'].apply(lambda x: nltk.tokenize.word_tokenize(x))

CPU times: user 10min 53s, sys: 5.57 s, total: 10min 59s
Wall time: 10min 59s


In [51]:
def clean_tokens(token_ls):
    
    # Remove single-character tokens (mostly punctuation)
    clean_tokens = [word for word in token_ls if len(word) > 1]

    # Remove numbers
    clean_tokens = [word for word in clean_tokens if not word.isnumeric()]

    # Remove punctuation
    clean_tokens = [word for word in clean_tokens if word.isalpha()]

    # Remove stopwords
    clean_tokens = [word for word in clean_tokens if word not in stopwords]
    
    clean_tokens = [lemma.lemmatize(word) for word in clean_tokens]
    
    return clean_tokens

In [52]:
%%time
filtered['cleaned_tokens'] = filtered['Tokens'].apply(clean_tokens)

CPU times: user 5min 15s, sys: 769 ms, total: 5min 15s
Wall time: 5min 16s


In [53]:
table = pa.Table.from_pandas(filtered)
pq.write_table(table, './filtered_data.parquet')