In [2]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt

#nltk.download('stopwords')
#nltk.download('punkt_tab')

In [3]:
stop_words = set(stopwords.words("english"))                        # load English stopwords from NLTK
stemmer = PorterStemmer()                                           # create a new Porter stemmer

def clean_text(text):
    text = text.lower()                                             # convert to lowercase
    spaces = re.compile(r'\s+')
    text = spaces.sub(' ', text)                                    # substitute all white space characters (single or multiple occurences) with a single space

    emails = re.compile(r'\S+@\S+\.\S+')
    text = emails.sub('_EMAIL_', text)                              # substitute all found email addresses with _EMAIL_
    urls = re.compile(r'http[s]?:\/\/\S+|www\.\S+|\S+\.[a-z]+\/\S+|\w+\.(?:com|net|org)')
    text = urls.sub('_URL_', text)                                  # substitute all found URLs with _URL_
    dates = re.compile(r'''
                       \d{1,4}[-\/]\d{1,2}[-\/]\d{1,4}|
                       \d{1,2}\ (?:jan[a-z]*|feb[a-z]*|mar[a-z]*|apr[a-z]*|may|jun[e]?|jul[y]?|aug[a-z]*|sep[a-z]*|oct[a-z]*|nov[a-z]*|dec[a-z]*)\ \d{,4}|
                       (?:jan[a-z]*|feb[a-z]*|mar[a-z]*|apr[a-z]*|may|jun[e]?|jul[y]?|aug[a-z]*|sep[a-z]*|oct[a-z]*|nov[a-z]*|dec[a-z]*)[,.]?\ ?\d{1,4}(?:th|st|nd|rd)?(?:,\ \d{4})?
                       ''', re.VERBOSE)
    text = dates.sub('_DATE_', text)                                # substitute all found dates with _DATE_
    numbers = re.compile(r'\d+(?:th|st|nd|rd)?')
    text = numbers.sub('_NUM_', text)                               # substitute all remaining numbers with _NUM_
    return text

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

def stem_words(tokens):
    return [stemmer.stem(word) for word in tokens]

In [4]:
sample = pd.read_csv("news_sample.csv", usecols=['domain', 'type', 'url', 'content', 'title']) 
print(sample.info()) 

sample = sample.dropna(subset=['content', 'type'])                                    # drop rows with no content or type (/label)
sample = sample.drop(sample[sample['type'] == 'unknown'].index)                       # drop rows where 'type' is 'unknown'
sample = sample.drop(sample[sample['type'] == 'unreliable'].index)                    # drop rows where 'type' is 'unreliable'
sample = sample.drop_duplicates(subset=['content'])                                   # drop rows with duplicates in the 'content' column

print(sample.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   domain   250 non-null    object
 1   type     238 non-null    object
 2   url      250 non-null    object
 3   content  250 non-null    object
 4   title    250 non-null    object
dtypes: object(5)
memory usage: 9.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 215 entries, 1 to 246
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   domain   215 non-null    object
 1   type     215 non-null    object
 2   url      215 non-null    object
 3   content  215 non-null    object
 4   title    215 non-null    object
dtypes: object(5)
memory usage: 10.1+ KB
None


In [5]:
sample["cleaned_content"] = sample["content"].apply(clean_text)                     # cleaning the text in the content column
sample["tokens"] = sample["cleaned_content"].apply(word_tokenize)                   # tokenizing the text in the content column
sample["tokens_no_stopwords"] = sample["tokens"].apply(remove_stopwords)            # removing stopwords from the tokens 
sample["stemmed_tokens"] = sample["tokens_no_stopwords"].apply(stem_words)          # stemming the tokens 
 
# printing a preview of the raw text and step by step of the preprocessing pipeline
print(sample[["content", "cleaned_content", "tokens", "tokens_no_stopwords", "stemmed_tokens"]].head())

                                             content  \
1  AWAKENING OF 12 STRANDS of DNA – “Reconnecting...   
4  Donald Trump has the unnerving ability to abil...   
6  Could you imagine waking up in the morgue? I f...   
7  Citizen Journalist\n\nby N.Morgan Q has releas...   
8  Usa Dollar Tanks On Mnuchin Statement That He ...   

                                     cleaned_content  \
1  awakening of _NUM_ strands of dna – “reconnect...   
4  donald trump has the unnerving ability to abil...   
6  could you imagine waking up in the morgue? i f...   
7  citizen journalist by n.morgan q has released ...   
8  usa dollar tanks on mnuchin statement that he ...   

                                              tokens  \
1  [awakening, of, _NUM_, strands, of, dna, –, “,...   
4  [donald, trump, has, the, unnerving, ability, ...   
6  [could, you, imagine, waking, up, in, the, mor...   
7  [citizen, journalist, by, n.morgan, q, has, re...   
8  [usa, dollar, tanks, on, mnuchin, statement

In [6]:
size_tokenized = len(set(word for doc in sample["tokens"] for word in doc))
size_wo_stopwords = len(set(word for doc in sample["tokens_no_stopwords"] for word in doc))
size_stemmed = len(set(word for doc in sample["stemmed_tokens"] for word in doc))

stopword_reduction_rate = (size_tokenized - size_wo_stopwords) / size_tokenized * 100
stemmed_reduction_rate = (size_wo_stopwords - size_stemmed) / size_wo_stopwords * 100

print(f"Vocabulary before removing stopwords: {size_tokenized}")
print(f"Vocabulary after removing stopwords: {size_wo_stopwords}")
print(f"Reduction rate after removing stopwords: {stopword_reduction_rate:.2f}%")

print(f"Vocabulary after stemming: {size_stemmed}")
print(f"Reduction rate after stemming: {stemmed_reduction_rate:.2f}%")

Vocabulary before removing stopwords: 15585
Vocabulary after removing stopwords: 15440
Reduction rate after removing stopwords: 0.93%
Vocabulary after stemming: 10676
Reduction rate after stemming: 30.85%
