In [13]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('popular')
nltk.download('stopwords')
nltk.download('punkt')  # This is the correct resource
nltk.download('punkt_tab')



[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to C:\Users\Jora
[nltk_data]    |     Ismaili\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to C:\Users\Jora
[nltk_data]    |     Ismaili\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to C:\Users\Jora
[nltk_data]    |     Ismaili\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to C:\Users\Jora
[nltk_data]    |     Ismaili\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to C:\Users\Jora
[nltk_data]    |     Ismaili\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading pack

True

In [14]:
sample = pd.read_csv("news_sample.csv")  

def clean_text(text):
    text = text.lower()                                             # convert to lowercase
    spaces = re.compile(r'\s+')
    text = spaces.sub(' ', text)                                    # substitute all white space characters (single or multiple occurences) with a single space

    emails = re.compile(r'\S+@\S+\.\S+')
    text = emails.sub('EMAIL', text)                              # substitute all found email addresses with EMAIL
    urls = re.compile(r'http[s]?:\/\/\S+|www\.\S+|\S+\.[a-z]+\/\S+|\w+\.(?:com|net|org)')
    text = urls.sub('URL', text)                                  # substitute all found URLs with URL
    dates = re.compile(r'''
                       \d{1,4}[-\/]\d{1,2}[-\/]\d{1,4}|
                       \d{1,2}\ (?:jan[a-z]*|feb[a-z]*|mar[a-z]*|apr[a-z]*|may|jun[e]?|jul[y]?|aug[a-z]*|sep[a-z]*|oct[a-z]*|nov[a-z]*|dec[a-z]*)\ \d{,4}|
                       (?:jan[a-z]*|feb[a-z]*|mar[a-z]*|apr[a-z]*|may|jun[e]?|jul[y]?|aug[a-z]*|sep[a-z]*|oct[a-z]*|nov[a-z]*|dec[a-z]*)[,.]?\ ?\d{1,4}(?:th|st|nd|rd)?(?:,\ \d{4})?
                       ''', re.VERBOSE)
    text = dates.sub('DATE', text)                                # substitute all found dates with DATE
    numbers = re.compile(r'\d+(?:th|st|nd|rd)?')
    text = numbers.sub('NUM', text)                               # substitute all remaining numbers with NUM
    return text

sample["manual_cleaned_content"] = sample["content"].apply(clean_text)
print(sample[["content", "manual_cleaned_content"]].head())         # printing a preview of the raw and processed text

                                             content  \
0  Sometimes the power of Christmas will make you...   
1  AWAKENING OF 12 STRANDS of DNA – “Reconnecting...   
2  Never Hike Alone: A Friday the 13th Fan Film U...   
3  When a rare shark was caught, scientists were ...   
4  Donald Trump has the unnerving ability to abil...   

                              manual_cleaned_content  
0  sometimes the power of christmas will make you...  
1  awakening of NUM strands of dna – “reconnectin...  
2  never hike alone: a friday the NUM fan film us...  
3  when a rare shark was caught, scientists were ...  
4  donald trump has the unnerving ability to abil...  


In [15]:

def tokenize(text):
    tokens = word_tokenize(text)
    return [word for word in tokens if word.isalnum() or word.startswith("_")]  # Keep only words and numbers

# Apply the tokenization function to your DataFrame (assuming `sample` is your DataFrame)
sample["tokens"] = sample["manual_cleaned_content"].apply(tokenize)

print(sample[["manual_cleaned_content", "tokens"]].head())



                              manual_cleaned_content  \
0  sometimes the power of christmas will make you...   
1  awakening of NUM strands of dna – “reconnectin...   
2  never hike alone: a friday the NUM fan film us...   
3  when a rare shark was caught, scientists were ...   
4  donald trump has the unnerving ability to abil...   

                                              tokens  
0  [sometimes, the, power, of, christmas, will, m...  
1  [awakening, of, NUM, strands, of, dna, reconne...  
2  [never, hike, alone, a, friday, the, NUM, fan,...  
3  [when, a, rare, shark, was, caught, scientists...  
4  [donald, trump, has, the, unnerving, ability, ...  


In [16]:
stop_words = set(stopwords.words("english"))  # load English stopwords from NLTK

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

sample["tokens_no_stopwords"] = sample["tokens"].apply(remove_stopwords)
print(sample[["tokens", "tokens_no_stopwords"]].head())

                                              tokens  \
0  [sometimes, the, power, of, christmas, will, m...   
1  [awakening, of, NUM, strands, of, dna, reconne...   
2  [never, hike, alone, a, friday, the, NUM, fan,...   
3  [when, a, rare, shark, was, caught, scientists...   
4  [donald, trump, has, the, unnerving, ability, ...   

                                 tokens_no_stopwords  
0  [sometimes, power, christmas, make, wild, wond...  
1  [awakening, NUM, strands, dna, reconnecting, m...  
2  [never, hike, alone, friday, NUM, fan, film, u...  
3  [rare, shark, caught, scientists, left, blunde...  
4  [donald, trump, unnerving, ability, ability, c...  


In [17]:
stemmer = PorterStemmer()

def stem_words(tokens):
    return [stemmer.stem(word) for word in tokens]

sample["stemmed_tokens"] = sample["tokens_no_stopwords"].apply(stem_words)
print(sample[["tokens_no_stopwords", "stemmed_tokens"]].head())

                                 tokens_no_stopwords  \
0  [sometimes, power, christmas, make, wild, wond...   
1  [awakening, NUM, strands, dna, reconnecting, m...   
2  [never, hike, alone, friday, NUM, fan, film, u...   
3  [rare, shark, caught, scientists, left, blunde...   
4  [donald, trump, unnerving, ability, ability, c...   

                                      stemmed_tokens  
0  [sometim, power, christma, make, wild, wonder,...  
1  [awaken, num, strand, dna, reconnect, movi, re...  
2  [never, hike, alon, friday, num, fan, film, us...  
3  [rare, shark, caught, scientist, left, blunder...  
4  [donald, trump, unnerv, abil, abil, creat, rea...  


In [18]:
size_tokenized = len(set(word for doc in sample["tokens"] for word in doc))
size_wo_stopwords = len(set(word for doc in sample["tokens_no_stopwords"] for word in doc))
size_stemmed = len(set(word for doc in sample["stemmed_tokens"] for word in doc))

stopword_reduction_rate = (size_tokenized - size_wo_stopwords) / size_tokenized * 100
stemmed_reduction_rate = (size_wo_stopwords - size_stemmed) / size_wo_stopwords * 100

print(f"Vocabulary before removing stopwords: {size_tokenized}")
print(f"Vocabulary after removing stopwords: {size_wo_stopwords}")
print(f"Reduction rate after removing stopwords: {stopword_reduction_rate:.2f}%")

print(f"Vocabulary after stemming: {size_stemmed}")
print(f"Reduction rate after stemming: {stemmed_reduction_rate:.2f}%")

Vocabulary before removing stopwords: 14993
Vocabulary after removing stopwords: 14847
Reduction rate after removing stopwords: 0.97%
Vocabulary after stemming: 9704
Reduction rate after stemming: 34.64%


In [19]:
from sklearn.model_selection import train_test_split

# Define X (features) and y (labels)
X = sample["stemmed_tokens"]            # Feature data
y = sample["type"]                      # Target data

# Step 1: Split into 80% Train and 20% Temp (Val + Test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.20, random_state=104, shuffle=True)

# Step 2: Split 20% Temp into 10% Val and 10% Test
X_val, X_test_final, y_val, y_test_final = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print Shapes
print('X_train shape:', X_train.shape)
print('X_val shape:', X_val.shape)
print('X_test shape:', X_test_final.shape)

print('')
print('y_train shape:', y_train.shape)
print('y_val shape:', y_val.shape)
print('y_test shape:', y_test_final.shape)

# Print Samples
print("\nValidation Set Sample:\n", X_val.head())
print("\nTest Set Sample:\n", X_test_final.head())

X_train shape: (200,)
X_val shape: (25,)
X_test shape: (25,)

y_train shape: (200,)
y_val shape: (25,)
y_test shape: (25,)

Validation Set Sample:
 135    [paean, wife, headlin, bitcoin, blockchain, se...
205    [live, world, realiti, time, travel, appear, o...
90     [cedar, bayou, fm, num, clear, creek, dayton, ...
95     [democrat, hous, select, committe, benghazi, s...
84     [russia, respond, us, provoc, open, sky, treat...
Name: stemmed_tokens, dtype: object

Test Set Sample:
 174    [benzalkonium, chlorid, industri, num, global,...
41     [excerpt, carl, sagan, cosmo, specif, episod, ...
108    [give, soro, group, million, destabil, macedon...
115    [donald, trump, part, new, world, order, quest...
47     [drain, swamp, matter, long, groupthink, persi...
Name: stemmed_tokens, dtype: object
