In [None]:
"""
This code block imports several libraries and modules that are commonly used for data manipulation, text processing, and visualization in Python.

1. `pandas`: A powerful data manipulation and analysis library that provides data structures like DataFrames for handling structured data.
2. `re`: A module for working with regular expressions, which are used for string searching and manipulation.
3. `nltk`: The Natural Language Toolkit, a library for working with human language data (text). It provides tools for text processing, including tokenization, stopword removal, and stemming.
   - `word_tokenize`: A function from NLTK that splits text into individual words (tokens).
   - `stopwords`: A corpus from NLTK that contains common words (like 'and', 'the', etc.) that are often filtered out in text processing.
   - `PorterStemmer`: A stemming algorithm that reduces words to their base or root form.
4. `matplotlib.pyplot`: A plotting library used for creating static, animated, and interactive visualizations in Python.

The commented-out lines at the end show the NLTK resources this notebook needs: the `stopwords` corpus and the `punkt_tab` tokenizer data used by `word_tokenize`. They are inactive by default; uncomment and run them once if these resources are not already installed.
"""
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt

#nltk.download('stopwords')
#nltk.download('punkt_tab')

In [None]:
"""
This code block defines functions and initializes variables for cleaning and processing text data, particularly for natural language processing (NLP) tasks.

1. `stop_words`: A set containing English stopwords loaded from the NLTK library. Stopwords are common words that are often removed from text during processing to focus on more meaningful words.

2. `stemmer`: An instance of the PorterStemmer class from NLTK, which is used to reduce words to their root form (stemming).

3. `clean_text(text)`: A function that takes a string of text as input and performs several cleaning operations:
   - Converts the text to lowercase to ensure uniformity.
   - Uses a regular expression to replace multiple whitespace characters with a single space.
   - Replaces email addresses in the text with the placeholder '_EMAIL_'.
   - Replaces URLs in the text with the placeholder '_URL_'.
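   - Replaces dates in several common formats (numeric such as 2020-01-31 or 31/01/2020, day-month-year such as "5 january 2020", and month-day-year such as "january 5th, 2020") with the placeholder '_DATE_'.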
   - Replaces remaining numbers with the placeholder '_NUM_'.
   - Returns the cleaned text.

4. `remove_stopwords(tokens)`: A function that takes a list of tokens (words) as input and returns a new list containing only those words that are not in the `stop_words` set. This is useful for filtering out less meaningful words from the text.

5. `stem_words(tokens)`: A function that takes a list of tokens as input and returns a new list where each word has been stemmed using the Porter stemmer. This reduces words to their base form, which can help in standardizing the text for analysis.

Overall, this code is designed to preprocess text data by cleaning it, removing common stopwords, and stemming the remaining words, making it suitable for further analysis or modeling in NLP tasks.
"""
# Shared resources used by remove_stopwords() and stem_words() below.
stemmer = PorterStemmer()                                           # Porter stemmer instance
stop_words = set(stopwords.words("english"))                        # NLTK's English stopword list, held as a set for membership tests

# The patterns are compiled once at import time instead of on every clean_text() call.
_WHITESPACE_RE = re.compile(r'\s+')
_EMAIL_RE = re.compile(r'\S+@\S+\.\S+')
_URL_RE = re.compile(r'http[s]?:\/\/\S+|www\.\S+|\S+\.[a-z]+\/\S+|\w+\.(?:com|net|org)')

# Real month names and their usual abbreviations only.  A loose prefix match such as
# "mar[a-z]*" would also accept ordinary words ("market", "decided", "augment", ...).
_MONTH = (
    r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|'
    r'aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
)
_DATE_RE = re.compile('|'.join([
    # numeric dates: 2020-01-31, 31/01/2020, 1/2/20
    r'\d{1,4}[-/]\d{1,2}[-/]\d{1,4}',
    # day month [year]: "5 january", "5 jan 2020"; the year (and its space) is optional as a
    # unit so that the space after the month is not swallowed when no year follows
    r'\d{1,2} ' + _MONTH + r'\b(?: \d{2,4}\b)?',
    # month day/year: "january 5", "jan. 5th, 2020", "may 2020"; the leading \b keeps the
    # month from matching inside a longer word ("summary 5", "dismay 3")
    r'\b' + _MONTH + r'[,.]? ?\d{1,4}(?:th|st|nd|rd)?(?:, \d{4})?',
]))
_NUMBER_RE = re.compile(r'\d+(?:th|st|nd|rd)?')


def clean_text(text):
    """Normalise raw text and replace volatile tokens with placeholders.

    Steps, applied in this order:
      1. lowercase the text;
      2. collapse every run of whitespace (including newlines) into a single space;
      3. replace email addresses with ``_EMAIL_``;
      4. replace URLs with ``_URL_``;
      5. replace dates with ``_DATE_``;
      6. replace all remaining digit runs (with an optional ordinal suffix) with ``_NUM_``.

    Emails and URLs are handled before dates and numbers so that digits inside them
    are not rewritten first.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.

    Note:
        "may" is ambiguous in English, so a phrase such as "12 may be" is still
        treated as a date.
    """
    text = _WHITESPACE_RE.sub(' ', text.lower())
    text = _EMAIL_RE.sub('_EMAIL_', text)
    text = _URL_RE.sub('_URL_', text)
    text = _DATE_RE.sub('_DATE_', text)
    return _NUMBER_RE.sub('_NUM_', text)

def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set, in their original order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def stem_words(tokens):
    """Return a new list with every token passed through the module-level ``stemmer``."""
    return list(map(stemmer.stem, tokens))

In [None]:
"""
This code block performs data loading and preprocessing on a CSV file containing news articles. It utilizes the pandas library for data manipulation.

1. `sample = pd.read_csv("news_sample.csv", usecols=['domain', 'type', 'url', 'content', 'title'])`: 
   - This line reads a CSV file named "news_sample.csv" into a pandas DataFrame called `sample`. 
   - The `usecols` parameter specifies that only the columns 'domain', 'type', 'url', 'content', and 'title' should be loaded from the CSV file.

2. `print(sample.info())`: 
   - This line prints a summary of the DataFrame, including the number of entries, column names, non-null counts, and data types. This helps in understanding the structure and completeness of the data.

3. `sample = sample.dropna(subset=['content', 'type'])`: 
   - This line removes any rows from the DataFrame that have missing values (NaN) in the 'content' or 'type' columns. This is important for ensuring that the dataset is complete for analysis.

4. `sample = sample.drop(sample[sample['type'] == 'unknown'].index)`: 
   - This line drops any rows where the 'type' column has the value 'unknown'. This helps in cleaning the dataset by removing entries that do not have a clear classification.

5. `sample = sample.drop(sample[sample['type'] == 'unreliable'].index)`: 
   - Similar to the previous line, this line removes rows where the 'type' is 'unreliable', further refining the dataset to include only reliable entries.

6. `sample = sample.drop_duplicates(subset=['content'])`: 
   - This line removes any duplicate rows based on the 'content' column. This ensures that each piece of content in the dataset is unique, which is important for analysis and modeling.

7. `print(sample.info())`: 
   - Finally, this line prints the updated summary of the DataFrame after the preprocessing steps, allowing the user to verify the changes made to the dataset.

Overall, this code block is focused on loading a sample of news data, cleaning it by removing rows with missing or unreliable information, and ensuring that the content is unique for further analysis.
"""

sample = pd.read_csv("news_sample.csv", usecols=['domain', 'type', 'url', 'content', 'title']) 
print(sample.info()) 

sample = sample.dropna(subset=['content', 'type'])                                    # drop rows with no content or type (/label)
sample = sample.drop(sample[sample['type'] == 'unknown'].index)                       # drop rows where 'type' is 'unknown'
sample = sample.drop(sample[sample['type'] == 'unreliable'].index)                    # drop rows where 'type' is 'unreliable'
sample = sample.drop_duplicates(subset=['content'])                                   # drop rows with duplicates in the 'content' column

print(sample.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   domain   250 non-null    object
 1   type     238 non-null    object
 2   url      250 non-null    object
 3   content  250 non-null    object
 4   title    250 non-null    object
dtypes: object(5)
memory usage: 9.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 215 entries, 1 to 246
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   domain   215 non-null    object
 1   type     215 non-null    object
 2   url      215 non-null    object
 3   content  215 non-null    object
 4   title    215 non-null    object
dtypes: object(5)
memory usage: 10.1+ KB
None


In [None]:
"""
This code block performs a series of text preprocessing steps on the 'content' column of the DataFrame `sample`. It utilizes previously defined functions to clean, tokenize, remove stopwords, and stem the text data.

1. `sample["cleaned_content"] = sample["content"].apply(clean_text)`:
   - This line applies the `clean_text` function to each entry in the 'content' column of the DataFrame. 
   - The cleaned text is stored in a new column called 'cleaned_content', which contains the text after performing operations such as converting to lowercase, removing emails, URLs, dates, and numbers.

2. `sample["tokens"] = sample["cleaned_content"].apply(word_tokenize)`:
   - This line tokenizes the cleaned text in the 'cleaned_content' column using the `word_tokenize` function from NLTK. 
   - The resulting list of tokens (individual words) is stored in a new column called 'tokens'.

3. `sample["tokens_no_stopwords"] = sample["tokens"].apply(remove_stopwords)`:
   - This line applies the `remove_stopwords` function to the 'tokens' column, filtering out common stopwords from the list of tokens. 
   - The resulting list of tokens without stopwords is stored in a new column called 'tokens_no_stopwords'.

4. `sample["stemmed_tokens"] = sample["tokens_no_stopwords"].apply(stem_words)`:
   - This line applies the `stem_words` function to the 'tokens_no_stopwords' column, stemming each token to its root form. 
   - The stemmed tokens are stored in a new column called 'stemmed_tokens'.

5. `print(sample[["content", "cleaned_content", "tokens", "tokens_no_stopwords", "stemmed_tokens"]].head())`:
   - This line prints a preview of the first few rows of the DataFrame, displaying the original 'content' along with the intermediate steps of the preprocessing pipeline: 'cleaned_content', 'tokens', 'tokens_no_stopwords', and 'stemmed_tokens'.
   - This allows the user to visually inspect the transformations applied to the text data at each stage of the preprocessing process.

Overall, this code block systematically processes the text data in the 'content' column, transforming it through cleaning, tokenization, stopword removal, and stemming, while providing a preview of the results at each step.
"""
sample["cleaned_content"] = sample["content"].apply(clean_text)                     # cleaning the text in the content column
sample["tokens"] = sample["cleaned_content"].apply(word_tokenize)                   # tokenizing the text in the content column
sample["tokens_no_stopwords"] = sample["tokens"].apply(remove_stopwords)            # removing stopwords from the tokens 
sample["stemmed_tokens"] = sample["tokens_no_stopwords"].apply(stem_words)          # stemming the tokens 
 
print(sample[["content", "cleaned_content", "tokens", "tokens_no_stopwords", "stemmed_tokens"]].head())

                                             content  \
1  AWAKENING OF 12 STRANDS of DNA – “Reconnecting...   
4  Donald Trump has the unnerving ability to abil...   
6  Could you imagine waking up in the morgue? I f...   
7  Citizen Journalist\n\nby N.Morgan Q has releas...   
8  Usa Dollar Tanks On Mnuchin Statement That He ...   

                                     cleaned_content  \
1  awakening of _NUM_ strands of dna – “reconnect...   
4  donald trump has the unnerving ability to abil...   
6  could you imagine waking up in the morgue? i f...   
7  citizen journalist by n.morgan q has released ...   
8  usa dollar tanks on mnuchin statement that he ...   

                                              tokens  \
1  [awakening, of, _NUM_, strands, of, dna, –, “,...   
4  [donald, trump, has, the, unnerving, ability, ...   
6  [could, you, imagine, waking, up, in, the, mor...   
7  [citizen, journalist, by, n.morgan, q, has, re...   
8  [usa, dollar, tanks, on, mnuchin, statement

In [None]:
"""
This code block calculates and reports various statistics related to the vocabulary of the text data in the DataFrame `sample`. It focuses on the effects of stopword removal and stemming on the vocabulary size.

1. `size_tokenized = len(set(word for doc in sample["tokens"] for word in doc))`:
   - This line computes the size of the vocabulary (unique words) from the 'tokens' column, which contains the tokenized text. 
   - It uses a set comprehension to collect unique words across all documents and calculates the total number of unique tokens, storing the result in `size_tokenized`.

2. `size_wo_stopwords = len(set(word for doc in sample["tokens_no_stopwords"] for word in doc))`:
   - This line calculates the size of the vocabulary after removing stopwords by applying the same logic to the 'tokens_no_stopwords' column. 
   - The result, representing the number of unique words without stopwords, is stored in `size_wo_stopwords`.

3. `size_stemmed = len(set(word for doc in sample["stemmed_tokens"] for word in doc))`:
   - This line computes the size of the vocabulary after stemming by applying the same logic to the 'stemmed_tokens' column. 
   - The number of unique stemmed words is stored in `size_stemmed`.

4. `stopword_reduction_rate = (size_tokenized - size_wo_stopwords) / size_tokenized * 100`:
   - This line calculates the reduction rate in vocabulary size due to stopword removal. 
   - It computes the percentage decrease in unique words by comparing the vocabulary size before and after stopword removal, storing the result in `stopword_reduction_rate`.

5. `stemmed_reduction_rate = (size_wo_stopwords - size_stemmed) / size_wo_stopwords * 100`:
   - This line calculates the reduction rate in vocabulary size due to stemming. 
   - It computes the percentage decrease in unique words by comparing the vocabulary size before and after stemming, storing the result in `stemmed_reduction_rate`.

6. The following print statements output the calculated statistics:
   - `print(f"Vocabulary before removing stopwords: {size_tokenized}")`: Displays the number of unique tokens before stopword removal.
   - `print(f"Vocabulary after removing stopwords: {size_wo_stopwords}")`: Displays the number of unique tokens after stopword removal.
   - `print(f"Reduction rate after removing stopwords: {stopword_reduction_rate:.2f}%")`: Displays the percentage reduction in vocabulary size due to stopword removal.
   - `print(f"Vocabulary after stemming: {size_stemmed}")`: Displays the number of unique stemmed tokens.
   - `print(f"Reduction rate after stemming: {stemmed_reduction_rate:.2f}%")`: Displays the percentage reduction in vocabulary size due to stemming.

Overall, this code block provides insights into the impact of text preprocessing techniques (stopword removal and stemming) on the vocabulary size of the dataset, helping to understand how these techniques affect the richness of the text data.
"""
size_tokenized = len(set(word for doc in sample["tokens"] for word in doc))
size_wo_stopwords = len(set(word for doc in sample["tokens_no_stopwords"] for word in doc))
size_stemmed = len(set(word for doc in sample["stemmed_tokens"] for word in doc))

stopword_reduction_rate = (size_tokenized - size_wo_stopwords) / size_tokenized * 100
stemmed_reduction_rate = (size_wo_stopwords - size_stemmed) / size_wo_stopwords * 100

print(f"Vocabulary before removing stopwords: {size_tokenized}")
print(f"Vocabulary after removing stopwords: {size_wo_stopwords}")
print(f"Reduction rate after removing stopwords: {stopword_reduction_rate:.2f}%")

print(f"Vocabulary after stemming: {size_stemmed}")
print(f"Reduction rate after stemming: {stemmed_reduction_rate:.2f}%")

Vocabulary before removing stopwords: 15585
Vocabulary after removing stopwords: 15440
Reduction rate after removing stopwords: 0.93%
Vocabulary after stemming: 10676
Reduction rate after stemming: 30.85%
