In [None]:
import pandas as pd
import re
import ast
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download necessary NLTK resources
#nltk.download("stopwords")
#nltk.download('punkt_tab')

In [None]:
"""
This code block initializes the stopwords and stemmer used for text preprocessing in natural language processing (NLP) tasks.

1. `stop_words = set(stopwords.words("english"))`: 
   - This line loads the English stopwords from the NLTK library and converts them into a set. 
   - Stopwords are common words (such as 'and', 'the', 'is', etc.) that are often removed from text during preprocessing because they carry less meaningful information. 
   - By using a set, the lookup time for checking if a word is a stopword is optimized, as sets provide average O(1) time complexity for membership tests.

2. `stemmer = PorterStemmer()`: 
   - This line creates an instance of the `PorterStemmer` class from the NLTK library. 
   - The Porter stemmer is an algorithm that reduces words to their root form (or stem) by removing suffixes. 
   - This process helps in standardizing words for analysis, as different forms of a word (e.g., 'running', 'ran', 'runs') can be reduced to a common base form (e.g., 'run').

Overall, this code block sets up the necessary components for text preprocessing, specifically for removing common stopwords and stemming words to their root forms, which are essential steps in preparing text data for further analysis or modeling.
"""

stop_words = set(stopwords.words("english"))                        # load English stopwords from NLTK
stemmer = PorterStemmer()                                           # create a new Porter stemmer 

In [3]:
# Define text preprocessing functions
def clean_text(text):
    text = text.lower()                                             # convert to lowercase
    spaces = re.compile(r'\s+')
    text = spaces.sub(' ', text)                                    # substitute all white space characters (single or multiple occurences) with a single space

    emails = re.compile(r'\S+@\S+\.\S+')
    text = emails.sub('_EMAIL_', text)                              # substitute all found email addresses with _EMAIL_
    urls = re.compile(r'http[s]?:\/\/\S+|www\.\S+|\S+\.[a-z]+\/\S+|\w+\.(?:com|net|org)')
    text = urls.sub('_URL_', text)                                  # substitute all found URLs with _URL_
    dates = re.compile(r'''
                       \d{1,4}[-\/]\d{1,2}[-\/]\d{1,4}|
                       \d{1,2}\ (?:jan[a-z]*|feb[a-z]*|mar[a-z]*|apr[a-z]*|may|jun[e]?|jul[y]?|aug[a-z]*|sep[a-z]*|oct[a-z]*|nov[a-z]*|dec[a-z]*)\ \d{,4}|
                       (?:jan[a-z]*|feb[a-z]*|mar[a-z]*|apr[a-z]*|may|jun[e]?|jul[y]?|aug[a-z]*|sep[a-z]*|oct[a-z]*|nov[a-z]*|dec[a-z]*)[,.]?\ ?\d{1,4}(?:th|st|nd|rd)?(?:,\ \d{4})?
                       ''', re.VERBOSE)
    text = dates.sub('_DATE_', text)                                # substitute all found dates with _DATE_
    numbers = re.compile(r'\d+(?:th|st|nd|rd)?')
    text = numbers.sub('_NUM_', text)                               # substitute all remaining numbers with _NUM_
    return text

def remove_stopwords_and_stem(tokens):
    return [stemmer.stem(word) for word in tokens if word not in stop_words]

In [9]:
# File paths
input_file = "995,000_rows.csv"
processed_file = "processed_fake_news.csv"

In [None]:
'''This code block reads a CSV file in chunks, processes each chunk to clean and preprocess the text data, and then saves the processed data to a new CSV file. It utilizes the pandas library for data manipulation and assumes the presence of a text preprocessing pipeline.
'''
# Read and process in chunks
reader = pd.read_csv(input_file, usecols=['domain', 'type', 'url', 'content', 'title'], chunksize=10000)
first_chunk = True  # Track the first chunk for writing header

for chunk in reader:
    chunk.dropna(subset=['content', 'type'], inplace=True)                                    # drop rows with no content or type (/label)
    chunk.drop(chunk[chunk['type'] == 'unknown'].index, inplace=True)                         # drop rows where 'type' is 'unknown'
    chunk.drop(chunk[chunk['type'] == 'unreliable'].index, inplace=True)                      # drop rows where 'type' is 'unreliable'
    chunk.drop_duplicates(subset=['content'], inplace=True)                                   # drop rows with duplicates in the 'content' column

    # Apply text preprocessing pipeline
    chunk["content"] = chunk["content"].apply(clean_text)                               # cleaning the text in the content column
    chunk["content"] = chunk["content"].apply(word_tokenize)                             # tokenizing the text in the content column
    chunk["stemmed_tokens"] = chunk["content"].apply(remove_stopwords_and_stem)          # removing stopwords and stemming the tokens 

    # Write and save processed chunk to csv file
    chunk.to_csv(processed_file, mode="w" if first_chunk else "a", index=False, header=first_chunk)
    first_chunk = False # Ensure only the first chunk writes header

    print(f"Processed and saved a chunk of {len(chunk)} rows.")

print("Processing complete! Data saved to:", processed_file)

Processed and saved a chunk of 7903 rows.
Processed and saved a chunk of 7950 rows.
Processed and saved a chunk of 7888 rows.
Processed and saved a chunk of 7722 rows.
Processed and saved a chunk of 7840 rows.
Processed and saved a chunk of 7977 rows.
Processed and saved a chunk of 7891 rows.
Processed and saved a chunk of 7894 rows.
Processed and saved a chunk of 7948 rows.
Processed and saved a chunk of 7901 rows.
Processed and saved a chunk of 7685 rows.
Processed and saved a chunk of 7697 rows.
Processed and saved a chunk of 7776 rows.
Processed and saved a chunk of 7905 rows.
Processed and saved a chunk of 8017 rows.
Processed and saved a chunk of 7810 rows.
Processed and saved a chunk of 7801 rows.
Processed and saved a chunk of 7888 rows.
Processed and saved a chunk of 7867 rows.
Processed and saved a chunk of 7454 rows.
Processed and saved a chunk of 7467 rows.
Processed and saved a chunk of 7579 rows.
Processed and saved a chunk of 7622 rows.
Processed and saved a chunk of 739

In [2]:
# File paths
scraped_data = "CBS_bbc_scraped_articles.csv"
processed_data = "processed_scraped_articles.csv"

In [None]:
"""
This code block reads, processes, and saves scraped text data from a CSV file. It performs data cleaning, text preprocessing, and prepares the data for further analysis or modeling.

1. `scraped_df = pd.read_csv(scraped_data, usecols=['text'])`:
   - This line reads the scraped data from a CSV file specified by `scraped_data`, loading only the 'text' column into a pandas DataFrame called `scraped_df`.

2. `scraped_df.dropna(subset=['text'], inplace=True)`:
   - This line removes any rows from the DataFrame that have missing values (NaN) in the 'text' column, ensuring that only complete entries are processed.

3. `scraped_df.drop_duplicates(subset=['text'], inplace=True)`:
   - This line removes any duplicate rows based on the 'text' column, ensuring that each piece of text is unique.

4. `scraped_df['text'] = scraped_df['text'].apply(clean_text)`:
   - This line applies the `clean_text` function to the 'text' column, cleaning the text data by performing operations such as lowercasing, removing unwanted characters, and other preprocessing steps.

5. `scraped_df['text'] = scraped_df['text'].apply(word_tokenize)`:
   - This line tokenizes the cleaned text in the 'text' column, splitting it into individual words (tokens).

6. `scraped_df['stemmed_tokens'] = scraped_df['text'].apply(remove_stopwords_and_stem)`:
   - This line applies a function (assumed to be defined elsewhere) that removes stopwords and stems the tokens in the 'text' column, storing the results in a new column called 'stemmed_tokens'.

7. `scraped_df['type'] = 0`:
   - This line adds a new column called 'type' to the DataFrame, assigning a value of 0 (indicating 'reliable') to all rows. This could be used for classification purposes later.

8. `scraped_df['stemmed_tokens'] = scraped_df['stemmed_tokens'].apply(lambda x: ' '.join(x))`:
   - This line converts the lists of stemmed tokens in the 'stemmed_tokens' column into single strings, joining the tokens with spaces. This format is often required for text analysis tools like CountVectorizer.
Overall, this code block efficiently processes scraped text data by cleaning, tokenizing, and stemming the text, while also preparing it for further analysis or modeling by saving it in a structured format.
"""

scraped_df = pd.read_csv(scraped_data, usecols=['text'])

scraped_df.dropna(subset=['text'], inplace=True)                                    # drop rows with no text
scraped_df.drop_duplicates(subset=['text'], inplace=True)                           # drop rows with duplicates in the 'text' column

# Apply text preprocessing pipeline
scraped_df['text'] = scraped_df['text'].apply(clean_text)                                   # cleaning the text in the text column
scraped_df['text'] = scraped_df['text'].apply(word_tokenize)                                # tokenizing the text in the text column
scraped_df['stemmed_tokens'] = scraped_df['text'].apply(remove_stopwords_and_stem)          # removing stopwords and stemming the tokens 

# Add 'type' column with value 0 (reliable) for all rows
scraped_df['type'] = 0

# Convert data in 'stemmed_tokens' column from list of strings
# to a single string pr. article (as this is what e.g. CountVectorizer expects as input)
print(type(scraped_df['stemmed_tokens'][0]))
print(scraped_df['stemmed_tokens'].head(2))
scraped_df['stemmed_tokens'] = scraped_df['stemmed_tokens'].apply(lambda x: ' '.join(x))
print(type(scraped_df['stemmed_tokens'][0]))
print(scraped_df['stemmed_tokens'].head(2))

# Write and save processed data to csv file
scraped_df.to_csv(processed_data, columns=['stemmed_tokens', 'type'], mode="w", index=False, header=True)

print("Processing complete! Data saved to:", processed_data)


<class 'list'>
0    [almost, _num_, ,, _num_, motorist, republ, ir...
1    [freight, train, carri, highli, toxic, chemic,...
Name: stemmed_tokens, dtype: object
<class 'str'>
0    almost _num_ , _num_ motorist republ ireland s...
1    freight train carri highli toxic chemic benzen...
Name: stemmed_tokens, dtype: object
Processing complete! Data saved to: processed_scraped_articles.csv


# Evaluating baseline model on the LIAR Dataset

In [4]:
# File paths
liar_test = "test.tsv"
liar_processed = "liar_test.csv"

In [None]:
"""
This code block reads, processes, and saves test data from the LIAR dataset, which contains statements labeled as true or false. It performs text preprocessing, filters relevant labels, and prepares the data for further analysis or modeling.

1. `liar_df = pd.read_csv(liar_test, sep = '\t', usecols=[1,2], names=['label', 'statement'])`:
   - This line reads the LIAR dataset from a tab-separated values (TSV) file specified by `liar_test`. 
   - It loads only the second and third columns (index 1 and 2) and assigns them the names 'label' and 'statement', respectively.
2. `liar_df['statement'] = liar_df['statement'].apply(clean_text)`:
   - This line applies the `clean_text` function to the 'statement' column, cleaning the text data by performing operations such as lowercasing, removing unwanted characters, and other preprocessing steps.
3. `liar_df['statement'] = liar_df['statement'].apply(word_tokenize)`:
   - This line tokenizes the cleaned statements in the 'statement' column, splitting each statement into individual words (tokens).
4. `liar_df['stemmed_tokens'] = liar_df['statement'].apply(remove_stopwords_and_stem)`:
   - This line applies a function (assumed to be defined elsewhere) that removes stopwords and stems the tokens in the 'statement' column, storing the results in a new column called 'stemmed_tokens'.
5. `liar_df['stemmed_tokens'] = liar_df['stemmed_tokens'].apply(lambda x: ' '.join(x))`:
   - This line converts the lists of stemmed tokens in the 'stemmed_tokens' column into single strings, joining the tokens with spaces. This format is often required for text analysis tools like CountVectorizer.
6. The following print statements are used for debugging and verification:
   - `print(type(liar_df['stemmed_tokens'][0]))`: Displays the type of the first entry in the 'stemmed_tokens' column before conversion.
   - `print(liar_df['stemmed_tokens'].head(2))`: Displays the first two entries in the 'stemmed_tokens' column before conversion.
   - `print(type(liar_df['stemmed_tokens'][0]))`: Displays the type of the first entry in the 'stemmed_tokens' column after conversion.
   - `print(liar_df['stemmed_tokens'].head(2))`: Displays the first two entries in the 'stemmed_tokens' column after conversion.
7. `print(liar_df['label'].value_counts())`:
   - This line prints an overview of the counts of each unique label in the 'label' column, providing insight into the distribution of true and false statements.
8. `fake_lables = ['pants-fire', 'false']` and `reliable_lables = ['true', 'mostly-true']`:
   - These lines define lists of labels that categorize statements as 'fake' or 'reliable'.
9. `liar_df = liar_df[liar_df['label'].isin(fake_lables + reliable_lables)]`:
   - This line filters the DataFrame to keep only the rows with labels that are either in the `fake_lables` or `reliable_lables` lists, ensuring that only relevant statements are retained.
10. `liar_df['label'] = liar_df['label'].apply(lambda x: 1 if x in fake_lables else 0)`:
    - This line converts the 'label' column to numerical values, assigning a value of 1 for 'fake' labels and 0 for 'reliable' labels.
11. `print(liar_df['label'].value_counts())`:
    - This line prints the count of statements grouped as 'fake' or 'reliable', providing a summary of the processed data.
Overall, this code block efficiently processes test data from the LIAR dataset by cleaning, tokenizing, and stemming the statements, filtering relevant labels, and saving the processed data in a structured format for further analysis or modeling.
"""


liar_df = pd.read_csv(liar_test, sep = '\t', usecols=[1,2], names=['label', 'statement'])

# Apply text preprocessing pipeline
liar_df['statement'] = liar_df['statement'].apply(clean_text)                                   # cleaning the statements in the statement column
liar_df['statement'] = liar_df['statement'].apply(word_tokenize)                                # tokenizing the statements in the statement column
liar_df['stemmed_tokens'] = liar_df['statement'].apply(remove_stopwords_and_stem)               # removing stopwords and stemming the tokens 

# Convert data in 'stemmed_tokens' column from list of strings
# to a single string pr. article (as this is what e.g. CountVectorizer expects as input)
print(type(liar_df['stemmed_tokens'][0]))
print(liar_df['stemmed_tokens'].head(2))
liar_df['stemmed_tokens'] = liar_df['stemmed_tokens'].apply(lambda x: ' '.join(x))
print(type(liar_df['stemmed_tokens'][0]))
print(liar_df['stemmed_tokens'].head(2))


# Print overview of occuring labels 
print(liar_df['label'].value_counts())

fake_lables = ['pants-fire', 'false']
reliable_lables = ['true', 'mostly-true']

liar_df = liar_df[liar_df['label'].isin(fake_lables + reliable_lables)]                     # Keep only relevant labels
liar_df['label'] = liar_df['label'].apply(lambda x: 1 if x in fake_lables else 0)             # Convert the 'label' column to numerical values

print(liar_df['label'].value_counts())                                                      # Print the count of statements grouped as 'fake' or 'reliable'


# Write and save processed data to csv file
liar_df.to_csv(liar_processed, columns=['stemmed_tokens', 'label'], mode="w", index=False, header=True)

print("Processing complete! Data saved to:", liar_processed)

<class 'list'>
0    [build, wall, u.s.-mexico, border, take, liter...
1    [wisconsin, pace, doubl, number, layoff, year, .]
Name: stemmed_tokens, dtype: object
<class 'str'>
0    build wall u.s.-mexico border take liter year .
1          wisconsin pace doubl number layoff year .
Name: stemmed_tokens, dtype: object
label
half-true      265
false          249
mostly-true    241
barely-true    212
true           208
pants-fire      92
Name: count, dtype: int64
label
0    449
1    341
Name: count, dtype: int64
Processing complete! Data saved to: liar_test.csv
