# Part 1: Data Processing

## Task 1: Preprocessing 'news_sample.csv'

In [None]:
import pandas as pd
import lib.process_a as process_a
import lib.process_c as process_c

# Pull the FakeNewsCorpus sample CSV directly from its GitHub source.
sample_url = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'
df_sample = pd.read_csv(sample_url)

# Run the preprocessing pipeline (cleanup -> stopword removal -> stemming).
# Variant C is the active one; to try variant A instead, swap in:
#   process_a.preprocess(df_sample)
# NOTE(review): the return value is not used, so preprocess presumably
# modifies df_sample in place -- confirm in lib/process_c.
process_c.preprocess(df_sample)

# Persist a CSV copy of the preprocessed dataframe.
cleaned_sample_path = "data/news_sample_cleaned.csv"
df_sample.to_csv(cleaned_sample_path)

# pd.reset_option('display.max_rows')
# pd.set_option('display.max_colwidth', 150)


### Vocabulary sizes and reduction rates

The vocabulary sizes and reduction rates are computed inside `process_c.preprocess`, and the results are stored directly in the input DataFrame.

## Task 2:

### Make at least three non-trivial observations/discoveries about the data

In [None]:
import pandas as pd

# Read the 995,000-row dataset; for a quicker run, point the path at
# 'data/news_sample.csv' instead.
corpus_path = 'data/995,000_rows.csv'
df_new = pd.read_csv(corpus_path)


Observations of domains

In [None]:
import lib.process_a as process_a
# Optional structural overview, disabled by default:
#   print("Basic Statistics of the Dataset:")
#   print(df_new.info())

# Collect the distinct domains that appear under each of the two labels.
article_type = df_new['type']
reliable_domains = set(df_new.loc[article_type == 'reliable', 'domain'].unique())
fake_domains = set(df_new.loc[article_type == 'fake', 'domain'].unique())

# Set algebra: domains exclusive to one label, and domains shared by both.
reliable_not_fake_domains = reliable_domains.difference(fake_domains)
fake_not_reliable_domains = fake_domains.difference(reliable_domains)
common_domains = fake_domains & reliable_domains

# Print each group under its heading.
for heading, domain_group in (
    ("Domains in 'reliable' but not in 'fake':", reliable_not_fake_domains),
    ("\nDomains in 'fake' but not in 'reliable':", fake_not_reliable_domains),
    ("\nDomains in both 'fake' and 'reliable':", common_domains),
):
    print(heading)
    print(domain_group)

Clean text

In [None]:
import lib.process_methods as pm
import swifter

# Discard rows whose 'type' or 'content' is NaN (modifies df_new in place).
required_columns = ['type', 'content']
df_new.dropna(subset=required_columns, inplace=True)

# Derive 'content_clean' by running pm.clean_text over every article body
# through swifter's apply.
raw_content = df_new['content']
df_new['content_clean'] = raw_content.swifter.apply(pm.clean_text)

Function to count URLs via the `_url_` placeholder tag

In [None]:
import re

# Count occurrences of the `_url_` placeholder token in a text
def count_urls_with_tag(text):
    """Return the number of non-overlapping '_url_' matches in *text*."""
    return sum(1 for _match in re.finditer('_url_', text))

# Attach a per-article count of `_url_` placeholders found in the cleaned text.
df_new['url_count'] = df_new['content_clean'].apply(count_urls_with_tag)

# Split the articles by label. To fold further labels into the 'fake' group,
# OR extra comparisons into the mask, e.g.
#   df_new[(df_new['type'] == 'fake') | (df_new['type'] == '')]
fake_articles = df_new[df_new['type'] == 'fake']
reliable_articles = df_new[df_new['type'] == 'reliable']
fake_url_counts = fake_articles['url_count']
reliable_url_counts = reliable_articles['url_count']

# Totals: whole dataset first, then each label.
total_urls = df_new['url_count'].sum()
total_fake_urls = fake_url_counts.sum()
total_reliable_urls = reliable_url_counts.sum()

# Per-article minimum / maximum / mean within each label.
min_fake_urls = fake_url_counts.min()
max_fake_urls = fake_url_counts.max()
mean_fake_urls = fake_url_counts.mean()
min_reliable_urls = reliable_url_counts.min()
max_reliable_urls = reliable_url_counts.max()
mean_reliable_urls = reliable_url_counts.mean()

# Report: totals, then the 'fake' statistics, then the 'reliable' statistics.
print("Total URLs in Content:", total_urls)
print("Total URLs in 'fake' content:", total_fake_urls)
print("Total URLs in 'reliable' content:", total_reliable_urls)
print("Minimum number of URLs in 'fake' content:", min_fake_urls)
print("Maximum number of URLs in 'fake' content:", max_fake_urls)
print("Mean number of URLs in 'fake' content:", mean_fake_urls)
print("Minimum number of URLs in 'reliable' content:", min_reliable_urls)
print("Maximum number of URLs in 'reliable' content:", max_reliable_urls)
print("Mean number of URLs in 'reliable' content:", mean_reliable_urls)


Function to count dates via the `_date_` placeholder tag

In [None]:
# Count occurrences of the `_date_` placeholder token in a text
def count_date_with_tag(text):
    """Return the number of non-overlapping '_date_' matches in *text*."""
    return sum(1 for _match in re.finditer('_date_', text))

# Attach a per-article count of `_date_` placeholders found in the cleaned text.
df_new['date_count'] = df_new['content_clean'].apply(count_date_with_tag)

# Split the articles by label. To fold further labels into the 'fake' group,
# OR extra comparisons into the mask, e.g.
#   df_new[(df_new['type'] == 'fake') | (df_new['type'] == '')]
fake_articles = df_new[df_new['type'] == 'fake']
reliable_articles = df_new[df_new['type'] == 'reliable']
fake_date_counts = fake_articles['date_count']
reliable_date_counts = reliable_articles['date_count']

# Totals: whole dataset first, then each label.
total_dates = df_new['date_count'].sum()
total_fake_dates = fake_date_counts.sum()
total_reliable_dates = reliable_date_counts.sum()

# Per-article minimum / maximum / mean within each label.
min_fake_dates = fake_date_counts.min()
max_fake_dates = fake_date_counts.max()
mean_fake_dates = fake_date_counts.mean()
min_reliable_dates = reliable_date_counts.min()
max_reliable_dates = reliable_date_counts.max()
mean_reliable_dates = reliable_date_counts.mean()

# Report: totals, then the 'fake' statistics, then the 'reliable' statistics.
print("Total DATEs in Content:", total_dates)
print("Total DATEs in 'fake' content:", total_fake_dates)
print("Total DATEs in 'reliable' content:", total_reliable_dates)
print("Minimum number of DATEs in 'fake' content:", min_fake_dates)
print("Maximum number of DATEs in 'fake' content:", max_fake_dates)
print("Mean number of DATEs in 'fake' content:", mean_fake_dates)
print("Minimum number of DATEs in 'reliable' content:", min_reliable_dates)
print("Maximum number of DATEs in 'reliable' content:", max_reliable_dates)
print("Mean number of DATEs in 'reliable' content:", mean_reliable_dates)

Function to count numbers via the `_num_` placeholder tag

In [None]:

# Count occurrences of the `_num_` placeholder token in a text
def count_num_with_tag(text):
    """Return the number of non-overlapping '_num_' matches in *text*."""
    return sum(1 for _match in re.finditer('_num_', text))

# Attach a per-article count of `_num_` placeholders found in the cleaned text.
df_new['num_count'] = df_new['content_clean'].apply(count_num_with_tag)

# Split the articles by label. To fold further labels into the 'fake' group,
# OR extra comparisons into the mask, e.g.
#   df_new[(df_new['type'] == 'fake') | (df_new['type'] == '')]
fake_articles = df_new[df_new['type'] == 'fake']
reliable_articles = df_new[df_new['type'] == 'reliable']
fake_num_counts = fake_articles['num_count']
reliable_num_counts = reliable_articles['num_count']

# Totals: whole dataset first, then each label.
total_nums = df_new['num_count'].sum()
total_fake_nums = fake_num_counts.sum()
total_reliable_nums = reliable_num_counts.sum()

# Per-article minimum / maximum / mean within each label.
min_fake_nums = fake_num_counts.min()
max_fake_nums = fake_num_counts.max()
mean_fake_nums = fake_num_counts.mean()
min_reliable_nums = reliable_num_counts.min()
max_reliable_nums = reliable_num_counts.max()
mean_reliable_nums = reliable_num_counts.mean()

# Report: totals, then the 'fake' statistics, then the 'reliable' statistics.
print("Total NUMs in Content:", total_nums)
print("Total NUMs in 'fake' content:", total_fake_nums)
print("Total NUMs in 'reliable' content:", total_reliable_nums)
print("Minimum number of NUMs in 'fake' content:", min_fake_nums)
print("Maximum number of NUMs in 'fake' content:", max_fake_nums)
print("Mean number of NUMs in 'fake' content:", mean_fake_nums)
print("Minimum number of NUMs in 'reliable' content:", min_reliable_nums)
print("Maximum number of NUMs in 'reliable' content:", max_reliable_nums)
print("Mean number of NUMs in 'reliable' content:", mean_reliable_nums)


Barplot for top 10000 most frequent clean words

In [None]:
import re
from collections import Counter

import matplotlib.pyplot as plt

# Word-frequency table over the 'content_clean' column. A token is a maximal
# run of word characters (regex \w+ delimited by word boundaries).
# `re` is imported here so this cell does not depend on an earlier cell
# having imported it.
word_pattern = re.compile(r'\b\w+\b')
clean_word_freq = Counter()

# Walk the column directly: DataFrame.iterrows() would construct a full
# Series for every row just to read this one field. The tokens go straight
# into the counter -- joining them with spaces and splitting again would
# yield the very same tokens.
for article_text in df_new['content_clean']:
    clean_word_freq.update(word_pattern.findall(article_text))

# (word, count) pairs, most frequent first; equal counts keep the order in
# which the words were first encountered.
sorted_clean_word_freq = clean_word_freq.most_common()

# Print the 100 most frequent words with their counts.
top_100_clean_words = sorted_clean_word_freq[:100]
print(top_100_clean_words)

# Keep the 10000 most frequent words for plotting.
top_10000_clean_words = sorted_clean_word_freq[:10000]

# Barplot for top 10000 most frequent clean words
plt.figure(figsize=(15, 6))
words, frequencies = zip(*top_10000_clean_words)
plt.bar(words, frequencies)
plt.yscale('log')  # logarithmic frequency axis
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 10000 Most Frequent Clean Words')
plt.xticks([])  # suppress the x tick labels (one per word)
plt.show()

Remove stopwords

In [None]:
import lib.process_methods as pm
import swifter
# Derive 'content_stopword' from 'content_clean' by applying
# pm.remove_stopwords to each article through swifter's apply.
content_clean_col = df_new['content_clean']
df_new['content_stopword'] = content_clean_col.swifter.apply(pm.remove_stopwords)

Stemming

In [None]:
import lib.process_methods as pm
import swifter
# Derive 'content_stem' from 'content_stopword' by applying
# pm.remove_word_variations to each article through swifter's apply.
content_stopword_col = df_new['content_stopword']
df_new['content_stem'] = content_stopword_col.swifter.apply(pm.remove_word_variations)

Save to new file

In [None]:
# Write the dataframe, including the derived columns, to a new CSV file.
# NOTE(review): the Task 3 cell derives this same output path from its `src`,
# so running it overwrites this file -- confirm that is intended.
cleaned_rows_path = 'data/995,000_rows_cleaned.csv'
df_new.to_csv(cleaned_rows_path)

In [None]:
# Report, for every column, how many entries are missing.
print("\nMissing Values in Each Column:")
missing_per_column = df_new.isna().sum()
print(missing_per_column)

Barplot for top 10000 most frequent clean words after preprocessing

In [None]:
import re
from collections import Counter

import matplotlib.pyplot as plt

# Word-frequency table over the 'content_stem' column, i.e. the text after
# cleaning, stopword removal and stemming. A token is a maximal run of word
# characters (regex \w+ delimited by word boundaries).
# `re` is imported here so this cell does not depend on an earlier cell
# having imported it.
word_pattern = re.compile(r'\b\w+\b')
clean_word_freq_after = Counter()

# Walk the column directly: DataFrame.iterrows() would construct a full
# Series for every row just to read this one field. The tokens go straight
# into the counter -- joining them with spaces and splitting again would
# yield the very same tokens.
for article_text_after in df_new['content_stem']:
    clean_word_freq_after.update(word_pattern.findall(article_text_after))

# (word, count) pairs, most frequent first; equal counts keep the order in
# which the words were first encountered.
sorted_clean_word_freq_after = clean_word_freq_after.most_common()

# Print the 100 most frequent words, one "word: count" line each.
top_100_clean_words_after = sorted_clean_word_freq_after[:100]

for word, frequency in top_100_clean_words_after:
    print(f"{word}: {frequency}")

# Keep the 10000 most frequent words for plotting.
top_10000_clean_words_after = sorted_clean_word_freq_after[:10000]

# Barplot for top 10000 most frequent words after preprocessing
plt.figure(figsize=(15, 6))
words, frequencies = zip(*top_10000_clean_words_after)
plt.bar(words, frequencies)
plt.yscale('log')  # logarithmic frequency axis
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 10000 Most Frequent Clean Words After Preprocessing')
plt.xticks([])  # suppress the x tick labels (one per word)
plt.show()

## Task 3: Apply preprocess to '995,000_rows.csv' dataset

In [None]:
import pandas as pd
import lib.process_b as process_b

# Input CSV. For a faster run, swap in one of the smaller files:
#   src = 'data/995,000_rows_SAMPLE.csv'
#   src = 'data/news_sample.csv'
src = 'data/995,000_rows.csv'

# Output path: the input path minus its last four characters (the ".csv"
# suffix of the paths above) with '_cleaned.csv' appended.
dst = src[:-4] + '_cleaned.csv'

# Load the data and run preprocessing variant B on it.
# NOTE(review): the return value is not used, so preprocess presumably
# modifies df in place -- confirm in lib/process_b.
df = pd.read_csv(src)
process_b.preprocess(df)

# Persist a CSV copy of the preprocessed dataframe.
df.to_csv(dst)

## Task 4: Split the dataset into a training, validation, and test splits

In [None]:
import pandas as pd
import lib.split as split

# Input CSV for the split. For a faster run, swap in one of the smaller files:
#   src = 'data/995,000_rows_SAMPLE.csv'
#   src = 'data/news_sample.csv'
src = 'data/995,000_rows.csv'

# Load the data and pass it to split.eighty_ten_ten.
# NOTE(review): this reads the raw CSV rather than the '_cleaned' output of
# Task 3, and the call's return value is discarded -- confirm both are
# intended (e.g. that the split helper persists its own results).
df = pd.read_csv(src)
split.eighty_ten_ten(df)