# Part 1: Data Processing

## Task 1: Preprocessing 'news_sample.csv'

In [None]:
import pandas as pd
import lib.process_c as process_c

# Fetch the FakeNewsCorpus sample file straight from its GitHub repository.
SAMPLE_URL = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'
df_sample = pd.read_csv(SAMPLE_URL)

# Run the process_c pipeline on the dataframe (described in this notebook as
# cleanup -> stopword removal -> stemming). The return value is not used; the
# notebook text states that results are stored on the input dataframe itself.
process_c.preprocess(df_sample)

# Persist a CSV copy of the preprocessed dataframe.
CLEANED_SAMPLE_PATH = "data/news_sample_cleaned.csv"
df_sample.to_csv(CLEANED_SAMPLE_PATH)

# pd.reset_option('display.max_rows')
# pd.set_option('display.max_colwidth', 150)


### Vocabulary sizes and reduction rates

The vocabulary sizes and reduction rates are computed by `process_c.preprocess`, which stores the results directly in the input dataframe.

## Task 2

### Make at least three non-trivial observations/discoveries about the data

In [None]:
import pandas as pd

# Load the dataset stored in 'data/995,000_rows.csv' into df_new; the
# following exploration cells all read from and add columns to this dataframe.
df_new = pd.read_csv('data/995,000_rows.csv')


In [None]:
# Replace missing article bodies with empty strings so that every value
# handed to clean_text below is a str rather than NaN.
content = df_new['content'].fillna('')
df_new['content'] = content

# Run clean_text on every article body and keep the result in a new column.
df_new['content_clean'] = content.apply(clean_text)

In [None]:
# Count the lowercase '<url>' placeholders in a piece of text.
def count_urls_with_tag(text):
    """Return how many times the literal placeholder ``<url>`` occurs in *text*.

    The match is case-sensitive and counts non-overlapping occurrences. For
    this metacharacter-free pattern that is exactly what
    ``len(re.findall('<url>', text))`` yields, but ``str.count`` does not
    require the ``re`` module to be imported.

    Args:
        text: String to scan (in this notebook, one cleaned article body).

    Returns:
        The number of ``<url>`` occurrences as an int; 0 for an empty string.
    """
    return text.count('<url>')

# Per-article number of '<url>' placeholders, stored as a new column.
url_counts = df_new['content_clean'].apply(count_urls_with_tag)
df_new['url_count'] = url_counts

# Corpus-wide total of URL placeholders.
total_urls = url_counts.sum()
print("Total URLs in Content:", total_urls)

In [None]:
# Count the lowercase '<date>' placeholders in a piece of text.
def count_date_with_tag(text):
    """Return how many times the literal placeholder ``<date>`` occurs in *text*.

    The match is case-sensitive and counts non-overlapping occurrences. For
    this metacharacter-free pattern that is exactly what
    ``len(re.findall('<date>', text))`` yields, but ``str.count`` does not
    require the ``re`` module to be imported.

    Args:
        text: String to scan (in this notebook, one cleaned article body).

    Returns:
        The number of ``<date>`` occurrences as an int; 0 for an empty string.
    """
    return text.count('<date>')

# Per-article number of '<date>' placeholders, stored as a new column.
date_counts = df_new['content_clean'].apply(count_date_with_tag)
df_new['date_count'] = date_counts

# Corpus-wide total of date placeholders.
total_dates = date_counts.sum()
print("Total DATEs in Content:", total_dates)

In [None]:

# Count the lowercase '<num>' placeholders in a piece of text.
def count_num_with_tag(text):
    """Return how many times the literal placeholder ``<num>`` occurs in *text*.

    The match is case-sensitive and counts non-overlapping occurrences. For
    this metacharacter-free pattern that is exactly what
    ``len(re.findall('<num>', text))`` yields, but ``str.count`` does not
    require the ``re`` module to be imported.

    Args:
        text: String to scan (in this notebook, one cleaned article body).

    Returns:
        The number of ``<num>`` occurrences as an int; 0 for an empty string.
    """
    return text.count('<num>')

# Per-article number of '<num>' placeholders, stored as a new column.
num_counts = df_new['content_clean'].apply(count_num_with_tag)
df_new['num_count'] = num_counts

# Corpus-wide total of number placeholders.
total_nums = num_counts.sum()
print("Total NUMs in Content:", total_nums)

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# `re` is used below; import it here so this cell does not depend on some
# other cell having imported it first.
import re

# Running tally of how often each word occurs across all cleaned articles.
clean_word_freq = Counter()

# Walk the 'content_clean' column directly: only this one column is read, so
# materialising a full row Series per article with iterrows() is unnecessary.
for cleaned_article in df_new['content_clean']:
    # Keep only the word tokens of this article, re-joined by single spaces.
    # NOTE(review): assigning to `clean_text` rebinds the same name that an
    # earlier cell calls as a function (`.apply(clean_text)`), so that cell
    # cannot be re-run after this one without re-defining the function. The
    # name is kept so that existing notebook state is unchanged; consider
    # renaming it.
    clean_text = ' '.join(re.findall(r'\b\w+\b', cleaned_article))

    # Add this article's tokens to the tally.
    clean_word_freq.update(clean_text.split())

# (word, count) pairs ordered from most to least frequent; ties keep the
# order in which the words were first seen.
sorted_clean_word_freq = clean_word_freq.most_common()

# The 100 most frequent words, printed as a list of (word, count) pairs.
top_100_clean_words = sorted_clean_word_freq[:100]
print(top_100_clean_words)

# The 10000 most frequent words, used for the plot below.
top_10000_clean_words = sorted_clean_word_freq[:10000]

# Bar plot of the 10000 most frequent words on a log-scaled frequency axis.
# The x tick labels are switched off because that many word labels would be
# unreadable.
plt.figure(figsize=(15, 6))
words, frequencies = zip(*top_10000_clean_words)
plt.bar(words, frequencies)
plt.yscale('log')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 10000 Most Frequent Clean Words')
plt.xticks([])
plt.show()

In [None]:
# Run remove_stopwords on every value of the 'content_clean' column and store
# the result in a new 'content_stopword' column.
df_new['content_stopword'] = df_new['content_clean'].apply(remove_stopwords)

In [None]:
# Run remove_word_variations on every value of the 'content_stopword' column
# and store the result in a new 'content_stem' column (presumably stemming,
# going by the column name; confirm against the helper's definition).
df_new['content_stem'] = df_new['content_stopword'].apply(remove_word_variations)

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# `re` is used below; import it here so this cell does not depend on some
# other cell having imported it first.
import re

# Running tally of how often each word occurs across the 'content_stem' column.
clean_word_freq_after = Counter()

# Walk the 'content_stem' column directly: only this one column is read, so
# materialising a full row Series per article with iterrows() is unnecessary.
for stemmed_article in df_new['content_stem']:
    # Keep only the word tokens of this article, re-joined by single spaces.
    clean_text_after = ' '.join(re.findall(r'\b\w+\b', stemmed_article))

    # Count the tokens of *this* article. Using any other variable here (such
    # as a leftover string from a previous cell) would make the tally
    # independent of 'content_stem'.
    clean_word_freq_after.update(clean_text_after.split())

# (word, count) pairs ordered from most to least frequent; ties keep the
# order in which the words were first seen.
sorted_clean_word_freq_after = clean_word_freq_after.most_common()

# The 100 most frequent words, printed one "word: count" pair per line.
top_100_clean_words_after = sorted_clean_word_freq_after[:100]

for word, frequency in top_100_clean_words_after:
    print(f"{word}: {frequency}")

# The 10000 most frequent words, used for the plot below.
top_10000_clean_words_after = sorted_clean_word_freq_after[:10000]

# Bar plot of the 10000 most frequent words. The x tick labels are switched
# off because that many word labels would be unreadable.
plt.figure(figsize=(12, 6))
words, frequencies = zip(*top_10000_clean_words_after)
plt.bar(words, frequencies)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 10000 Most Frequent Clean Words After Preprocessing')
plt.xticks([])
plt.show()

## Task 3: Apply preprocess to '995,000_rows.csv' dataset

In [None]:
import pandas as pd
import lib.process_b as process_b

import os

# Input file; switch to the commented alternative for a quick run on the sample.
src = 'data/995,000_rows.csv'
# src = 'data/news_sample.csv'

# Output path: the input path without its extension, plus '_cleaned.csv'.
# os.path.splitext is used instead of slicing off a fixed four characters so
# the stem is still correct if the input extension has a different length.
dst = os.path.splitext(src)[0] + '_cleaned.csv'

# Load the data and run the process_b pipeline on it. The return value is not
# used, so the call is expected to modify df in place; confirm against
# lib.process_b.
df = pd.read_csv(src)
process_b.preprocess(df)

# Save a CSV copy of the preprocessed dataframe.
df.to_csv(dst)

## Task 4: Split the dataset into a training, validation, and test splits

In [None]:
import pandas as pd
import lib.split as split

# Input file for the split; the commented alternatives are smaller files for
# quick runs.
# NOTE(review): this reads the raw CSV, not the '_cleaned.csv' file written by
# the Task 3 cell. Confirm that this is intended.
src = 'data/995,000_rows.csv'
# src = 'data/995,000_rows_SAMPLE.csv'
# src = 'data/news_sample.csv'

# Load the data and hand it to split.eighty_ten_ten.
# NOTE(review): the return value is discarded, so the helper presumably
# persists or otherwise exposes the training/validation/test splits itself.
# Verify against lib.split.
df = pd.read_csv(src)
split.eighty_ten_ten(df)