In [None]:
import pandas as pd

# Load the raw dataset into `raw_data`. Later cells in this notebook read its
# 'type', 'title', 'content' and 'content_clean' columns.
# NOTE(review): the path is relative, so the notebook presumably has to be run
# from the directory that contains `data/` — confirm.
src = 'data/995,000_rows_cleaned.csv'
# Alternative input, presumably a smaller sample for quick runs
# (uncomment to use it instead of the file above):
# src = 'data/SAMPLE.csv'
raw_data = pd.read_csv(src)

### Number of ',' (commas) in fake news vs. reliable news

In [None]:
def count_comma(text):
    """Return the number of ',' characters in ``text``.

    Parameters
    ----------
    text : str or missing value
        Article text. Anything that is not a ``str`` (for example the float
        NaN that pandas stores for an empty CSV field, or ``None``) is treated
        as empty text.

    Returns
    -------
    int
        Number of comma characters; 0 for non-string input.
    """
    # Missing text arrives as float NaN, which has no .count() method.
    if not isinstance(text, str):
        return 0
    return text.count(',')

# Store the per-article comma tally in a new 'comma_count' column
raw_data['comma_count'] = raw_data['content_clean'].map(count_comma)


In [None]:
# Split the corpus into the two classes being compared
fake_articles = raw_data.loc[raw_data['type'] == 'fake']
reliable_articles = raw_data.loc[raw_data['type'] == 'reliable']

# Comma statistics for the 'fake' class
total_fake_nums = fake_articles['comma_count'].sum()
min_fake_nums = fake_articles['comma_count'].min()
max_fake_nums = fake_articles['comma_count'].max()
mean_fake_nums = fake_articles['comma_count'].mean()

# Comma statistics for the 'reliable' class
total_reliable_nums = reliable_articles['comma_count'].sum()
min_reliable_nums = reliable_articles['comma_count'].min()
max_reliable_nums = reliable_articles['comma_count'].max()
mean_reliable_nums = reliable_articles['comma_count'].mean()

# Report: both totals first, then min / max / mean per class
print("Total commas in 'fake' content:", total_fake_nums)
print("Total commas in 'reliable' content:", total_reliable_nums)

print("Minimum number of commas in 'fake' content:", min_fake_nums)
print("Maximum number of commas in 'fake' content:", max_fake_nums)
print("Mean number of commas in 'fake' content:", mean_fake_nums)

print("Minimum number of commas in 'reliable' content:", min_reliable_nums)
print("Maximum number of commas in 'reliable' content:", max_reliable_nums)
print("Mean number of commas in 'reliable' content:", mean_reliable_nums)

In [None]:
import matplotlib.pyplot as plt

# Article labels to compare, in the order they appear on the x-axis
types = [
    'reliable',
    'political',
    'bias',
    'fake',
    'conspiracy',
    'rumor',
    'unknown',
    'unreliable',
    'clickbait',
    'junksci',
    'satire',
    'hate',
]

# Mean comma count per label (the list keeps its historical name `sums`).
# A comprehension keeps the loop variable local, so the built-ins `type` and
# `sum` are not rebound in the notebook namespace.
sums = [
    raw_data.loc[raw_data['type'] == label, 'comma_count'].mean()
    for label in types
]

# plot data
fig, ax = plt.subplots()
ax.bar(types, sums)

# Rotate the label names so all twelve stay readable
ax.tick_params(axis='x', labelrotation=90)

ax.set_ylabel('mean')
ax.set_title("',' characters in all article types")

plt.show()

### Length of sentences in reliable news vs. fake news

In [None]:
import swifter

def average_sentence_length(text):
    """Return the mean number of words per sentence in ``text``.

    Sentences are the pieces obtained by splitting ``text`` on '.'; words are
    the whitespace-separated tokens of each piece. Pieces that contain no word
    (for example between consecutive dots) are not counted as sentences.

    Parameters
    ----------
    text : str or missing value
        Article text. Anything that is not a ``str`` (for example the float
        NaN that pandas stores for an empty CSV field, or ``None``) is treated
        as text without sentences.

    Returns
    -------
    float or int
        Total word count divided by the number of non-empty sentences, or 0
        when there is no non-empty sentence.
    """
    # Missing text arrives as float NaN, which has no .split() method.
    if not isinstance(text, str):
        return 0

    # Plain accumulation on purpose: notebook cells may rebind built-in names
    # such as `sum`, so this function does not rely on them.
    total_words = 0
    sentence_count = 0
    for piece in text.split('.'):
        n_words = len(piece.split())
        if n_words:  # skip empty pieces
            total_words += n_words
            sentence_count += 1

    if sentence_count == 0:
        return 0
    return total_words / sentence_count

# Compute the per-article average sentence length and store it in a new
# 'average_sentence_length' column. The call goes through the `.swifter`
# accessor (see `import swifter` above) instead of a plain `.apply`.
# NOTE(review): this reads the 'content' column, whereas the comma count reads
# 'content_clean'. The function splits on '.', so it presumably needs text
# with punctuation intact — confirm that this is why 'content' is used.
raw_data['average_sentence_length'] = raw_data['content'].swifter.apply(average_sentence_length)

In [None]:
# Split the corpus into the two classes being compared
fake_articles = raw_data.loc[raw_data['type'] == 'fake']
reliable_articles = raw_data.loc[raw_data['type'] == 'reliable']

# Average-sentence-length statistics for the 'fake' class
min_fake_nums = fake_articles['average_sentence_length'].min()
max_fake_nums = fake_articles['average_sentence_length'].max()
mean_fake_nums = fake_articles['average_sentence_length'].mean()

# Average-sentence-length statistics for the 'reliable' class
min_reliable_nums = reliable_articles['average_sentence_length'].min()
max_reliable_nums = reliable_articles['average_sentence_length'].max()
mean_reliable_nums = reliable_articles['average_sentence_length'].mean()

# Report min / max / mean, 'fake' first and 'reliable' second
print("Minimum average length in 'fake' content:", min_fake_nums)
print("Maximum average length in 'fake' content:", max_fake_nums)
print("Mean average length in 'fake' content:", mean_fake_nums)

print("Minimum average length in 'reliable' content:", min_reliable_nums)
print("Maximum average length in 'reliable' content:", max_reliable_nums)
print("Mean average length in 'reliable' content:", mean_reliable_nums)

In [None]:
import matplotlib.pyplot as plt

# Article labels to compare, in the order they appear on the x-axis
types = [
    'reliable',
    'political',
    'bias',
    'fake',
    'conspiracy',
    'rumor',
    'unknown',
    'unreliable',
    'clickbait',
    'junksci',
    'satire',
    'hate',
]

# Mean of the per-article average sentence length for each label (the list
# keeps its historical name `sums`).
# A comprehension keeps the loop variable local, so the built-ins `type` and
# `sum` are not rebound in the notebook namespace.
sums = [
    raw_data.loc[raw_data['type'] == label, 'average_sentence_length'].mean()
    for label in types
]

# plot data
fig, ax = plt.subplots()
ax.bar(types, sums)

# Rotate the label names so all twelve stay readable
ax.tick_params(axis='x', labelrotation=90)

ax.set_ylabel('mean')
ax.set_title('Average sentence length')  # title typo 'Avarage' corrected

plt.show()

### Do fake news articles have fewer titles than reliable news articles?

In [None]:
# Share of articles per type whose title is present: flag non-null titles
# first, then average the flags within each type.
title_counts = raw_data['title'].notnull().groupby(raw_data['type']).mean()
print(title_counts)