In [None]:
import pandas as pd

# Build the feature columns to be used for the simple LogReg model.

# Load the cleaned BBC articles; the first CSV column is used as the DataFrame index.
src = '../data/articles_cleaned.csv'
training_data = pd.read_csv(src, index_col=0)

In [None]:
# Turn a categorical column (e.g. 'type' with 'fake'/'reliable') into indicator columns
def bool_dummies(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Append indicator (dummy) columns for ``df[col]`` to a copy of ``df``.

    Args:
        df: Frame holding the categorical column.
        col: Name of the column to encode.

    Returns:
        A new DataFrame with the original columns followed by the indicator
        columns produced by ``pd.get_dummies(..., drop_first=True)``. The first
        category level is dropped, so a two-valued column yields a single
        indicator column named after the remaining category.

    Raises:
        KeyError: If ``col`` is not a column of ``df``.
    """
    # Encode the requested column; 'type' used to be hard-coded and `col` was ignored.
    indicators = pd.get_dummies(df[col], drop_first=True)
    return pd.concat([df, indicators], axis=1)


# Boolean 'reliable' column, to be used as the target (y) when training the model.
# Every row of the training data is marked as reliable (True).
training_data['reliable'] = True

In [None]:
import re

# Count how often a tag pattern (e.g. the '_num_' placeholder) occurs in a text
def count_tag(text: str, tag: str) -> int:
    """Return the number of non-overlapping matches of ``tag`` in ``text``.

    ``tag`` is handed to the ``re`` module unchanged, so it is interpreted as a
    regular expression rather than as a literal string.
    """
    return sum(1 for _match in re.finditer(tag, text))

# Placeholder tokens to count in the cleaned article text
num_tag = '_num_'
date_tag = '_date_'
url_tag = '_url_'

# Number of '_num_' placeholders per article
training_data['num_count'] = training_data['content_clean'].apply(count_tag, args=(num_tag,))

# Number of '_date_' placeholders per article
training_data['date_count'] = training_data['content_clean'].apply(count_tag, args=(date_tag,))

# Number of '_url_' placeholders per article
training_data['url_count'] = training_data['content_clean'].apply(count_tag, args=(url_tag,))

In [None]:
# Function to count occurrences of a single character in a string
def count_char(text: str, char: str) -> int:
    """Return how many times ``char`` occurs in ``text``.

    Args:
        text: String to search.
        char: Character (or substring) to count; occurrences are counted
            without overlap, as done by ``str.count``.

    Returns:
        The number of occurrences of ``char`` in ``text``.
    """
    # Count the requested character; the literal ',' used before ignored `char`,
    # so every caller received the comma count.
    return text.count(char)

# Characters whose occurrences are counted per article
comma = ','
exclm = '!'

# Number of commas in each article
training_data['comma_count'] = training_data['content_clean'].apply(count_char, args=(comma,))

# Number of exclamation points in each article
training_data['exclm_count'] = training_data['content_clean'].apply(count_char, args=(exclm,))

In [None]:
import nltk
import swifter

# Number of distinct tokens in a text (vocabulary size, not a per-word frequency)
def get_word_freq(text: str) -> int:
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the count of distinct tokens."""
    distinct_tokens = {*nltk.word_tokenize(text)}
    return len(distinct_tokens)

# Distinct-token count of the cleaned article text, applied row-wise through swifter
training_data['content_word_freq'] = training_data['content_clean'].swifter.apply(get_word_freq)

In [None]:
import nltk
import swifter

# Number of distinct tokens in a text (vocabulary size, not a per-word frequency)
# NOTE(review): duplicates the `get_word_freq` defined in the preceding cell;
# consider defining it only once.
def get_word_freq(text: str) -> int:
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the count of distinct tokens."""
    distinct_tokens = {*nltk.word_tokenize(text)}
    return len(distinct_tokens)

# Distinct-token count of the 'content_stopword' column (text after stopword removal)
training_data['stop_word_freq'] = training_data['content_stopword'].swifter.apply(get_word_freq)

# Distinct-token count of the 'content_stem' column (text after stemming)
training_data['stem_word_freq'] = training_data['content_stem'].swifter.apply(get_word_freq)

In [None]:
# Stopword-removal reduction rate (training data): percentage drop in distinct
# tokens relative to the cleaned text, rounded to 3 decimals
col_a = training_data['content_word_freq']
col_b = training_data['stop_word_freq']
training_data['stop_reduction_rate'] = (col_a - col_b).div(col_a).mul(100).round(3)

In [None]:
# Stemming reduction rate (training data): percentage drop in distinct
# tokens relative to the cleaned text, rounded to 3 decimals
col_a = training_data['content_word_freq']
col_b = training_data['stem_word_freq']
training_data['stem_reduction_rate'] = (col_a - col_b).div(col_a).mul(100).round(3)

In [None]:
# Average number of words per sentence, computed per article

import swifter

def average_sentence_length(text):
    """Return the mean number of words per sentence in ``text``, truncated to an int.

    Sentences are the pieces obtained by splitting ``text`` on '.'; words are
    the whitespace-separated pieces of each sentence. Pieces that contain no
    word are ignored, and 0 is returned when no piece contains a word.
    """
    # Word count of every piece between full stops
    word_counts = [len(piece.split()) for piece in text.split('.')]

    # Keep only the pieces that actually contain words
    word_counts = [count for count in word_counts if count > 0]

    # Guard clause: nothing to average
    if not word_counts:
        return 0

    return int(sum(word_counts) / len(word_counts))

# Apply to the 'content' column, row-wise through swifter
training_data['average_sentence_length'] = training_data['content'].swifter.apply(average_sentence_length)

In [None]:
# Meta feature: True when the 'authors' field of an article is not missing
training_data['has_author'] = training_data['authors'].notna()

In [None]:
# Save the DataFrame, including the feature columns added above, to a CSV file
dst = '../data/articles_features.csv'
training_data.to_csv(dst)