In [1]:
import pandas as pd

# create a new dataset only with feature columns to be used for the simple LogReg model

# Read the grouped training, validation and test splits from disk.
# The paths are listed in the same order as the names they are unpacked into.
grouped_sources = (
    '../data/training_data_grouped.csv',
    '../data/validation_data_grouped.csv',
    '../data/test_data_grouped.csv',
)
loaded_frames = []
for src in grouped_sources:
    loaded_frames.append(pd.read_csv(src))
training_data, validation_data, test_data = loaded_frames

In [2]:
# Drop the first three columns (selected by position) from every split.
# NOTE(review): the drop is positional and in place, so re-running this cell
# removes three more columns each time — run it exactly once per load.
for frame in (training_data, validation_data, test_data):
    leading_columns = frame.columns[[0, 1, 2]]
    frame.drop(leading_columns, axis=1, inplace=True)

In [3]:
# Make grouped types ('fake' or 'reliable') into 'true' or 'false' values
def bool_dummies(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Append dummy (indicator) columns for ``df[col]`` to ``df``.

    The indicator columns are produced by ``pd.get_dummies`` with
    ``drop_first=True``, so the first category level is dropped and a
    two-level column yields a single indicator column named after the
    remaining level.

    Args:
        df: Frame containing the categorical column.
        col: Name of the column to encode.

    Returns:
        A new frame: ``df`` with the indicator columns concatenated on the
        right. ``df`` itself is not modified.

    Raises:
        KeyError: If ``col`` is not a column of ``df``.
    """
    # Encode the column the caller asked for; this used to be hard-coded to
    # 'type', which silently ignored the ``col`` argument.
    indicator_columns = pd.get_dummies(df[col], drop_first=True)
    return pd.concat([df, indicator_columns], axis=1)


# Add the indicator column(s) derived from 'type' to each split; the result
# is intended as the target (y) when training the model.
# Each name is rebound to the widened frame returned by bool_dummies.
training_data, validation_data, test_data = (
    bool_dummies(frame, 'type')
    for frame in (training_data, validation_data, test_data)
)

# save to file
# training_data.to_csv('data/training_data_features.csv')
# validation_data.to_csv('data/validation_data_features.csv')
# test_data.to_csv('data/test_data_features.csv')

In [4]:
import re

# Function to count tags, e.g. NUMs with <NUM> tag
def count_tag(text: str, tag: str) -> int:
    """Count the non-overlapping matches of ``tag`` in ``text``.

    ``tag`` is handed to the ``re`` module as a pattern, so any regular
    expression metacharacters it contains are interpreted, not matched
    literally.
    """
    return sum(1 for _ in re.finditer(tag, text))

# Placeholder tokens whose occurrences are counted in the cleaned text.
num_tag = '_num_'
date_tag = '_date_'
url_tag = '_url_'

# One count column per placeholder, added to every split.
tag_columns = (
    ('num_count', num_tag),
    ('date_count', date_tag),
    ('url_count', url_tag),
)
for column, tag in tag_columns:
    for frame in (training_data, validation_data, test_data):
        frame[column] = frame['content_clean'].apply(count_tag, tag=tag)

# save to file
# training_data.to_csv('data/training_data_features.csv')
# validation_data.to_csv('data/validation_data_features.csv')
# test_data.to_csv('data/test_data_features.csv')

In [5]:
# Function to count single char in string
def count_char(text: str, char: str) -> int:
    """Return how many times ``char`` occurs in ``text``.

    Args:
        text: String to search.
        char: Character (or substring) to count; occurrences are counted
            without overlap, as ``str.count`` does.

    Returns:
        The number of occurrences of ``char`` in ``text``.
    """
    # Count the requested character; this used to count ',' regardless of
    # ``char``, so every caller received the comma count.
    return text.count(char)

# Characters passed to count_char for the per-article punctuation columns.
comma = ','
exclm = '!'  # exclamation point

# Each column is filled by calling count_char with that column's character.
char_columns = (
    ('comma_count', comma),
    ('exclm_count', exclm),
)
for column, char in char_columns:
    for frame in (training_data, validation_data, test_data):
        frame[column] = frame['content_clean'].apply(count_char, char=char)

In [6]:
import nltk
import swifter

# Count unique words in text (word frequency of content_clean)
def get_word_freq(text: str) -> int:
    """Return the number of distinct tokens in ``text``.

    The text is tokenized with ``nltk.word_tokenize``; tokens are compared
    exactly as the tokenizer returns them. Despite the name, the result is
    a count of unique tokens, not a frequency table.
    """
    return len(set(nltk.word_tokenize(text)))

# Distinct-token count of the cleaned text, computed for every split.
for frame in (training_data, validation_data, test_data):
    frame['content_word_freq'] = frame['content_clean'].swifter.apply(get_word_freq)

  from .autonotebook import tqdm as notebook_tqdm
Pandas Apply: 100%|██████████| 346868/346868 [06:15<00:00, 922.81it/s] 
Pandas Apply: 100%|██████████| 43573/43573 [00:48<00:00, 898.94it/s] 
Pandas Apply: 100%|██████████| 43479/43479 [00:48<00:00, 889.29it/s] 


In [7]:
import nltk
import swifter

# Count unique words in text (word frequency of content_clean)
def get_word_freq(text: str) -> int:
    """Return the number of distinct tokens in ``text``.

    Tokenization is done by ``nltk.word_tokenize``. The result counts unique
    tokens, not how often each one occurs.

    NOTE(review): this redefines a same-named function from an earlier cell
    with the same behaviour; one of the two definitions is redundant.
    """
    return len(set(nltk.word_tokenize(text)))

# Distinct-token counts of the stopword-filtered text and of the stemmed text.
# Pairs are (new feature column, source text column).
derived_text_columns = (
    ('stop_word_freq', 'content_stopword'),
    ('stem_word_freq', 'content_stem'),
)
for feature, text_column in derived_text_columns:
    for frame in (training_data, validation_data, test_data):
        frame[feature] = frame[text_column].swifter.apply(get_word_freq)

Pandas Apply: 100%|██████████| 346868/346868 [04:57<00:00, 1164.37it/s]
Pandas Apply: 100%|██████████| 43573/43573 [00:37<00:00, 1162.24it/s]
Pandas Apply: 100%|██████████| 43479/43479 [00:37<00:00, 1155.88it/s]
Pandas Apply: 100%|██████████| 346868/346868 [04:37<00:00, 1252.09it/s]
Pandas Apply: 100%|██████████| 43573/43573 [00:35<00:00, 1239.04it/s]
Pandas Apply: 100%|██████████| 43479/43479 [00:35<00:00, 1227.44it/s]


In [9]:
# Persist the current feature frames, one CSV per split.
# to_csv runs with its defaults, so the DataFrame index is written as an
# extra leading column in each file.
feature_outputs = (
    (training_data, '../data/training_data_features.csv'),
    (validation_data, '../data/validation_data_features.csv'),
    (test_data, '../data/test_data_features.csv'),
)
for frame, destination in feature_outputs:
    frame.to_csv(destination)

In [10]:
# Percentage drop in distinct-token count caused by stopword removal,
# rounded to 3 decimals, for every split.
# NOTE(review): a content_word_freq of 0 produces NaN/inf here — confirm
# that downstream steps handle such rows.
for frame in (training_data, validation_data, test_data):
    col_a = frame['content_word_freq']
    col_b = frame['stop_word_freq']
    frame['stop_reduction_rate'] = (((col_a - col_b) / col_a) * 100).round(3)

In [11]:
# Percentage drop in distinct-token count between the cleaned text and the
# stemmed text, rounded to 3 decimals, for every split.
# NOTE(review): a content_word_freq of 0 produces NaN/inf here — confirm
# that downstream steps handle such rows.
for frame in (training_data, validation_data, test_data):
    col_a = frame['content_word_freq']
    col_b = frame['stem_word_freq']
    frame['stem_reduction_rate'] = (((col_a - col_b) / col_a) * 100).round(3)

In [12]:
# Persist the feature frames again now that the reduction-rate columns exist.
# to_csv runs with its defaults, so the DataFrame index is written as an
# extra leading column in each file.
feature_outputs = (
    (training_data, '../data/training_data_features.csv'),
    (validation_data, '../data/validation_data_features.csv'),
    (test_data, '../data/test_data_features.csv'),
)
for frame, destination in feature_outputs:
    frame.to_csv(destination)

In [13]:
# Average number of words per sentence, per article

import swifter

def average_sentence_length(text):
    """Return the average number of words per sentence, truncated to an int.

    Sentences are approximated by splitting ``text`` on '.'; words are the
    whitespace-separated pieces of each sentence. Pieces that contain no
    words are left out of the sentence count. Returns 0 when no piece
    contains a word.
    """
    # Word count of every '.'-delimited piece, keeping only non-empty ones.
    word_counts = [len(piece.split()) for piece in text.split('.')]
    non_empty_counts = [count for count in word_counts if count > 0]

    if not non_empty_counts:
        return 0

    # int() truncates the mean toward zero rather than rounding it.
    return int(sum(non_empty_counts) / len(non_empty_counts))

# Average sentence length is computed from the 'content' column of each split.
for frame in (training_data, validation_data, test_data):
    frame['average_sentence_length'] = frame['content'].swifter.apply(average_sentence_length)

Pandas Apply: 100%|██████████| 346868/346868 [00:06<00:00, 50249.39it/s]
Pandas Apply: 100%|██████████| 43573/43573 [00:00<00:00, 49500.48it/s]
Pandas Apply: 100%|██████████| 43479/43479 [00:00<00:00, 50528.66it/s]


In [14]:
# Meta feature: True where the 'authors' field is present (not null).
for frame in (training_data, validation_data, test_data):
    frame['has_author'] = frame['authors'].notnull()

In [15]:
# Persist the feature frames with all columns added so far.
# to_csv runs with its defaults, so the DataFrame index is written as an
# extra leading column in each file.
feature_outputs = (
    (training_data, '../data/training_data_features.csv'),
    (validation_data, '../data/validation_data_features.csv'),
    (test_data, '../data/test_data_features.csv'),
)
for frame, destination in feature_outputs:
    frame.to_csv(destination)