In [None]:
!pip install gdown pandas nltk pandarallel spacy autocorrect

In [None]:
import pandas as pd
import nltk

## Download base dataset
The dataset is the Blog Authorship Corpus from https://www.kaggle.com/datasets/rtatman/blog-authorship-corpus. It contains over 700,000 blog posts together with metadata about each author and blog.

In [None]:
# Download blogtext.csv (700 000 blog posts)
!gdown 1PJbVYUmRr0_HTwGNtplnu8lG-UCDoXZJ

## Import data

### Import from csv-file using Pandas
Specify the number of blog posts to process, as the processing can take time

In [None]:
# Upper bound on the number of CSV rows (blog posts) that are loaded
N_BLOGPOSTS = 10_000

# Only the first N_BLOGPOSTS rows of the file are read
df = pd.read_csv(filepath_or_buffer='blogtext.csv', nrows=N_BLOGPOSTS)

### Tokenize sentences
This splits every blog post into a list of sentences and then explodes that list, so that each sentence ends up in its own row of the dataframe. After this, all rows whose text is not a string are removed.

In [None]:
# nltk.sent_tokenize needs the Punkt models; newer NLTK releases look for
# 'punkt_tab', older ones for 'punkt'. Unknown package names are ignored quietly.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Drop posts whose text is not a string (e.g. NaN from an empty CSV field)
# BEFORE tokenizing, because sent_tokenize raises on non-string input.
df = df[df['text'].apply(lambda x: isinstance(x, str))].copy()

# Split every post into a list of sentences
df['text'] = df['text'].apply(nltk.sent_tokenize)

# Explode array into separate rows
df = df.explode('text')

# Remove all sentences not recognized as strings (a post with no sentences
# explodes into a single NaN row)
mask = df['text'].apply(lambda x: isinstance(x, str))
df = df[mask]

# Remove all sentences not containing anything
df = df[df['text'] != '']

## Preprocess and clean text
This step cleans the text and removes unwanted sentences from the dataset. Sentences containing non-English words may be removed, and stop words can be stripped from a configurable fraction of the sentences to reduce their occurrence in the dataset.

### Specify cleaning parameters

In [None]:
# Toggle random stop-word removal
remove_stopwords = False
# Passed to clean_corpus as prob_remove_stopword (a probability between 0 and 1)
rem_stopword_percent = 0.0
# Drop sentences that contain a word missing from the English word list
remove_sentence_with_unknown_words = True
# Sentences with fewer tokens than this are dropped
min_word_count = 4

In [None]:
from nltk import word_tokenize
from nltk.corpus import words as en_words
from nltk.corpus import stopwords

import spacy

import re
import random

from pandarallel import pandarallel

# The NLTK corpora used for filtering are not bundled with the nltk package;
# fetch them so the notebook also runs on a fresh environment.
nltk.download('stopwords', quiet=True)
nltk.download('words', quiet=True)

# spaCy pipeline used to lemmatize sentences
spacy_nlp = spacy.load("en_core_web_sm")
# Set for O(1) stop-word membership tests
stop_words = set(stopwords.words('english'))

### Define some methods for text-cleaning and filtering

In [None]:
# Lazily built cache of the NLTK English word list (filled on first use by
# contains_non_lexi_word, so the corpus is read once per process, not per call).
_english_vocab = None


def contains_non_lexi_word(sentence):
    """Tell whether ``sentence`` contains a word missing from the NLTK word list.

    All characters other than ASCII letters and whitespace are stripped, the
    remainder is run through the spaCy pipeline, and the lower-cased lemma of
    every token is looked up in ``nltk.corpus.words``.

    Args:
        sentence: The sentence to check.

    Returns:
        True as soon as one lemma is not in the word list, otherwise False
        (including for sentences without any word tokens).
    """
    global _english_vocab
    if _english_vocab is None:
        # A frozenset gives O(1) membership tests instead of scanning a list.
        _english_vocab = frozenset(en_words.words())

    letters_only = re.sub(r'[^a-zA-Z\s]', '', sentence)

    for token in spacy_nlp(letters_only):
        # Stripping characters can leave runs of whitespace, which spaCy
        # emits as separate tokens; those are not words and must not count.
        if token.is_space:
            continue
        if token.lemma_.lower() not in _english_vocab:
            return True

    return False
        
def contains_number(sentence):
    """Return True if at least one character of ``sentence`` is a digit."""
    for ch in sentence:
        if ch.isdigit():
            return True
    return False

def word_count(text):
    """Return how many tokens ``word_tokenize`` produces for ``text``."""
    return len(word_tokenize(text))

In [None]:
def clean_corpus(corpus_df, rm_contains_unknown_wrd=True, rm_contains_num=True, min_word_count=2, remove_stopwords=True, prob_remove_stopword=0.1):
    """Normalize the sentences in ``corpus_df['text']`` and drop unwanted rows.

    Every sentence is first cleaned (lower-cased, punctuation stripped,
    re-tokenized and optionally stripped of stop words) and then filtered.
    The input frame is left untouched; a new frame is returned.

    Args:
        corpus_df: DataFrame with a ``text`` column holding one sentence per row.
        rm_contains_unknown_wrd: Drop sentences for which
            ``contains_non_lexi_word`` is truthy.
        rm_contains_num: Drop sentences that contain a digit.
        min_word_count: Drop sentences with fewer tokens than this.
        remove_stopwords: Enable random stop-word removal.
        prob_remove_stopword: Probability, decided once per sentence, that ALL
            stop words are removed from that sentence.

    Returns:
        A DataFrame holding only the cleaned sentences that passed every
        enabled filter.
    """

    # Decides whether to keep a sentence in the dataset based on the given parameters
    def keep_sentence(row):
        sentence = row['text']
        if word_count(sentence) < min_word_count:
            return False
        if rm_contains_num and contains_number(sentence):
            return False
        if rm_contains_unknown_wrd and contains_non_lexi_word(sentence):
            return False
        return True

    # Normalizes and cleans a single sentence
    def clean_text(text):
        # Convert to lowercase for better normalization
        text = text.lower()

        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)

        # Tokenize the words
        words = word_tokenize(text)

        # With the configured probability, strip every stop word from this sentence
        if remove_stopwords and random.random() < prob_remove_stopword:
            words = [word for word in words if word not in stop_words]

        # Join the cleaned words back into a sentence
        return ' '.join(words)

    pre_rem_size = corpus_df.shape[0]

    # assign() builds a new frame, so the caller's DataFrame is not modified
    cleaned_df = corpus_df.assign(text=corpus_df['text'].apply(clean_text))

    # Nothing to filter (and nothing to split across workers) in an empty frame
    if pre_rem_size > 0:
        pandarallel.initialize()
        keep_mask = cleaned_df.parallel_apply(keep_sentence, axis=1)
        # Positional mask: independent of index alignment, which matters
        # because exploded sentences share their post's index label.
        cleaned_df = cleaned_df[keep_mask.to_numpy(dtype=bool)]

    sen_removed = pre_rem_size - cleaned_df.shape[0]
    print("Removed {0} sentences because they did not conform to the sentence specifications".format(sen_removed))

    return cleaned_df

## Let the cleaning begin
This might take a while. Please be patient. 5000 blog posts take around 10 minutes on an M1 Pro MacBook Pro 14" laptop. 

In [None]:
# Run the cleaning with the parameters chosen above
# (rm_contains_num is not passed, so clean_corpus uses its default).
df = clean_corpus(
    df,
    rm_contains_unknown_wrd=remove_sentence_with_unknown_words,
    min_word_count=min_word_count,
    remove_stopwords=remove_stopwords,
    prob_remove_stopword=rem_stopword_percent,
)

## Lastly, save the file so it can be loaded for training

In [None]:
# Write the cleaned sentences (including the dataframe index) to disk
df.to_csv(path_or_buf="blogtext_cleaned.csv")