In [1]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.models import FastText

# Fetch the NLTK data this notebook relies on; packages that are already
# present are simply reported as up-to-date.
#   - 'stopwords' backs the stopword set built below.
#   - 'punkt' / 'punkt_tab' back nltk.word_tokenize. Recent NLTK releases load
#     the 'punkt_tab' tables instead of the older 'punkt' pickles, so both are
#     requested to keep the notebook runnable across NLTK versions.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# English stopwords as a set so the membership test during preprocessing is cheap.
nltk_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\karti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Preprocessing helpers (this variant does not include the noun and NER functions)
def remove_special_chars_and_punctuation(text: str) -> str:
    """Delete every character that is not an ASCII letter or whitespace.

    Punctuation, symbols, digits and non-ASCII characters are all removed.
    Nothing is inserted in their place, so characters on either side of a
    removed one become adjacent.
    """
    not_letter_or_space = re.compile(r'[^a-zA-Z\s]')
    return not_letter_or_space.sub('', text)

def remove_single_alphabet_tokens(text: str) -> str:
    """Delete ASCII letters that stand alone between two word boundaries.

    Only the letter itself is removed; surrounding whitespace is kept.
    A letter that is glued to digits or underscores (e.g. the "x" in "x1")
    is not at a word boundary on both sides and is left untouched.
    """
    lone_letter = r'\b[a-zA-Z]\b'
    cleaned = re.sub(lone_letter, '', text)
    return cleaned

def remove_numerical_tokens(text: str) -> str:
    """Delete runs of digits that form a whole token.

    A digit run is removed only when it is delimited by word boundaries on
    both sides, so digits embedded in a longer word (e.g. "abc123") stay.
    Surrounding whitespace and punctuation are preserved.
    """
    digits_only_token = re.compile(r'\b\d+\b')
    return digits_only_token.sub('', text)

def remove_alphanumeric_tokens(text: str) -> str:
    """Delete tokens that mix ASCII letters and digits (e.g. "abc123", "4chan").

    A token is a run of ASCII letters/digits delimited by word boundaries.
    It is removed only when that same run contains at least one digit AND at
    least one letter. Purely alphabetic and purely numeric tokens are kept.

    The two lookaheads are deliberately restricted to ``[a-zA-Z0-9]*`` so they
    inspect only the current token. Using ``.*`` there would let a digit or
    letter anywhere later on the line satisfy the check, which wrongly deletes
    ordinary words that merely precede a number.
    """
    mixed_token = r'\b(?=[a-zA-Z0-9]*\d)(?=[a-zA-Z0-9]*[a-zA-Z])[a-zA-Z0-9]+\b'
    return re.sub(mixed_token, '', text)

def remove_pgp_key_patterns(text: str) -> str:
    """Delete the whole words pgp, begin, end, key, public and block.

    Matching is case-insensitive and limited to whole words (word boundaries
    on both sides), so "keyboard" is kept while "KEY" is removed. The words
    are removed wherever they occur, not only inside a PGP key block.
    """
    pgp_words = re.compile(r'\b(pgp|begin|end|key|public|block)\b', flags=re.IGNORECASE)
    return pgp_words.sub('', text)

def remove_tokens_with_non_standard_characters(text: str) -> str:
    """Strip every non-ASCII character (anything outside \\x00-\\x7F).

    Despite the name, only the offending characters are deleted; the ASCII
    remainder of a token is kept (e.g. "caf\u00e9" becomes "caf").
    """
    non_ascii_run = r'[^\x00-\x7F]+'
    ascii_only = re.sub(non_ascii_run, '', text)
    return ascii_only

def custom_preprocessor_no_nouns_ner(text):
    """Clean one raw post and return it as a space-joined string of tokens.

    The regex cleaners are applied in the fixed order listed below; the result
    is lowercased, tokenized with ``nltk.word_tokenize`` and filtered against
    ``nltk_stopwords``.

    NOTE(review): the first cleaner already deletes everything except ASCII
    letters and whitespace, so the numeric, alphanumeric and non-ASCII
    cleaners that follow have nothing left to match (e.g. "abc123" reaches
    them as "abc" and is kept). If such tokens are meant to be dropped
    entirely, those cleaners would have to run first -- confirm the intent
    before reordering, since that changes the training corpus.
    """
    cleaning_steps = (
        remove_special_chars_and_punctuation,
        remove_single_alphabet_tokens,
        remove_numerical_tokens,
        remove_alphanumeric_tokens,
        remove_pgp_key_patterns,
        remove_tokens_with_non_standard_characters,
    )
    for clean in cleaning_steps:
        text = clean(text)

    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in nltk_stopwords)

In [3]:
# Load the dataset
input_file_path = 'cleaned_english_posts.csv'
data = pd.read_csv(input_file_path)

# Preprocessing for FastText training.
# Missing posts are read as NaN, and str(NaN) is the literal word "nan", which
# would survive cleaning and enter the corpus as a fake one-word document.
# Replacing NaN with '' first lets the empty-row filter below discard them.
data['processed_content_ft'] = (
    data['post_content']
    .fillna('')
    .astype(str)
    .apply(custom_preprocessor_no_nouns_ner)
)

# Removing rows where 'processed_content_ft' is empty after preprocessing.
# .copy() makes data_ft an independent frame rather than a slice of `data`.
data_ft = data[data['processed_content_ft'].str.strip() != ''].copy()

In [4]:
# Build the training corpus: one list of whitespace-separated tokens per
# cleaned post (this is what gets passed to FastText as `sentences`).
tokenized_corpus_ft = [
    cleaned_post.split()
    for cleaned_post in data_ft['processed_content_ft']
]

In [5]:
# Train FastText on the tokenized corpus (text cleaned by the preprocessing
# variant that omits the noun and NER functions).
ft_hyperparams = dict(
    vector_size=100,  # dimensionality of the learned vectors
    window=5,         # context window size
    min_count=1,      # keep every token, even ones seen only once
    sg=1,             # 1 = skip-gram
    epochs=10,        # passes over the corpus
    workers=4,        # training threads
)
fasttext_model = FastText(sentences=tokenized_corpus_ft, **ft_hyperparams)

In [6]:
# Destination file for the exported vectors
fasttext_model_path = 'final_fasttext_model.bin'

# Export the model's word vectors in binary word2vec format.
# NOTE(review): this call is made on `fasttext_model.wv` and writes the
# word2vec format, which presumably holds only whole-word vectors (no FastText
# subword n-grams), so the file would not be a native FastText model -- confirm
# that downstream code loads it as word2vec KeyedVectors.
word_vectors = fasttext_model.wv
word_vectors.save_word2vec_format(fasttext_model_path, binary=True)

print(f"FastText model saved to {fasttext_model_path}")

FastText model saved to final_fasttext_model.bin
