In [None]:
# imports
import os
import re
import glob
import nltk
import h5py
import collections
import numpy as np
import pandas as pd
import pickle as pkl
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib import pyplot as plt


In [None]:
import re
import os
import glob
import nltk
import numpy as np
import pandas as pd
import pickle as pkl
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Setup
# Fetch every NLTK resource this module reads, not just 'punkt':
#   - 'punkt'                      : kept from the original setup.
#   - 'stopwords'                  : read by stopwords.words('english') below.
#   - 'averaged_perceptron_tagger' : model used by nltk.tag.pos_tag in clean_text.
#   - 'averaged_perceptron_tagger_eng' : NOTE(review): newer NLTK releases appear
#     to look the tagger up under this id -- confirm against the installed version.
for _nltk_resource in (
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)
lemmatizer = WordNetLemmatizer()

# Adjust stopwords list
# Negation words are removed from the stopword set so that clean_text keeps them.
stopwords_list = set(stopwords.words('english'))
stopwords_keep = ['no', 'not', 'nor']
# Kept as a list (as before) so code elsewhere that expects a list still works.
stopwords_adjusted = list(stopwords_list.difference(stopwords_keep))

def clean_text(text):
    """Clean a single text string.

    Steps, in order:
      1. Replace contraction fragments ('ca', 'wo', 'sha', 'nt', "n't") with
         their full words.
      2. Drop punctuation and special characters, keeping only alphanumeric
         tokens.
      3. Drop tokens that the NLTK POS tagger labels 'CD' (cardinal number).
      4. Drop tokens listed in the module-level ``stopwords_adjusted``.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces; '' if none survive.
    """
    # Handle contracted words
    # Fragments are matched against whole whitespace-separated tokens only.
    # NOTE(review): the keys look like the pieces a Treebank-style tokenizer
    # emits ("ca n't"), so the input is presumably pre-tokenized -- confirm.
    contracted_words = {
        'ca': 'can',
        'wo': 'will',
        'sha': 'shall',
        'nt': "not",
        "n't": "not",
    }
    text = ' '.join(contracted_words.get(word, word) for word in text.split())

    # Remove punctuation & special characters
    # Raw string: '\W' / '\w' in a plain literal is an invalid escape sequence.
    # \w also matches '_', so isalnum() is still needed to drop such tokens.
    words = [word for word in re.findall(r'\w+', text) if word.isalnum()]

    # Remove numbers (tokens tagged CD) and stopwords
    tagged_words = nltk.tag.pos_tag(words)
    return ' '.join(
        word
        for word, tag in tagged_words
        if tag != 'CD' and word not in stopwords_adjusted
    )

def load_and_clean_data(path, has_label=True):
    """Load tab-separated files matching ``path``, clean them, return one dataframe.

    Each matched file is read without a header row, and a 'domain' column
    holding the file's base name is added. All files are concatenated, the
    'text' column is passed through ``clean_text``, and rows whose text is
    missing or empty after cleaning are dropped.

    Args:
        path: Glob pattern selecting the input files.
        has_label: If True the files are read with columns
            ["label", "text"]; otherwise with the single column ["text"].

    Returns:
        A dataframe with the file columns plus 'domain'. The index is not
        reset after rows are dropped.

    Raises:
        ValueError: If ``path`` matches no files.
    """
    columns = ["label", "text"] if has_label else ["text"]

    frames = []
    # glob returns matches in arbitrary order; sort so row order is reproducible.
    for file_name in sorted(glob.glob(path)):
        frame = pd.read_csv(file_name, delimiter='\t', names=columns, encoding='latin-1')
        frame['domain'] = os.path.basename(file_name)
        frames.append(frame)

    if not frames:
        # pd.concat([]) would raise the same exception type with an opaque message.
        raise ValueError(f"No input files matched {path!r}")

    df = pd.concat(frames, ignore_index=True)
    # Rows with no text at all would make clean_text fail on a NaN value.
    df.dropna(subset=['text'], inplace=True)
    df['text'] = df['text'].apply(clean_text)
    # Delete empty strings
    # Assign the result back: an in-place replace on df['text'] is a chained
    # operation that does not reliably modify df.
    df['text'] = df['text'].replace('', np.nan)
    df.dropna(subset=['text'], inplace=True)
    return df
import os

def ensure_directory_exists(filepath):
    """Ensure the directory of the given filepath exists.

    Args:
        filepath: Path of a file; only its directory component is used.

    Missing intermediate directories are created as well. A bare file name
    (no directory component) needs no directory, so nothing is created.
    """
    directory = os.path.dirname(filepath)
    # dirname is '' for a bare file name, and os.makedirs('') would raise.
    if directory:
        # exist_ok avoids the race between checking for and creating the directory.
        os.makedirs(directory, exist_ok=True)


def main():
    """Main function to execute the entire data processing pipeline.

    Loads and cleans the train, test and unlabelled files, strips the
    split suffix from the 'domain' column, gives the unlabelled rows a
    placeholder label, and pickles the train, test and train+unlabelled
    dataframes under data/cleaned_data/.
    """
    # Load and clean data
    df_train = load_and_clean_data('data/uncleaned_data/train/*')
    df_test = load_and_clean_data('data/uncleaned_data/test/*')
    df_unlabel = load_and_clean_data('data/uncleaned_data/unlabelled/*', has_label=False)

    # Process domain names
    # 'domain' holds each source file's base name; strip the trailing split suffix.
    # regex=True must be explicit: otherwise recent pandas treats the pattern as
    # literal text and the '$'-anchored suffix never matches. Dots are escaped so
    # they match only a literal '.'.
    split_suffix = r'\.task\.(?:train|test|unlabel)$'
    for frame in (df_train, df_test, df_unlabel):
        frame['domain'] = frame['domain'].str.replace(split_suffix, '', regex=True)

    # Add label to unlabelled data
    # NOTE(review): 3 is presumably a value outside the real label set -- confirm.
    df_unlabel["label"] = 3

    # Merge datasets
    df_merged = pd.concat([df_train, df_unlabel], ignore_index=True)

    # Save data
    save_paths = {
        "data/cleaned_data/train_cleaned.p": df_train,
        "data/cleaned_data/test_cleaned.p": df_test,
        "data/cleaned_data/merged_cleaned.p": df_merged,
    }

    for path, dataframe in save_paths.items():
        ensure_directory_exists(path)  # Make sure the directory exists
        with open(path, "wb") as file:
            pkl.dump(dataframe, file)

# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
