In [None]:
# Import parquet data into pandas dataframe
import pandas as pd
# Read the train split first, then the test split, each from its parquet file
df_train = pd.read_parquet(path='relevance_train.parquet')
df_test = pd.read_parquet(path='relevance_test.parquet')

In [None]:
# NOTE: the nltk.download(...) calls in this cell fetch NLTK resources.
# Presumably they only matter the first time the notebook runs in a given
# environment and can be commented out afterwards — confirm before disabling.

# Import library for tokenisation
import nltk 
# Download the 'punkt' resource (presumably for the tokenizers imported on the next line — confirm)
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
# Import regular expressions library
import re
# Download the stopword lists read through nltk.corpus.stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
# Import stemmer
from nltk.stem import PorterStemmer
# Import lemmatizer
from nltk.stem import WordNetLemmatizer
# Download the 'wordnet' and 'omw-1.4' resources (presumably needed by the lemmatizer — confirm)
nltk.download("wordnet")
nltk.download("omw-1.4")


In [None]:
# English stop words, held in a set for fast membership tests
stopset = set(stopwords.words('english'))

# Define a function to remove stop words from a sentence
def remove_stop_words(sentence):
    """Return `sentence` with every token found in `stopset` dropped.

    The sentence is split on whitespace, tokens are compared to the stop-word
    set exactly as written (no case folding is done here), and the surviving
    tokens are joined back together with single spaces.
    """
    return ' '.join(token for token in sentence.split() if token not in stopset)

In [None]:
# Shared Porter stemmer instance used by stem()
ps = PorterStemmer()
def stem(sentence):
    """Return `sentence` with each whitespace-separated word replaced by its stem.

    Words are stemmed in order with the module-level `ps` stemmer and joined
    back together with single spaces.
    """
    return ' '.join(ps.stem(token) for token in sentence.split())

In [None]:
# Shared WordNet lemmatizer instance used by lemma()
wl = WordNetLemmatizer()
def lemma(sentence):
    """Return `sentence` with each whitespace-separated word replaced by its lemma.

    Words are lemmatized in order with the module-level `wl` lemmatizer (called
    with its default arguments) and joined back together with single spaces.
    """
    return ' '.join(map(wl.lemmatize, sentence.split()))

In [None]:
#Define a function to perform language processing on a dataframe column
def nlp(iterable):
    """Normalise every element of `iterable` into a cleaned, stemmed string.

    Each element is converted with str() and lower-cased; digits, HTML-style
    tags, literal backslash-n tokens and punctuation are then stripped, stop
    words are removed and the remaining words are stemmed.

    Parameters
    ----------
    iterable : iterable
        Values to process (for example a DataFrame column). Every element is
        passed through str(), so non-string values are accepted.

    Returns
    -------
    list of str
        One processed string per input element, in input order.
    """
    processed = []
    for element in iterable:
        # NOTE(review): str() turns missing values into ordinary text such as
        # "none" or "nan", which then flows through as a token — confirm intended.
        text = str(element).lower()
        text = ''.join(ch for ch in text if not ch.isdigit())  # Remove digits from string
        # Raw string keeps the pattern free of invalid escape sequences.
        text = re.sub(r"<.*?>", " ", text)  # Replace HTML tags with a space
        # Replace the literal two-character backslash-n token with a space so
        # the words on either side of it are not fused into one token.
        text = text.replace('\\n', ' ')
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = remove_stop_words(text)  # Drop stop words (also collapses whitespace)
        # Lemmatisation via lemma() is left disabled: it did not add much on
        # top of the stemming.
        text = stem(text)
        # Add processed element into the list
        processed.append(text)
    return processed

In [None]:
#For columns with classifiable text, create classification (label encoding)
#Note the numeric value is arbitrary so no numeric analysis is to be performed
def id_creator(iterable):
    """Label-encode `iterable` by order of first appearance.

    Every item is converted with str(); the first distinct string gets id 1,
    the next new one id 2, and so on. Repeated strings reuse their id.

    Returns a list of integer ids, one per input item, in input order.
    """
    codes = {}
    # setdefault() only stores len(codes) + 1 when the key is new, so the
    # numbering advances exactly once per distinct string.
    return [codes.setdefault(str(item), len(codes) + 1) for item in iterable]

In [None]:
#Generate DataFrame with the processed text for train
# Identifier/label columns are carried over as-is, free-text columns are
# cleaned with nlp(), and description/narrative are label-encoded with
# id_creator().
processed_train = {
    'id': df_train['doc_id'],
    'author': df_train['author'],
    'title': nlp(df_train['title']),
    'topic_id': df_train['topic_id'],
    'body': nlp(df_train['body']),
    'description_id': id_creator(df_train['description']),
    'narrative_id': id_creator(df_train['narrative']),
    'judgement': df_train['judgement'],
}
df_train_processed = pd.DataFrame(data=processed_train)

In [None]:
#Generate DataFrame with the processed text for test
# Label ids are assigned over train followed by test: id_creator() numbers
# values by first appearance, so every value that occurs in train keeps the id
# it has in the train frame and test-only values continue the numbering.
# The ids for the test rows are the tail of each combined list.
# pd.concat is used because Series.append was removed in pandas 2.0.
n_train = len(df_train)

descriptions = id_creator(pd.concat([df_train['description'], df_test['description']]))
description_test = descriptions[n_train:]

narrative = id_creator(pd.concat([df_train['narrative'], df_test['narrative']]))
# Must slice `narrative` here, not `descriptions`; otherwise narrative_id
# would just be a copy of description_id.
narrative_test = narrative[n_train:]

processed_test = {'id': df_test['doc_id'],
                  'author': df_test['author'],
                  'title': nlp(df_test['title']),
                  'topic_id': df_test['topic_id'],
                  'body': nlp(df_test['body']),
                  'description_id': description_test,
                  'narrative_id': narrative_test}
df_test_processed = pd.DataFrame(processed_test)

In [None]:
# Write both processed frames to Excel workbooks (train first, then test)
df_train_processed.to_excel(excel_writer='relevance_train_processed.xlsx')
df_test_processed.to_excel(excel_writer='relevance_test_processed.xlsx')