<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Topic Modelling Task

## Importing the required libraries

In [1]:
import subprocess
from IPython.display import clear_output
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess

## Collecting input
- Provide the full filename of the tokens file
- Provide the full filename of the stoplist file

In [2]:
def prompt_existing_file(prompt):
    """Ask for a filename until the user names a file that can be opened.

    Parameters
    ----------
    prompt : str
        Text shown to the user by ``input``.

    Returns
    -------
    str
        The first non-empty filename that could be opened for reading.

    Empty answers are ignored and the prompt is repeated. A path that does
    not exist, or that cannot be opened for another OS-level reason (for
    example a directory or a permission problem), is reported and the prompt
    is repeated instead of aborting the cell.
    """
    while True:
        filename = str(input(prompt))
        if filename == '':
            continue
        try:
            # Opening the file is only a readability probe; nothing is read here.
            with open(filename, 'r', encoding = 'utf8'):
                print('The file exists.')
        except FileNotFoundError:
            print('No such file.')
        except OSError as error:
            print(f'Cannot open the file: {error}')
        else:
            return filename


input_file = prompt_existing_file('Enter the tokens full filename: ')
stoplist_file = prompt_existing_file('Enter the stoplist full filename: ')

# The prepared file is named after the tokens file; later cells read this name.
output_file = input_file + '.prepared.txt'

clear_output()

## Preparing the data

In [3]:
# Run `sed -f <stoplist_file> <input_file>` through WSL and keep its stdout.
# check = True makes a failing sed raise CalledProcessError here instead of
# silently handing empty data to the next step.
substitutions = subprocess.run(
    ['wsl', 'sed',  '-f',  stoplist_file, input_file],
    stdout = subprocess.PIPE,
    check = True,
)

# Feed the sed output to `tr -s` through WSL.
# NOTE(review): '>' and the output path are passed as plain arguments (no
# shell = True on the Python side); the redirection presumably takes effect in
# the shell that WSL starts - confirm that the prepared file is written.
subprocess.run(
    ['wsl', 'tr', '-s', "' '", '>', output_file],
    input = substitutions.stdout,
    capture_output = True,
    check = True,
)

CompletedProcess(args=['wsl', 'tr', '-s', "' '", '>', 'group_4_tokens.txt.prepared.txt'], returncode=0, stdout=b'', stderr=b'')

## Running the Topic Modelling

In [4]:
# Load (id, text) pairs from the prepared, pipe-delimited file
text_data = []

input_file_path = output_file  # the prepared file produced by the previous step

with open(input_file_path, 'r', encoding = 'utf-8') as input_file1:
    for line in input_file1:
        fields = line.strip().split('|')

        # Lines with fewer than four fields are skipped
        if len(fields) >= 4:
            content_to_analyze = fields[3]  # the text to model is the fourth field (index 3)
            text_data.append((fields[0], content_to_analyze))  # Store ID and content

if not text_data:
    raise ValueError(
        f"No line in {input_file_path} has at least four '|'-separated fields; nothing to model."
    )


def preprocess(text):
    """Tokenise ``text`` with gensim's ``simple_preprocess`` (``deacc = True``)."""
    return simple_preprocess(text, deacc = True)


processed_text_data = [preprocess(text) for _, text in text_data]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(processed_text_data)

# Create a corpus: a list of bags of words, in the same order as text_data
corpus = [dictionary.doc2bow(text) for text in processed_text_data]

# Train the LDA model
num_topics = 2  # You can adjust the number of topics
lda_random_state = 42  # fixed seed so that a re-run reproduces the same topics
lda_model = LdaModel(
    corpus,
    num_topics = num_topics,
    id2word = dictionary,
    passes = 15,
    random_state = lda_random_state,
)

# Score each text and gather results
results = []

for (id_value, content), doc_bow in zip(text_data, corpus):
    # Ask for every topic explicitly: the default lookup drops topics below the
    # model's minimum probability, which can leave nothing to take the max of.
    doc_topics = lda_model.get_document_topics(doc_bow, minimum_probability = 0.0)

    # Keep the topic with the highest score (first one wins on ties)
    top_topic, top_score = max(doc_topics, key = lambda topic_score: topic_score[1])

    results.append({
        'id': id_value,
        'topic': top_topic,
        'score': top_score,
        'c': content
    })

# Save the per-document scores.
# NOTE: the topic id written here is the 0-based id returned by the model,
# whereas the topic words file below labels topics from 1 (topic_num + 1).
output_file_path = output_file + '.topic_scores_output.txt'

with open(output_file_path, 'w', encoding = 'utf-8') as output_file1:
    for result in results:
        output_file1.write(f"{result['id']}|topic:{result['topic']}|score:{result['score']}|c:{result['c']}\n")

# Get the top words of every topic. num_topics is passed explicitly because
# print_topics otherwise returns at most its own default number of topics.
topics = lda_model.print_topics(num_topics = num_topics, num_words = 50)

# Save topics to a file
output_file_path = output_file + '.topic_words_output.txt'

with open(output_file_path, 'w', encoding = 'utf-8') as output_file2:
    for topic_num, words in topics:
        output_file2.write(f"Topic {topic_num + 1} Words: {words}\n")


## Scoring the Corpus

In [5]:
# Build an id -> (score field, topic field) lookup from the per-document
# scores file; records with fewer than three '|'-separated fields are ignored.
topic_scores = {}
with open(output_file + '.topic_scores_output.txt', 'r', encoding = 'utf-8') as scores_in:
    for record in scores_in:
        columns = record.strip().split('|')
        if len(columns) < 3:
            continue
        doc_id, topic_field, score_field = columns[0], columns[1], columns[2]
        topic_scores[doc_id] = (score_field, topic_field)

# Walk through the tokens file and, for every id found in the lookup, insert
# the topic and score fields at positions 2 and 3 of the record.
output_lines = []
with open(input_file, 'r', encoding = 'utf-8') as corpus_in:
    for record in corpus_in:
        columns = record.strip().split('|')
        scored = topic_scores.get(columns[0])
        if scored is not None:
            score_field, topic_field = scored
            columns[2:2] = [topic_field, score_field]
        output_lines.append('|'.join(columns))

# Write the scored corpus, one record per line
output_filename = output_file + '.corpus_scored.txt'
with open(output_filename, 'w', encoding='utf-8') as scored_out:
    scored_out.writelines(scored_line + '\n' for scored_line in output_lines)

print('Scoring completed. Scored corpus saved in ' + output_filename + '.')

Scoring completed. Scored corpus saved in group_4_tokens.txt.prepared.txt.corpus_scored.txt.


## Determining the topic words

In [6]:
# Input: the topic words file (lines of the form `Topic N Words: w*"word" + ...`).
# Output: the same topics rewritten as `Topic N Words: word (w), word (w), ...`.
input_file_path = output_file + '.topic_words_output.txt'
output_file_path = output_file + '.topic_words_formatted.txt'

# Read the topic lines; the file was written as UTF-8, so read it as UTF-8
with open(input_file_path, 'r', encoding = 'utf-8') as topics_in:
    lines = topics_in.readlines()

formatted_lines = []

for line in lines:
    # Skip blank lines instead of failing on them
    if not line.strip():
        continue

    # Split only on the first colon: label on the left, weighted words on the right
    topic_number, topic_words = (part.strip() for part in line.split(':', 1))

    formatted_pairs = []

    for pair in topic_words.split(' + '):
        weight, word = pair.split('*"', 1)

        # Drop leading zeros only ("0.010" -> ".010"); stripping both ends
        # would also eat trailing zeros and turn "0.000" into ".".
        weight = weight.strip().lstrip('0')
        word = word.strip().strip('"')

        # Replace underscores with backslash-underscore
        word = word.replace('_', '\\_')

        formatted_pairs.append(f"{word} ({weight})")

    formatted_lines.append(f"{topic_number}: {', '.join(formatted_pairs)}\n")

result = ''.join(formatted_lines)

# The handle must not be called output_file: that name holds the prepared-file
# path used by the other cells, and rebinding it breaks any re-run.
with open(output_file_path, 'w', encoding = 'utf-8') as formatted_out:
    formatted_out.write(result)

print(f"Output written to {output_file_path}")

Output written to group_4_tokens.txt.prepared.txt.topic_words_formatted.txt
