<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 1 - Phase 5 - INRS

## Required Python packages

- pandas

## Importing the required libraries

In [1]:
import re
import pandas as pd
import os
import sys
import json

## Defining input variables

In [2]:
# Folder holding the example files read by this notebook
input_directory = "cl_st1_ph4_examples"
# Folder that receives the files written by this notebook
output_directory = "cl_st1_ph5_inrs"

## Creating output directory

In [3]:
# Make sure the output directory exists before anything is written to it.
# `os.path.isdir` is used instead of `os.path.exists` so that a regular file
# occupying the same path is not mistaken for an existing directory.
if os.path.isdir(output_directory):
    print('Output directory already exists.')
else:
    try:
        # `exist_ok=True` tolerates the directory being created by another
        # process between the check above and this call; a non-directory at
        # that path still raises an OSError and is reported below.
        os.makedirs(output_directory, exist_ok=True)
        print('Output directory successfully created.')
    except OSError as e:
        print('Failed to create the directory:', e)
        # Stop here: the export step cannot work without the directory.
        sys.exit(1)

Output directory successfully created.


## Creating a dictionary that relates each `Dimension-Pole` pair to the `Text ID`s of its respective examples

In [4]:
# Dictionary keyed by "<Dimension> - <Pole>" strings; each value holds the
# dimension, the pole, placeholder label/description texts and the list of
# Text IDs found in the matching example files.
dimension_examples = {}

# RegEx pattern to extract the Text ID: the literal 'file = ' followed by
# 't' and exactly six digits (only the 'tNNNNNN' part is captured).
text_id_pattern = re.compile(r'file = (t\d{6})')

# Mappings for 'Dimension' and 'Pole' (file name token -> readable name)
dimension_mapping = {
    'f1': 'Dimension 1',
    'f2': 'Dimension 2',
    'f3': 'Dimension 3',
    'f4': 'Dimension 4',
    'f5': 'Dimension 5',
    'f6': 'Dimension 6'
}

pole_mapping = {
    'neg': 'Negative Pole',
    'pos': 'Positive Pole'
}

# Getting the list of '.txt' files, sorted so the processing order is deterministic
files = sorted(f for f in os.listdir(input_directory) if f.endswith('.txt'))

# Iterating over each sorted file in the directory
for filename in files:
    # Extracting 'Dimension' and 'Pole' from the second and third
    # '_'-separated tokens of the file name. The extension is removed first,
    # otherwise a name whose last token is the pole (e.g. '..._pos.txt')
    # would produce 'pos.txt' and miss the mapping.
    # NOTE(review): assumes names look like '<prefix>_<fN>_<pole>[...]' — confirm.
    parts = os.path.splitext(filename)[0].split('_')
    if len(parts) < 3:
        # Report and skip unexpected names instead of failing with an IndexError
        print(f"Skipping '{filename}': expected at least three '_'-separated parts in the file name.")
        continue

    # Unknown tokens fall back to the raw token itself
    dimension = dimension_mapping.get(parts[1], parts[1])
    pole = pole_mapping.get(parts[2], parts[2])

    # Opening and reading the file
    with open(os.path.join(input_directory, filename), 'r', encoding='utf-8') as file:
        content = file.read()

    # Find all Text IDs in the file (duplicates are kept, in order of appearance)
    text_ids = text_id_pattern.findall(content)

    # Construct the dictionary key
    key = f"{dimension} - {pole}"

    # Create the entry the first time this Dimension/Pole pair is seen
    if key not in dimension_examples:
        dimension_examples[key] = {
            'Dimension': dimension,
            'Pole': pole,
            'Label': f"{key} - Label: <Placeholder for Label>",  # Placeholder for the Label
            'Description': f"{key} - Description: <Placeholder for Description>",  # Placeholder for the Description
            'Text IDs': []
        }

    # Append the Text IDs to the list in the dictionary
    dimension_examples[key]['Text IDs'].extend(text_ids)

In [5]:
# Sanity check: display the type of the object built in the previous cell
type(dimension_examples)

dict

### Exporting to a file

In [6]:
# Save the dictionary as a JSON template, indented with four spaces
template_path = f"{output_directory}/dimension_examples_template.json"
with open(template_path, 'w') as template_file:
    json.dump(dimension_examples, template_file, indent=4)