<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 2 - Phase 2_1

This phase aims to process the following file to prepare studies for CAD 2026 and IVACS 2026:
- `examples/score_details.txt`

## Define input variables

In [1]:
# Folder that holds the raw `score_details.txt` report read by this notebook.
input_directory = "cl_st2_ph2_arianne/examples"
# Folder that receives the exported files; currently the same folder as the input.
output_directory = "cl_st2_ph2_arianne/examples"

## Capture data from the file

In [2]:
import pandas as pd
import re

def parse_score_details(file_path, num_factors=4):
    """Parse a score-details text report into a DataFrame, one row per text.

    The file is read as UTF-8 and split into records on a line of ``=``
    characters. Records that are empty or do not contain ``'text ID'`` are
    skipped. Within a record the following line shapes are recognised
    (``k`` is a factor number from 1 to ``num_factors``)::

        text ID: <id>
        filename: <name>
        fk score: <int>
        fk pos words (N=<count>): <comma-separated words>
        fk neg words (N=<count>): <comma-separated words>

    Any other line, including lines for factors above ``num_factors``, is
    ignored.

    Parameters
    ----------
    file_path : str or path-like
        Location of the report to read.
    num_factors : int, default 4
        Number of factors (``f1`` .. ``f<num_factors>``) to extract.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``text ID`` and ``filename``
        (when present in the record) plus, for every factor ``fk``:
        ``fk score``, ``fk pos``, ``fk pos words``, ``fk neg`` and
        ``fk neg words``. Missing factor values default to ``0`` for the
        numeric fields and ``""`` for the word lists.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a non-empty score value is not a valid integer literal.
    """
    # Compiled once instead of rebuilding prefixes / looking up the pattern
    # for every line of every record.
    factor_line = re.compile(r'f([1-9][0-9]*) (score:|pos words|neg words)')
    words_payload = re.compile(r'\(N=(\d+)\):\s*(.*)')

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    data = []

    # Split the file into individual text blocks based on the separator
    for record in content.split('============================================='):
        record = record.strip()
        if not record or 'text ID' not in record:
            continue

        row = {}

        for line in record.split('\n'):
            line = line.strip()
            if not line:
                continue

            # Basic ID and filename
            if line.startswith('text ID:'):
                row['text ID'] = line.split(':', 1)[1].strip()
                continue
            if line.startswith('filename:'):
                row['filename'] = line.split(':', 1)[1].strip()
                continue

            # Factor lines: a single match identifies both factor and field
            header = factor_line.match(line)
            if header is None or int(header.group(1)) > num_factors:
                continue

            prefix = f'f{header.group(1)}'
            field = header.group(2)

            if field == 'score:':
                # An empty score value is treated as 0
                score_str = line.split(':', 1)[1].strip()
                row[f'{prefix} score'] = int(score_str) if score_str else 0
            else:
                # 'pos words' / 'neg words' share one layout: "(N=<count>): <words>"
                polarity = field.split()[0]
                payload = words_payload.search(line)
                if payload:
                    row[f'{prefix} {polarity}'] = int(payload.group(1))
                    row[f'{prefix} {polarity} words'] = payload.group(2).strip()

        # Fill in defaults for any missing data points in this record
        for i in range(1, num_factors + 1):
            p = f'f{i}'
            row.setdefault(f'{p} score', 0)
            row.setdefault(f'{p} pos', 0)
            row.setdefault(f'{p} pos words', "")
            row.setdefault(f'{p} neg', 0)
            row.setdefault(f'{p} neg words', "")

        data.append(row)

    return pd.DataFrame(data)

# Path of the raw report, built from the `input_directory` configured above
file_path = f"{input_directory}/score_details.txt"

# Parse the report into a DataFrame (one row per record in the file)
score_details_df = parse_score_details(file_path)

# Sanity check: print the number of parsed records, then leave the frame as
# the last expression so the notebook renders it as the cell output
print(f"Imported {len(score_details_df)} records.")
score_details_df

Imported 4000 records.


Unnamed: 0,text ID,filename,f1 score,f1 pos,f1 pos words,f1 neg,f1 neg words,f2 score,f2 pos,f2 pos words,...,f3 score,f3 pos,f3 pos words,f3 neg,f3 neg words,f4 score,f4 pos,f4 pos words,f4 neg,f4 neg words
0,t000001,t001_gemini.txt,7,7,"activist, earth, work, movement, other, humani...",0,,6,6,"right, voice, support, nature, environmental, ...",...,0,0,,0,,2,2,"whale, drilling",0,
1,t000002,t002_gemini.txt,3,3,"work, woman, witness",0,,7,7,"community, climate, reality, face, challenge, ...",...,5,5,"trend, system, waste, design, game",0,,1,1,water,0,
2,t000003,t003_gemini.txt,10,10,"dozen, day, passion, us, activist, work, exper...",0,,8,8,"community, climate, people, face, weather, lea...",...,0,0,,0,,0,0,,0,
3,t000004,t004_gemini.txt,4,4,"skill, train, work, job",0,,7,7,"community, backdrop, crisis, finance, challeng...",...,6,6,"business, solution, cost, program, design, energy",0,,0,0,,0,
4,t000005,t005_gemini.txt,5,5,"age, committee, history, year, eye",0,,7,7,"climate, crisis, duty, loss, challenge, govern...",...,5,5,"production, consumption, use, diet, pesticide",0,,3,3,"size, decline, feed",0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,t003996,t995_human.txt,82,82,"son, writer, father, hunter, bomb, man, music,...",0,,21,21,"justice, community, youth, leadership, right, ...",...,8,8,"product, quality, use, money, report, public, ...",0,,1,3,"whale, life, threat",2,"tax, seed"
3996,t003997,t996_human.txt,46,46,"father, peace, weapon, ecology, morning, boat,...",0,,12,12,"justice, right, people, care, ground, power, s...",...,3,3,"product, public, end",0,,1,2,"island, life",1,tax
3997,t003998,t997_human.txt,39,39,"son, daughter, bomb, spring, weapon, testing, ...",0,,8,8,"debate, power, stand, leader, delay, environme...",...,4,4,"announcement, money, shift, energy",0,,3,3,"sea, catch, port",0,
3998,t003999,t998_human.txt,99,99,"warrior, son, daughter, bomb, man, peace, back...",0,,22,25,"solidarity, right, violence, future, people, r...",...,21,22,"problem, amount, supply, waste, fact, use, dol...",1,rainbow,14,15,"sea, fleet, habitat, shore, seal, whale, area,...",1,association


## Export to a file

In [3]:
# Export the parsed table as JSON Lines: with orient='records' and lines=True
# each DataFrame row is written as one JSON object on its own line.
score_details_df.to_json(f"{output_directory}/score_details.jsonl", orient='records', lines=True)

In [4]:
# Export the same table to an Excel workbook; index=False keeps the
# DataFrame's row index out of the sheet.
score_details_df.to_excel(f"{output_directory}/score_details.xlsx", index=False)