<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 1 - Phase 1 - Marcia

The aims of this phase are to:
- Prepare the corpus for tagging with the Biber Tagger;
- Extract Biber's (1988) Dimension Scores for each text from the Biber Tag Count file.

## Prepare the corpus for tagging with Biber Tagger

In [1]:
import os
import re

# Define input and output directories
input_directory = 'corpus/01_all_seasons_utf_8'
output_directory = 'corpus/02_all_seasons_utf_8_fixed'

# Maximum number of words written on each output line
WORDS_PER_LINE = 10


def _fix_lines(lines, words_per_line=WORDS_PER_LINE):
    """Return the fixed text for one file, built from its original lines.

    For every input line:
    - a space is inserted between a digit and an ASCII letter that directly
      follows it (e.g. "1a" -> "1 a");
    - a line that is empty or whitespace-only becomes an empty output line,
      so paragraph breaks are preserved;
    - any other line is re-wrapped into output lines of at most
      ``words_per_line`` whitespace-separated words.

    Args:
        lines: iterable of strings, one per original line.
        words_per_line: maximum number of words per output line.

    Returns:
        The output lines joined with "\n" (no trailing newline is added).
    """
    processed_output = []

    for line in lines:
        # 1. Insert space between digit and letter (e.g., "1a" -> "1 a")
        fixed_line = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', line)

        # split() with no argument drops leading/trailing whitespace and
        # returns [] for an empty or whitespace-only line
        words = fixed_line.split()

        if not words:
            # Preserve blank lines (e.g. a paragraph break)
            processed_output.append("")
            continue

        # 2. Wrap every `words_per_line` words WITHIN the current line
        for start in range(0, len(words), words_per_line):
            processed_output.append(" ".join(words[start:start + words_per_line]))

    return "\n".join(processed_output)


# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort them so the files are
# processed (and logged) in the same order on every run
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith(".txt"):
        continue

    input_path = os.path.join(input_directory, filename)
    output_path = os.path.join(output_directory, filename)

    # Best-effort batch: a failure on one file is reported and the loop
    # carries on with the remaining files
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            # Read all lines to preserve original structure
            original_lines = f.readlines()

        final_output = _fix_lines(original_lines)

        # Save to the output file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_output)

        print(f"Processed: {filename}")

    except Exception as e:
        print(f"Error processing {filename}: {e}")

print("All files processed.")

Processed: Amy_Season_10.txt
Processed: Amy_Season_11.txt
Processed: Amy_Season_12.txt
Processed: Amy_Season_3.txt
Processed: Amy_Season_4.txt
Processed: Amy_Season_5.txt
Processed: Amy_Season_6.txt
Processed: Amy_Season_7.txt
Processed: Amy_Season_8.txt
Processed: Amy_Season_9.txt
Processed: Bernadette_Season_10.txt
Processed: Bernadette_Season_3.txt
Processed: Bernadette_Season_4.txt
Processed: Bernadette_Season_5.txt
Processed: Bernadette_Season_6.txt
Processed: Bernadette_Season_7.txt
Processed: Bernadette_Season_8.txt
Processed: Bernadette_Season_9.txt
Processed: Bernardette_Season_11.txt
Processed: Bernardette_Season_12.txt
Processed: Howard_Season_1.txt
Processed: Howard_Season_10.txt
Processed: Howard_Season_11.txt
Processed: Howard_Season_12.txt
Processed: Howard_Season_2.txt
Processed: Howard_Season_3.txt
Processed: Howard_Season_4.txt
Processed: Howard_Season_5.txt
Processed: Howard_Season_6.txt
Processed: Howard_Season_7.txt
Processed: Howard_Season_8.txt
Processed: Howard_

## Extract Biber's (1988) Dimension Scores of each text from the Biber Tag Count file

In [2]:
import pandas as pd

def extract_dimensions_from_counts(file_path):
    """
    Build a table of dimension scores from a counts file.

    Blank lines are ignored. The remaining lines are read as consecutive
    records of 12 lines each: the first whitespace-separated token of a
    record's first line is taken as the filename, and the first five tokens
    of its 12th line are converted to floats. A trailing group of fewer than
    12 lines is ignored.

    Returns a DataFrame with the columns 'Filename' and 'Dimension 1' to
    'Dimension 5', one row per complete record.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Trim every line and keep only those that still have content
        trimmed = (raw.strip() for raw in handle)
        content = [text for text in trimmed if text]

    lines_per_record = 12
    # Only whole records are used; leftover lines at the end are dropped
    complete_records = len(content) // lines_per_record

    records = []
    for record_number in range(complete_records):
        first = record_number * lines_per_record
        last = first + lines_per_record - 1

        # Header line format: filename value value value
        text_name = content[first].split()[0]

        # The record's final line starts with the five dimension scores
        scores = [float(token) for token in content[last].split()[:5]]

        records.append([text_name, *scores])

    return pd.DataFrame(
        records,
        columns=['Filename', 'Dimension 1', 'Dimension 2', 'Dimension 3', 'Dimension 4', 'Dimension 5'],
    )

# Usage example:
# df = extract_dimensions_from_counts('counts.txt')
# print(df.head())

In [3]:
# Relative path to the Biber Tag Count file that holds the per-text records
counts_all_seasons_utf_8 = 'corpus/03_all_seasons_utf_8_tagged/tagcount/counts.txt'

# Parse the counts file into a DataFrame: one row per text, holding the
# filename and its first five dimension scores
df_dimensions_all_seasons_utf_8 = extract_dimensions_from_counts(counts_all_seasons_utf_8)

In [4]:
# Display the extracted dimension scores as this cell's output
df_dimensions_all_seasons_utf_8

Unnamed: 0,Filename,Dimension 1,Dimension 2,Dimension 3,Dimension 4,Dimension 5
0,amy_season_10.txt,34.71,-2.08,-3.24,-1.21,6.22
1,amy_season_11.txt,47.85,-1.30,-3.99,2.55,7.92
2,amy_season_12.txt,51.11,-1.60,-3.23,1.87,5.86
3,amy_season_3.txt,9.50,-1.31,-7.90,9.31,-3.63
4,amy_season_4.txt,13.98,-2.65,-0.57,0.17,2.83
...,...,...,...,...,...,...
86,stuart_season_5.txt,18.89,-3.24,-0.49,-2.76,3.61
87,stuart_season_6.txt,35.28,-3.74,-6.12,-1.18,0.89
88,stuart_season_7.txt,29.91,-2.23,-4.12,-2.91,4.72
89,stuart_season_8.txt,33.94,-0.30,-5.07,-2.35,5.61


In [5]:
# Sanity check: (number of rows, number of columns) of the extracted table
df_dimensions_all_seasons_utf_8.shape

(91, 6)

## Export the dimension scores to JSONL and Excel files

In [7]:
import os
import pandas as pd

# Destination folder and base name shared by both export files
output_directory = 'cl_st1_ph1_marcia'
filename = 'dimensions_all_seasons_utf_8'

# Make sure the destination folder exists before writing
os.makedirs(output_directory, exist_ok=True)

# JSON Lines export: one JSON record per row
df_dimensions_all_seasons_utf_8.to_json(
    f"{output_directory}/{filename}.jsonl",
    orient='records',
    lines=True,
)

# Excel export, written without the DataFrame index
df_dimensions_all_seasons_utf_8.to_excel(
    f"{output_directory}/{filename}.xlsx",
    index=False,
)