<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 1 - Phase 1 - Marcia

The aim of this phase is to enrich the `QJPP` corpus with its respective dimension scores obtained via `TagCount`.

## Required Python packages

- pandas
- matplotlib
- openpyxl (used by `DataFrame.to_excel` to write the `.xlsx` export)

## Import the required libraries

In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt

## Define input variables

In [2]:
# Root folders: data is read from and written to the same study directory
input_directory = 'cl_st3_ph3_eyamrog'
output_directory = 'cl_st3_ph3_eyamrog'

# QJPP: one TagCount output file per corpus part (parts 1 to 10), all following
# the layout <input_directory>/qjpp/qjpp_<part>/tagcount/counts_qjpp_<part>.txt
(
    counts_qjpp_1,
    counts_qjpp_2,
    counts_qjpp_3,
    counts_qjpp_4,
    counts_qjpp_5,
    counts_qjpp_6,
    counts_qjpp_7,
    counts_qjpp_8,
    counts_qjpp_9,
    counts_qjpp_10,
) = (
    f"{input_directory}/qjpp/qjpp_{part}/tagcount/counts_qjpp_{part}.txt"
    for part in range(1, 11)
)

## Prepare data for tagging

In [1]:
import os
import re

# Define input and output directories for this preparation step.
# Dedicated names are used so that `input_directory` / `output_directory`
# (set in "Define input variables" and needed further down to load and export
# the QJPP data) are not overwritten when this cell runs.
tagging_input_directory = 'corpus/01_all_seasons_utf_8'
tagging_output_directory = 'corpus/02_all_seasons_utf_8_fixed'

# Maximum number of words written on each output line
words_per_line = 10

# Create the output directory if it doesn't exist
os.makedirs(tagging_output_directory, exist_ok=True)

# Iterate through all files in the input directory
for filename in os.listdir(tagging_input_directory):
    if not filename.endswith(".txt"):
        continue

    input_path = os.path.join(tagging_input_directory, filename)
    output_path = os.path.join(tagging_output_directory, filename)

    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            # Read all lines to preserve original structure
            original_lines = f.readlines()

        processed_output = []

        for line in original_lines:
            # 1. Insert space between digit and letter (e.g., "1a" -> "1 a")
            fixed_line = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', line)

            # 2. Split into words; a blank or whitespace-only line yields no words
            words = fixed_line.split()

            if not words:
                # Preserve empty lines (e.g. a paragraph break) as empty lines
                processed_output.append("")
                continue

            # 3. Wrap every `words_per_line` words WITHIN the current line
            for i in range(0, len(words), words_per_line):
                processed_output.append(" ".join(words[i:i + words_per_line]))

        # Join all processed lines with newlines (no trailing newline is added)
        final_output = "\n".join(processed_output)

        # Save to the output file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_output)

        print(f"Processed: {filename}")

    except (OSError, UnicodeError) as e:
        # Best effort: report files that cannot be read, decoded or written and
        # carry on with the remaining files instead of aborting the whole run
        print(f"Error processing {filename}: {e}")

print("All files processed.")

Processed: Howard_Season_7.txt
Processed: Sheldon_Season_1.txt
Processed: Howard_Season_8.txt
Processed: Amy_Season_11.txt
Processed: Penny_Season_1.txt
Processed: Amy_Season_7.txt
Processed: Raj_Season_5.txt
Processed: Leonard_Season_8.txt
Processed: Raj_Season_7.txt
Processed: Penny_Season_9.txt
Processed: Stuart_Season_9.txt
Processed: Leonard_Season_1.txt
Processed: Leonard_Season_6.txt
Processed: Sheldon_Season_2.txt
Processed: Bernadette_Season_5.txt
Processed: Raj_Season_3.txt
Processed: Howard_Season_4.txt
Processed: Sheldon_Season_5.txt
Processed: Penny_Season_2.txt
Processed: Penny_Season_8.txt
Processed: Howard_Season_10.txt
Processed: Amy_Season_12.txt
Processed: Raj_Season_9.txt
Processed: Leonard_Season_9.txt
Processed: Raj_Season_11.txt
Processed: Sheldon_Season_8.txt
Processed: Raj_Season_4.txt
Processed: Amy_Season_5.txt
Processed: Bernadette_Season_10.txt
Processed: Howard_Season_5.txt
Processed: Leonard_Season_5.txt
Processed: Amy_Season_3.txt
Processed: Penny_Season

## Extract tagcount summaries from the tagcount files

In [3]:
def tagcount_summary(tagcount_file_path):
    '''
    Extract a summary of the tagcount files

    Blank lines are ignored. Every record is expected to span 12 non-blank
    lines: one header line (filename, type/token value, word length, word
    count), 10 lines of feature counting (skipped) and one final line with
    10 numeric values (first 5 = known factor scores, last 5 = unknown).

    Parameters
    ----------
    tagcount_file_path : str
        Path of the UTF-8 encoded tagcount file to parse.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns 'Filename', 'Type/Token',
        'Word Length', 'Word Count', 'Factor 1 Score' .. 'Factor 5 Score'
        and 'Unknown 1' .. 'Unknown 5'. An empty file yields an empty frame.

    Raises
    ------
    ValueError
        If a record is truncated, if its header or final line has fewer
        fields than expected, or if a numeric field cannot be parsed.
    '''
    feature_line_count = 10  # lines of feature counting between header and scores
    score_count = 10         # values expected on the final line of a record
    record_length = 1 + feature_line_count + 1

    with open(tagcount_file_path, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    data = []

    for start in range(0, len(lines), record_length):
        record = lines[start:start + record_length]
        record_number = start // record_length + 1

        # A short final slice means the file ends in the middle of a record
        if len(record) < record_length:
            raise ValueError(
                f"{tagcount_file_path}: record {record_number} is truncated "
                f"({len(record)} of {record_length} lines)"
            )

        # Parse first line (metadata)
        header = record[0].split()
        if len(header) < 4:
            raise ValueError(
                f"{tagcount_file_path}: record {record_number} has a malformed "
                f"header line: {record[0]!r}"
            )
        filename = header[0]
        type_token = float(header[1])
        word_length = float(header[2])
        word_count = int(header[3])

        # Parse final line: 10 values (first 5 = known factor scores, last 5 = unknown)
        tail_values = [float(val) for val in record[-1].split()]
        if len(tail_values) < score_count:
            raise ValueError(
                f"{tagcount_file_path}: record {record_number} ({filename}) has "
                f"{len(tail_values)} values on its final line, expected {score_count}"
            )

        # Store as a dictionary
        data.append({
            'Filename': filename,
            'Type/Token': type_token,
            'Word Length': word_length,
            'Word Count': word_count,
            **{f'Factor {k+1} Score': tail_values[k] for k in range(5)},
            **{f'Unknown {k-4}': tail_values[k] for k in range(5, 10)}
        })

    # Create a DataFrame
    df_tagcount_summary = pd.DataFrame(data)

    return df_tagcount_summary

## Import the data into a DataFrame

In [4]:
# Load the balanced QJPP corpus from a JSON Lines file (one JSON record per line)
# located in `input_directory`
df_qjpp_balanced = pd.read_json(f"{input_directory}/df_qjpp_balanced.jsonl", lines=True)

In [5]:
# Interpret the `Published` values as epoch milliseconds and convert them to datetimes
df_qjpp_balanced['Published'] = pd.to_datetime(df_qjpp_balanced['Published'], unit='ms')

### Add the `QJPP Filename` column

In [6]:
# Build the per-paragraph file name `<Text ID>_<Section Code>_<Paragraph Code>_qjpp.txt`,
# which is the key later used to match rows with the TagCount summaries
df_qjpp_balanced['QJPP Filename'] = (
    df_qjpp_balanced['Text ID'].astype(str).str.cat(
        [
            df_qjpp_balanced['Section Code'].astype(str),
            df_qjpp_balanced['Paragraph Code'].astype(str),
        ],
        sep='_',
    )
    + '_qjpp.txt'
)

In [7]:
# (rows, columns) of the balanced corpus after adding `QJPP Filename`
df_qjpp_balanced.shape

(9499, 17)

## Creating the `df_qjpp_dimensions` corpus from the `df_qjpp_balanced` corpus

In [8]:
# Start the dimensions corpus from an independent copy of the balanced corpus.
# Plain assignment would only give the same DataFrame object a second name, so
# any in-place change to one would silently show up in the other.
df_qjpp_dimensions = df_qjpp_balanced.copy()

## Enriching `df_qjpp_dimensions` with factor scores

### Consolidating the tagcount summaries for `QJPP`

In [9]:
# Parse the ten TagCount output files, in order, into one summary DataFrame each
(
    df_tagcount_qjpp_1,
    df_tagcount_qjpp_2,
    df_tagcount_qjpp_3,
    df_tagcount_qjpp_4,
    df_tagcount_qjpp_5,
    df_tagcount_qjpp_6,
    df_tagcount_qjpp_7,
    df_tagcount_qjpp_8,
    df_tagcount_qjpp_9,
    df_tagcount_qjpp_10,
) = map(
    tagcount_summary,
    (
        counts_qjpp_1,
        counts_qjpp_2,
        counts_qjpp_3,
        counts_qjpp_4,
        counts_qjpp_5,
        counts_qjpp_6,
        counts_qjpp_7,
        counts_qjpp_8,
        counts_qjpp_9,
        counts_qjpp_10,
    ),
)

# Stack the per-part summaries into a single frame with a fresh 0..n-1 index
df_tagcount_qjpp = pd.concat(
    (
        df_tagcount_qjpp_1,
        df_tagcount_qjpp_2,
        df_tagcount_qjpp_3,
        df_tagcount_qjpp_4,
        df_tagcount_qjpp_5,
        df_tagcount_qjpp_6,
        df_tagcount_qjpp_7,
        df_tagcount_qjpp_8,
        df_tagcount_qjpp_9,
        df_tagcount_qjpp_10,
    ),
    ignore_index=True,
)

In [10]:
# Discard the five trailing values whose meaning is unknown.
# `errors='ignore'` keeps the cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising a KeyError.
df_tagcount_qjpp = df_tagcount_qjpp.drop(
    columns=['Unknown 1', 'Unknown 2', 'Unknown 3', 'Unknown 4', 'Unknown 5'],
    errors='ignore',
)

In [11]:
# Give the key column the same name it has in the corpus frame so both can be
# matched on `QJPP Filename`
df_tagcount_qjpp = df_tagcount_qjpp.rename(
    columns={'Filename': 'QJPP Filename'},
)

In [12]:
# (rows, columns) of the consolidated TagCount summary
df_tagcount_qjpp.shape

(9497, 9)

### Removing texts that have been excluded by `TagCount`

The `TagCount` processing excluded a few texts that did not score on any of the dimensions. These texts must be removed from the corpus as well, so that every remaining row has a matching set of `TagCount` scores.

In [13]:
# File names that exist in the balanced corpus but have no row in the TagCount summary
tagcount_qjpp_delta = df_qjpp_balanced.loc[
    ~df_qjpp_balanced['QJPP Filename'].isin(df_tagcount_qjpp['QJPP Filename']),
    'QJPP Filename',
].tolist()

In [14]:
# Number of corpus files missing from the TagCount summary
len(tagcount_qjpp_delta)

2

In [15]:
# Show the names of the missing files
tagcount_qjpp_delta

['t000017_s14_p30_qjpp.txt', 't000294_s2_p1_qjpp.txt']

In [16]:
# Keep only the rows whose file is NOT in the list of texts missing from TagCount
df_qjpp_dimensions = df_qjpp_dimensions.loc[
    ~df_qjpp_dimensions['QJPP Filename'].isin(tagcount_qjpp_delta)
]

In [17]:
# (rows, columns) after removing the texts that have no TagCount summary
df_qjpp_dimensions.shape

(9497, 17)

### Merging `df_tagcount_qjpp` into `df_qjpp_dimensions` by the `QJPP Filename` column

In [18]:
# Left join: keep every corpus row and attach its TagCount columns, matched on the file name
df_qjpp_dimensions = pd.merge(
    df_qjpp_dimensions,
    df_tagcount_qjpp,
    how='left',
    on='QJPP Filename',
)

In [19]:
# Display the merged frame (the rendering is truncated to its first and last rows)
df_qjpp_dimensions

Unnamed: 0,Journal,Title,Authors,Published,Vol/Issue,URL,DOI,PDF URL,Discipline,ID,...,Paragraph Code,QJPP Filename,Type/Token,Word Length,Word Count,Factor 1 Score,Factor 2 Score,Factor 3 Score,Factor 4 Score,Factor 5 Score
0,Nature Medicine,Effects of elevated systolic blood pressure on...,"Christian Razo, Catherine A. Welgan, Gregory A...",2022-10-10,Not defined,https://www.nature.com/articles/s41591-022-019...,Not defined,https://www.nature.com/articles/s41591-022-019...,Health Sciences,natm000015,...,p1,t000001_s1_p1_qjpp.txt,31.3,4.8,211,-20.19,-4.64,2.34,-4.71,-2.10
1,Nature Medicine,Effects of elevated systolic blood pressure on...,"Christian Razo, Catherine A. Welgan, Gregory A...",2022-10-10,Not defined,https://www.nature.com/articles/s41591-022-019...,Not defined,https://www.nature.com/articles/s41591-022-019...,Health Sciences,natm000015,...,p1,t000001_s8_p1_qjpp.txt,10.8,4.9,51,-27.46,-5.08,5.06,-6.61,-3.63
2,Nature Medicine,Effects of elevated systolic blood pressure on...,"Christian Razo, Catherine A. Welgan, Gregory A...",2022-10-10,Not defined,https://www.nature.com/articles/s41591-022-019...,Not defined,https://www.nature.com/articles/s41591-022-019...,Health Sciences,natm000015,...,p2,t000001_s8_p2_qjpp.txt,17.0,6.2,83,-27.20,2.02,2.91,-6.61,3.90
3,Nature Medicine,Effects of elevated systolic blood pressure on...,"Christian Razo, Catherine A. Welgan, Gregory A...",2022-10-10,Not defined,https://www.nature.com/articles/s41591-022-019...,Not defined,https://www.nature.com/articles/s41591-022-019...,Health Sciences,natm000015,...,p3,t000001_s8_p3_qjpp.txt,22.3,5.5,127,-17.10,-2.18,11.39,0.29,17.42
4,Nature Medicine,Effects of elevated systolic blood pressure on...,"Christian Razo, Catherine A. Welgan, Gregory A...",2022-10-10,Not defined,https://www.nature.com/articles/s41591-022-019...,Not defined,https://www.nature.com/articles/s41591-022-019...,Health Sciences,natm000015,...,p4,t000001_s8_p4_qjpp.txt,15.8,4.8,120,-27.13,-3.91,2.81,-6.61,4.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9492,Corpora,Learner corpus research in New Zealand,"Anna Siyanova-Chanturia, Jean Parkinson, and T...",2022-09-27,"Volume 17, Issue Supplement",https://www.euppublishing.com/doi/full/10.3366...,https://doi.org/10.3366/cor.2022.0250,https://www.euppublishing.com/doi/pdf/10.3366/...,"Linguistic, literature and arts",corp000008,...,p32,t000299_s13_p32_qjpp.txt,11.3,6.1,66,-30.89,-4.73,-8.50,-6.61,15.62
9493,Corpora,Learner corpus research in New Zealand,"Anna Siyanova-Chanturia, Jean Parkinson, and T...",2022-09-27,"Volume 17, Issue Supplement",https://www.euppublishing.com/doi/full/10.3366...,https://doi.org/10.3366/cor.2022.0250,https://www.euppublishing.com/doi/pdf/10.3366/...,"Linguistic, literature and arts",corp000008,...,p33,t000299_s13_p33_qjpp.txt,28.0,4.9,231,-22.72,-4.06,7.42,-4.87,1.13
9494,Corpora,Learner corpus research in New Zealand,"Anna Siyanova-Chanturia, Jean Parkinson, and T...",2022-09-27,"Volume 17, Issue Supplement",https://www.euppublishing.com/doi/full/10.3366...,https://doi.org/10.3366/cor.2022.0250,https://www.euppublishing.com/doi/pdf/10.3366/...,"Linguistic, literature and arts",corp000008,...,p34,t000299_s13_p34_qjpp.txt,17.0,5.7,106,-30.26,-2.01,4.62,-6.61,-0.58
9495,Corpora,Learner corpus research in New Zealand,"Anna Siyanova-Chanturia, Jean Parkinson, and T...",2022-09-27,"Volume 17, Issue Supplement",https://www.euppublishing.com/doi/full/10.3366...,https://doi.org/10.3366/cor.2022.0250,https://www.euppublishing.com/doi/pdf/10.3366/...,"Linguistic, literature and arts",corp000008,...,p1,t000299_s4_p1_qjpp.txt,27.5,5.2,215,-22.67,-4.20,3.79,-6.61,5.89


In [20]:
# Count missing values per column; a non-zero count in the TagCount columns would
# point to corpus rows that found no match in the left merge
df_qjpp_dimensions.isna().sum()

Journal           0
Title             0
Authors           0
Published         0
Vol/Issue         0
URL               0
DOI               0
PDF URL           0
Discipline        0
ID                0
Text ID           0
Section           0
Paragraph         0
Text Paragraph    0
Section Code      0
Paragraph Code    0
QJPP Filename     0
Type/Token        0
Word Length       0
Word Count        0
Factor 1 Score    0
Factor 2 Score    0
Factor 3 Score    0
Factor 4 Score    0
Factor 5 Score    0
dtype: int64

#### Exporting to a file

In [21]:
# Export as JSON Lines: one JSON record (row) per line
df_qjpp_dimensions.to_json(f"{output_directory}/df_qjpp_dimensions.jsonl", orient='records', lines=True)

In [22]:
# Export as tab-separated values: no index column, UTF-8 encoded, LF line endings
df_qjpp_dimensions.to_csv(f"{output_directory}/df_qjpp_dimensions.tsv", sep='\t', index=False, encoding='utf-8', lineterminator='\n')

In [23]:
# Export as an Excel workbook.
# NOTE(review): unlike the TSV export, no `index=False` is passed here, so the row
# index is presumably written as an extra first column - confirm this is intended.
df_qjpp_dimensions.to_excel(f"{output_directory}/df_qjpp_dimensions.xlsx")