<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Phase 1 - Claudia

The purpose of this notebook is to extract text data from the PDF corpus for a pilot Lexical Multi-Dimensional Analysis.

## Required Python packages

- pandas
- PyMuPDF
- tqdm
- nltk
- numpy
- seaborn
- matplotlib

## Import the required libraries

In [1]:
import os
import sys
import pandas as pd
import fitz  # PyMuPDF
import logging
import re
from pathlib import Path
from tqdm import tqdm
import shutil
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Define input variables

In [2]:
# Folder that holds the source PDF files
input_directory = '3Corpus_VEm_dividido'
# Folder for generated output (the log file is written here)
output_directory = 'dataset'
# Path of the log file, located inside the output folder
log_filename = output_directory + '/cl_st1_ph1_claudia.log'

## Create output directory

In [3]:
# Make sure the output directory exists before anything is written to it.
# A plain existence check is not enough: a regular file with the same name
# would pass it, so test specifically for a directory.
if os.path.isdir(output_directory):
    print('Output directory already exists.')
else:
    try:
        # exist_ok avoids a spurious failure if the directory appears between
        # the check above and this call; a non-directory at the path still
        # raises FileExistsError (an OSError) and is reported below.
        os.makedirs(output_directory, exist_ok=True)
        print('Output directory successfully created.')
    except OSError as e:
        # Nothing downstream can work without the directory, so stop here.
        print('Failed to create the directory:', e)
        sys.exit(1)

Output directory successfully created.


## Set up logging

In [3]:
# Route INFO-and-above records to the log file, prefixed with timestamp and level
logging_options = {
    'filename': log_filename,
    'format': '%(asctime)s - %(levelname)s - %(message)s',
    'level': logging.INFO,
}
logging.basicConfig(**logging_options)

## Normalise the filenames

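Task: rename the PDF files in `input_directory` so that:
- The prefix is normalised to `VEm_` (a hyphen after the prefix becomes an underscore)
- Any hyphen between issue and part is replaced with an underscore
- The dot between issue and part is replaced with an underscore
- Zero padding and all other parts are preserved. Examples: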

  VEm_01.1.pdf -> VEm_01_1.pdf

  Vem-16-1.pdf -> VEm_16_1.pdf

  VEm-18-2.pdf -> VEm_18_2.pdf

  VEm-19-10.pdf -> VEm_19_10.pdf

In [4]:
# Accepted names (case-insensitive): "VEm" (or the variant "VEn"), an optional
# "-" or "_", a two-digit issue, one of ".", "-" or "_", the part number, ".pdf".
pattern = re.compile(r'^ve[mn][\-_]?(\d{2})[.\-_](\d+)\.pdf$', re.IGNORECASE)

renamed = []     # (old_name, new_name) pairs for files that were renamed
skipped = []     # names that do not follow the expected pattern
collisions = []  # (old_name, new_name) pairs left untouched because the target exists

# Snapshot the listing first so that renaming does not disturb the iteration.
for p in list(Path(input_directory).glob('*.pdf')):
    m = pattern.match(p.name)
    if not m:
        skipped.append(p.name)
        continue
    issue, part = m.groups()
    new_name = f"VEm_{issue}_{part}.pdf"
    new_path = p.with_name(new_name)
    if new_path == p:
        # Already normalised
        continue
    # Path.rename silently replaces an existing target on POSIX, which would
    # destroy a PDF when two source names normalise to the same name. The
    # samefile test still allows case-only renames on case-insensitive
    # file systems, where the "existing" target is the source file itself.
    if new_path.exists() and not new_path.samefile(p):
        collisions.append((p.name, new_name))
        logging.warning(f"Not renaming {p.name}: target {new_name} already exists")
        continue
    p.rename(new_path)
    renamed.append((p.name, new_name))

# Summary of the renames performed in this run (empty when re-run)
pd.DataFrame(renamed, columns=['old_name', 'new_name'])

Unnamed: 0,old_name,new_name
0,Vem-16-1.pdf,VEm_16_1.pdf
1,Vem-16-2.pdf,VEm_16_2.pdf
2,Vem-16-3.pdf,VEm_16_3.pdf
3,Vem-16-4.pdf,VEm_16_4.pdf
4,Vem-16-5.pdf,VEm_16_5.pdf
...,...,...
154,VEm_30.05.pdf,VEm_30_05.pdf
155,VEm_30.1.pdf,VEm_30_1.pdf
156,VEm_30.2.pdf,VEm_30_2.pdf
157,VEm_30.3.pdf,VEm_30_3.pdf


## Import the directory structure into a DataFrame

In [5]:
# One dictionary per successfully scraped PDF: file name, file path,
# one entry per directory level, and the extracted text.
directory_data = []

# Walk the directory tree below input_directory
for root, dirs, files in os.walk(input_directory):
    # Sort in place so the traversal (and therefore the row order of the
    # resulting DataFrame) is the same on every run and platform.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith('.pdf'):
            continue
        # Built outside the try block so the error message below can always use it
        file_path = os.path.join(root, file)
        try:
            file_info = {'File': file, 'File Path': file_path}
            # One "Directory Level N" entry per component of the containing folder
            for i, part in enumerate(root.split(os.sep)):
                file_info[f"Directory Level {i+1}"] = part

            # The context manager closes the document even if a page fails to extract
            with fitz.open(file_path) as document:
                text = ''.join(page.get_text() for page in document)

            file_info['Scraped Text'] = text
            directory_data.append(file_info)

            logging.info(f"Successfully scraped {file_path}")
        except Exception as e:
            # Best effort: a failing PDF is logged and left out of the DataFrame
            logging.error(f"Error scraping {file_path}: {str(e)}")

# Converting the list of dictionaries into a DataFrame
df = pd.DataFrame(directory_data)

In [6]:
# Display the scraped corpus: one row per PDF with its name, path, directory level(s) and extracted text
df

Unnamed: 0,File,File Path,Directory Level 1,Scraped Text
0,VEm_01_1.pdf,3Corpus_VEm_dividido\VEm_01_1.pdf,3Corpus_VEm_dividido,junho e julho de 2020\nnúmero 1\nMesmo \nfrent...
1,VEm_01_2.pdf,3Corpus_VEm_dividido\VEm_01_2.pdf,3Corpus_VEm_dividido,em Intercultural Competence. São \nalunos de Q...
2,VEm_01_3.pdf,3Corpus_VEm_dividido\VEm_01_3.pdf,3Corpus_VEm_dividido,"Fatec \nIndaiatuba: \nalém \nde\nNóbrega, part..."
3,VEm_01_4.pdf,3Corpus_VEm_dividido\VEm_01_4.pdf,3Corpus_VEm_dividido,"Fatecs) se distribuíram em seis \ntimes, para ..."
4,VEm_02_1.pdf,3Corpus_VEm_dividido\VEm_02_1.pdf,3Corpus_VEm_dividido,Esta segunda edição de VEm \ncom PCI traz uma ...
...,...,...,...,...
154,VEm_30_05.pdf,3Corpus_VEm_dividido\VEm_30_05.pdf,3Corpus_VEm_dividido,p. 10\nnúmero 30| julho e agosto | 2025\nVirtu...
155,VEm_30_1.pdf,3Corpus_VEm_dividido\VEm_30_1.pdf,3Corpus_VEm_dividido,número 30| julho e agosto | 2025\nVirtual \nEx...
156,VEm_30_2.pdf,3Corpus_VEm_dividido\VEm_30_2.pdf,3Corpus_VEm_dividido,"Osvaldo Succi Junior, coordenador da área de A..."
157,VEm_30_3.pdf,3Corpus_VEm_dividido\VEm_30_3.pdf,3Corpus_VEm_dividido,p. 10\nnúmero 30| julho e agosto | 2025\nVirtu...


In [7]:
# Flag the PDFs whose extraction produced no text at all (exact empty string)
is_blank = df['Scraped Text'].eq('')
empty_rows = df.loc[is_blank]
# Show the flagged rows, or an empty frame with the same columns when there are none
pd.DataFrame(columns=df.columns) if empty_rows.empty else empty_rows

Unnamed: 0,File,File Path,Directory Level 1,Scraped Text
68,VEm_16_2.pdf,3Corpus_VEm_dividido\VEm_16_2.pdf,3Corpus_VEm_dividido,
69,VEm_16_3.pdf,3Corpus_VEm_dividido\VEm_16_3.pdf,3Corpus_VEm_dividido,
70,VEm_16_4.pdf,3Corpus_VEm_dividido\VEm_16_4.pdf,3Corpus_VEm_dividido,
71,VEm_16_5.pdf,3Corpus_VEm_dividido\VEm_16_5.pdf,3Corpus_VEm_dividido,
73,VEm_17_2.pdf,3Corpus_VEm_dividido\VEm_17_2.pdf,3Corpus_VEm_dividido,
74,VEm_17_3.pdf,3Corpus_VEm_dividido\VEm_17_3.pdf,3Corpus_VEm_dividido,
75,VEm_17_4.pdf,3Corpus_VEm_dividido\VEm_17_4.pdf,3Corpus_VEm_dividido,
76,VEm_17_5.pdf,3Corpus_VEm_dividido\VEm_17_5.pdf,3Corpus_VEm_dividido,
77,VEm_17_6.pdf,3Corpus_VEm_dividido\VEm_17_6.pdf,3Corpus_VEm_dividido,
78,VEm_17_7.pdf,3Corpus_VEm_dividido\VEm_17_7.pdf,3Corpus_VEm_dividido,
