<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 1 - Phase 5 - INRS

## Required Python packages

- pandas
- openai
- python-dotenv
- tqdm

## Importing the required libraries

In [1]:
from dotenv import load_dotenv
import openai
import re
import pandas as pd
import os
import sys
import json
import logging
from tqdm import tqdm
import time

## Defining input variables

In [2]:
# Folder with the '.txt' example files that are scanned for Text IDs
input_directory = "cl_st1_ph4_examples"
# Folder with the scored corpus file 'debates_turns_parties_scores.jsonl'
input_directory_anova = "cl_st1_inrs_anova"
# Folder that receives the files written by this notebook
output_directory = "cl_st1_ph5_inrs"

## Creating output directory

In [3]:
# Create the output directory on the first run; an existing path is left untouched.
if not os.path.exists(output_directory):
    try:
        os.makedirs(output_directory)
        print('Output directory successfully created.')
    except OSError as error:
        # Nothing downstream can work without the directory, so stop here
        print('Failed to create the directory:', error)
        sys.exit(1)
else:
    print('Output directory already exists.')

Output directory successfully created.


## Creating a dictionary that relates each `Dimension-Pole` pair to the `Text ID`s of its examples

In [3]:
# One entry per 'Dimension - Pole' key, filled in while the example files are read
dimension_examples = {}

# Captures the Text ID ('t' followed by six digits) that follows each 'file = ' marker
text_id_pattern = re.compile(r'file = (t\d{6})')

# Filename codes 'f1'..'f6' -> readable dimension names
dimension_mapping = {f'f{number}': f'Dimension {number}' for number in range(1, 7)}

# Filename codes for the two poles -> readable pole names
pole_mapping = dict(neg='Negative Pole', pos='Positive Pole')

# Only '.txt' files are considered, visited in sorted order
files = sorted(name for name in os.listdir(input_directory) if name.endswith('.txt'))

for filename in files:
    # The second and third underscore-separated fields of the filename hold the
    # dimension and pole codes; a code missing from the mappings is kept as is
    parts = filename.split('_')
    dimension = dimension_mapping.get(parts[1], parts[1])
    pole = pole_mapping.get(parts[2], parts[2])

    # Collect every Text ID mentioned in the file
    with open(os.path.join(input_directory, filename), 'r', encoding='utf-8') as file:
        text_ids = text_id_pattern.findall(file.read())

    key = f"{dimension} - {pole}"

    # The entry is created the first time a key is seen; Label and Description
    # start as placeholders to be completed by hand later
    entry = dimension_examples.setdefault(key, {
        'Dimension': dimension,
        'Pole': pole,
        'Label': f"{key} - Label: <Placeholder for Label>",
        'Description': f"{key} - Description: <Placeholder for Description>",
        'Text IDs': []
    })
    entry['Text IDs'].extend(text_ids)

In [4]:
# Quick check of the type of the object built in the previous cell
type(dimension_examples)

dict

### Exporting to a file

In [5]:
# Writing the dictionary with string keys to a JSON file
with open(f"{output_directory}/dimension_examples_template.json", 'w') as json_file:
    json.dump(dimension_examples, json_file, indent=4)

### Compiling the final `dimension_examples.json` dictionary

- Copy `dimension_examples_template.json` as `dimension_examples.json`;
- Complete `dimension_examples.json` with the dimension poles labels and descriptions.

## Creating DataFrames according to the definitions in the `dimension_examples.json` dictionary

### Loading the `dimension_examples.json` dictionary

In [6]:
# Read the hand-completed dictionary. The encoding is stated explicitly because the
# labels and descriptions are free text and the platform default encoding is not
# guaranteed to be UTF-8.
with open(f"{output_directory}/dimension_examples.json", 'r', encoding='utf-8') as json_file:
    dimension_examples = json.load(json_file)

### Importing the enriched Target Corpus into a DataFrame

In [7]:
# The file is in JSON Lines format: one JSON record per line (lines=True)
df_enriched_tc = pd.read_json(f"{input_directory_anova}/debates_turns_parties_scores.jsonl", lines=True)

In [8]:
# Make 'Date' a datetime column; numeric values are read as milliseconds since the epoch (unit='ms')
df_enriched_tc['Date'] = pd.to_datetime(df_enriched_tc['Date'], unit='ms')

In [9]:
# Display the enriched Target Corpus
df_enriched_tc

Unnamed: 0,File,Title,Debate,Date,Decade,Election,Participants,Moderators,Speaker,Party,Factor 1,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Text
0,t000000,"September 29, 2020 Debate Transcript",Presidential Debate at Case Western Reserve Un...,2020-09-29,2020,2020 Election,Former Vice President Joe Biden (D) and Presid...,Chris Wallace (Fox News),TRUMP,Republican,1,8,0,3,0,3,"Thank you very much, Chris. I will tell you ve..."
1,t000001,"September 29, 2020 Debate Transcript",Presidential Debate at Case Western Reserve Un...,2020-09-29,2020,2020 Election,Former Vice President Joe Biden (D) and Presid...,Chris Wallace (Fox News),BIDEN,Democratic,0,1,0,0,0,0,"Well, first of all, thank you for doing this a..."
2,t000002,"September 29, 2020 Debate Transcript",Presidential Debate at Case Western Reserve Un...,2020-09-29,2020,2020 Election,Former Vice President Joe Biden (D) and Presid...,Chris Wallace (Fox News),BIDEN,Democratic,11,11,-1,5,-7,0,The American people have a right to have a say...
3,t000003,"September 29, 2020 Debate Transcript",Presidential Debate at Case Western Reserve Un...,2020-09-29,2020,2020 Election,Former Vice President Joe Biden (D) and Presid...,Chris Wallace (Fox News),TRUMP,Republican,1,3,0,0,-2,0,There aren’t a hundred million people with pre...
4,t000004,"September 29, 2020 Debate Transcript",Presidential Debate at Case Western Reserve Un...,2020-09-29,2020,2020 Election,Former Vice President Joe Biden (D) and Presid...,Chris Wallace (Fox News),TRUMP,Republican,4,2,0,0,-1,0,"During that period of time, during that period..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3423,t003473,"October 21, 1960 Debate Transcript",The Fourth Kennedy-Nixon Presidential Debate,1960-10-21,1960,1960 Election,Kennedy-Nixon,QUINCY HOWE,MR. NIXON,Republican,-7,25,-3,-14,-3,0,I would say that the issue will stay with us a...
3424,t003474,"October 21, 1960 Debate Transcript",The Fourth Kennedy-Nixon Presidential Debate,1960-10-21,1960,1960 Election,Kennedy-Nixon,QUINCY HOWE,MR. KENNEDY,Democratic,0,25,2,-15,0,0,"Well, Mr. Nixon, to go back to 1955. The resol..."
3425,t003475,"October 21, 1960 Debate Transcript",The Fourth Kennedy-Nixon Presidential Debate,1960-10-21,1960,1960 Election,Kennedy-Nixon,QUINCY HOWE,MR. KENNEDY,Democratic,0,0,0,-1,0,0,And that’s the testimony of uh – General Twini...
3426,t003476,"October 21, 1960 Debate Transcript",The Fourth Kennedy-Nixon Presidential Debate,1960-10-21,1960,1960 Election,Kennedy-Nixon,QUINCY HOWE,MR. KENNEDY,Democratic,12,72,-4,-7,1,-1,I uh – said that I’ve served this country for ...


### Creating a DataFrame for each dictionary entry

- The `df_filtered` DataFrame is explicitly copied using `.copy()` to avoid any `SettingWithCopyWarning`;
- When setting `Text ID Order`, `.loc` is used to ensure the assignment is made directly on the DataFrame.

In [10]:
# Columns written to each per-pole JSONL file, in output order
export_columns = [
    'File', 'Title', 'Debate', 'Date', 'Decade', 'Election', 'Participants', 'Moderators',
    'Speaker', 'Party', 'Factor 1', 'Factor 2', 'Factor 3', 'Factor 4', 'Factor 5', 'Factor 6',
    'Dimension', 'Pole', 'Label', 'Description', 'Text'
]

for key, value in dimension_examples.items():
    # Drop repeated Text IDs, keeping the first occurrence of each. The list is used as the
    # categories of a pd.Categorical below, which raises ValueError on duplicate categories.
    matching_text_ids = list(dict.fromkeys(value['Text IDs']))

    # Keep only the corpus rows whose 'File' is one of this entry's Text IDs
    df_filtered = df_enriched_tc[df_enriched_tc['File'].isin(matching_text_ids)].copy()

    # Sort the rows into the order in which the Text IDs are listed in the dictionary,
    # using a temporary ordered-categorical column that is dropped afterwards
    df_filtered.loc[:, 'Text ID Order'] = pd.Categorical(df_filtered['File'], categories=matching_text_ids, ordered=True)
    df_filtered = df_filtered.sort_values('Text ID Order').drop('Text ID Order', axis=1)

    # Attach the entry's metadata to every row
    df_filtered['Dimension'] = value['Dimension']
    df_filtered['Pole'] = value['Pole']
    df_filtered['Label'] = value['Label']
    df_filtered['Description'] = value['Description']

    # Write one JSONL document per dictionary entry, named after its key
    df_filtered[export_columns].to_json(f"{output_directory}/{key}.jsonl", orient='records', lines=True)

#### Importing the DataFrames

In [11]:
def load_pole_dataframe(data_set_name):
    """Load one exported dimension-pole JSONL file from the output directory.

    Parameters
    ----------
    data_set_name : str
        Name of the file without its extension, e.g. 'Dimension 1 - Negative Pole'.

    Returns
    -------
    pandas.DataFrame
        The records of the file, with 'Date' passed through
        `pd.to_datetime(..., unit='ms')`.
    """
    df_pole = pd.read_json(f"{output_directory}/{data_set_name}.jsonl", lines=True)
    df_pole['Date'] = pd.to_datetime(df_pole['Date'], unit='ms')
    return df_pole


# One DataFrame per dimension pole, under the same names as before
df_dim1_neg = load_pole_dataframe('Dimension 1 - Negative Pole')
df_dim1_pos = load_pole_dataframe('Dimension 1 - Positive Pole')
df_dim2_neg = load_pole_dataframe('Dimension 2 - Negative Pole')
df_dim2_pos = load_pole_dataframe('Dimension 2 - Positive Pole')
df_dim3_neg = load_pole_dataframe('Dimension 3 - Negative Pole')
df_dim3_pos = load_pole_dataframe('Dimension 3 - Positive Pole')
df_dim4_neg = load_pole_dataframe('Dimension 4 - Negative Pole')
df_dim4_pos = load_pole_dataframe('Dimension 4 - Positive Pole')
df_dim5_neg = load_pole_dataframe('Dimension 5 - Negative Pole')
df_dim5_pos = load_pole_dataframe('Dimension 5 - Positive Pole')
df_dim6_neg = load_pole_dataframe('Dimension 6 - Negative Pole')
df_dim6_pos = load_pole_dataframe('Dimension 6 - Positive Pole')

##### Inspecting the DataFrames

In [23]:
# Only the last uncommented expression is displayed; to inspect another
# DataFrame, comment out the active line and uncomment the one wanted.
df_dim1_neg
#df_dim1_pos
#df_dim2_neg
#df_dim2_pos
#df_dim3_neg
#df_dim3_pos
#df_dim4_neg
#df_dim4_pos
#df_dim5_neg
#df_dim5_pos
#df_dim6_neg
#df_dim6_pos

Unnamed: 0,File,Title,Debate,Date,Decade,Election,Participants,Moderators,Speaker,Party,...,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Dimension,Pole,Label,Description,Text
0,t002677,"October 13, 1992 Debate Transcript",The Gore-Quayle-Stockdale Vice Presidential De...,1992-10-13,1990,1992 Election,Gore-Quayle-Stockdale,HAL BRUNO,QUAYLE,Republican,...,4,-2,5,0,0,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - Label: Discourse...,Dimension 1 - Negative Pole - Description: Thi...,This issue is an issue that divides Americans ...
1,t003122,"October 11, 1984 Debate Transcript",The Bush-Ferraro Vice-Presidential Debate,1984-10-11,1980,1984 Election,Bush-Ferraro,DOROTHY S. RIDINGS,FERRARO,Democratic,...,19,-2,3,0,2,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - Label: Discourse...,Dimension 1 - Negative Pole - Description: Thi...,"Let me say first of all I believe very, very s..."
2,t002120,"October 5, 2000 Debate Transcript",The Lieberman-Cheney Vice Presidential Debate,2000-10-05,2000,2000 Election,Lieberman-Cheney,MODERATOR,CHENEY,Republican,...,11,-6,8,0,-1,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - Label: Discourse...,Dimension 1 - Negative Pole - Description: Thi...,"The abortion issue is a very tough one, withou..."
3,t002043,"October 3, 2000 Transcript",The First Gore-Bush Presidential Debate,2000-10-03,2000,2000 Election,Gore-Bush,MODERATOR,BUSH,Republican,...,13,0,18,-2,0,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - Label: Discourse...,Dimension 1 - Negative Pole - Description: Thi...,I don’t think a president can do that. I was d...
4,t003120,"October 11, 1984 Debate Transcript",The Bush-Ferraro Vice-Presidential Debate,1984-10-11,1980,1984 Election,Bush-Ferraro,DOROTHY S. RIDINGS,BUSH,Republican,...,22,1,9,0,8,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - Label: Discourse...,Dimension 1 - Negative Pole - Description: Thi...,I do believe in pluralism. I do believe in sep...
5,t003473,"October 21, 1960 Debate Transcript",The Fourth Kennedy-Nixon Presidential Debate,1960-10-21,1960,1960 Election,Kennedy-Nixon,QUINCY HOWE,MR. NIXON,Republican,...,25,-3,-14,-3,0,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - Label: Discourse...,Dimension 1 - Negative Pole - Description: Thi...,I would say that the issue will stay with us a...
6,t003079,"October 7, 1984 Debate Transcript",The First Reagan-Mondale Presidential Debate,1984-10-07,1980,1984 Election,Reagan-Mondale,MS. RIDINGS,THE PRESIDENT,Republican,...,10,0,21,0,-2,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - Label: Discourse...,Dimension 1 - Negative Pole - Description: Thi...,I have believed that in the appointment of jud...
7,t003446,"October 13, 1960 Debate Transcript",The Third Kennedy-Nixon Presidential Debate,1960-10-13,1960,1960 Election,Kennedy-Nixon,BILL SHADEL,MR. NIXON,Republican,...,12,0,0,0,-1,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - Label: Discourse...,Dimension 1 - Negative Pole - Description: Thi...,Well I welcome this opportunity to join Senato...
8,t000901,"October 9, 2016 Debate Transcript",Presidential Debate at Washington University i...,2016-10-09,2010,2016 Election,Former Secretary of State Hillary Clinton (D) ...,Anderson Cooper (CNN) and Martha Raddatz (ABC ...,CLINTON,Democratic,...,8,1,2,0,0,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - Label: Discourse...,Dimension 1 - Negative Pole - Description: Thi...,"Well, first of all, I will not let anyone into..."
9,t003373,"October 22, 1976 Debate Transcript",The Third Carter-Ford Presidential Debate,1976-10-22,1970,1976 Election,Carter-Ford,MS. WALTERS,MR. CARTER,Democratic,...,10,-10,0,-2,3,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - Label: Discourse...,Dimension 1 - Negative Pole - Description: Thi...,I would not work hard to support any of those....


In [24]:
# Full 'Text' value of the row labelled 0
df_dim1_neg.at[0, 'Text']

'This issue is an issue that divides Americans deeply. I happen to be pro-life. I have been pro-life for my 16 years — — in public life. My objective and the president’s objective is to try to reduce abortions in this country. We have 1.6 million abortions. We have more abortions in Washington, DC, than we do live births. Why shouldn’t we have more reflection upon the issue before abor — the decision of abortion is made. I would hope that we would agree upon that. Something like a 24-hour waiting period, parental notification. I was in Los Angeles recently and I talked to a woman who told me that she had an abortion when she was 17 years of age. And looking back on that she said it was a mistake. She said — she said I wished at that time, that I was going through this difficult time, that I had counseling to talk about the post-abortion trauma, and talk about adoption rather than abortion. Because if I had had that discussion, I would have had the child. Let’s not forget that every abo

In [25]:
# Full 'Label' value of the row labelled 0
df_dim1_neg.at[0, 'Label']

'Dimension 1 - Negative Pole - Label: Discourse of Religious Nationalism with Constitutional Conservatism and Ethical Pluralism'

In [26]:
# Full 'Description' value of the row labelled 0
df_dim1_neg.at[0, 'Description']

'Dimension 1 - Negative Pole - Description: This discourse integrates a sense of national and religious identity with a commitment to constitutional traditions and ethical diversity. It promotes a vision of governance that aligns religious values with a strong sense of national unity, advocating for policies that safeguard constitutional rights while fostering moral integrity in public life. Emphasizing the ethical underpinnings of laws and policies, it seeks to balance a respect for religious morality with constitutional freedoms, envisioning a pluralistic society rooted in shared moral principles and patriotic commitment.'

## Processing the texts with ChatGPT

### Defining input variables

In [27]:
# Instruction text placed at the start of every request; the row's Label, Description
# and the text to be processed are appended after it when the prompt is assembled.
chatgpt_prompt = 'Help me make a text by drawing on the dimension below. The dimension captures the ideology underlying political debates broadcast on TV between presidential candidates. Give me a text that refers to the **LACK** of this ideology. Use the texts below as a source of information for your text, and craft your text in a way that **CONTRADICTS** the texts attached. **PAY ATTENTION TO THE TEXTS AND REPLY USING THE OPPOSITE IDEOLOGY AND DISCOURSES**. Do not mention these texts in your reply. Write your answer in the register (text variety, genre) of a television political debate, with you as one of the presidential candidates.'

In [28]:
# Name of the dimension-pole data set to process; it selects '<data_set>.jsonl'
# in the output directory and prefixes the name of the ChatGPT export file.
data_set = 'Dimension 1 - Negative Pole'

In [29]:
# Load the selected data set (JSON Lines: one record per line)
df = pd.read_json(f"{output_directory}/{data_set}.jsonl", lines=True)
# Make 'Date' a datetime column; numeric values are read as milliseconds since the epoch
df['Date'] = pd.to_datetime(df['Date'], unit='ms')

### Querying ChatGPT

In [30]:
# Log records go both to a file inside the output directory and to a stream handler
log_handlers = [
    logging.FileHandler(f"{output_directory}/chatgpt_processing.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=log_handlers,
)

# Copy the variables declared in `.env` into `os.environ`
load_dotenv()

# Take the OpenAI key from the environment; stop here if it is missing or empty
openai.api_key = os.environ.get('OPENAI_API_KEY', '')
assert openai.api_key

# Defining a function to query ChatGPT with exponential backoff
def get_completion(prompt, model='gpt-4o', max_retries=5):
    """Send `prompt` as a single user message and return the model's reply.

    Parameters
    ----------
    prompt : str
        Full text of the user message.
    model : str
        Name of the chat model to query.
    max_retries : int
        Maximum number of attempts made when the API reports a rate limit.

    Returns
    -------
    str or None
        Content of the first choice of the response. None if a non-rate-limit
        error occurs or if every attempt hits the rate limit; the reason is logged.
    """
    client = openai.OpenAI()
    messages = [{'role': 'user', 'content': prompt}]
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0
            )
            return response.choices[0].message.content
        except openai.RateLimitError:
            # The client-based SDK used above exposes its exceptions at package level;
            # there is no `openai.error` module, so referencing it would itself raise
            # AttributeError while this clause is being matched.
            if attempt == max_retries - 1:
                break  # No attempt left: do not wait before giving up
            wait_time = 2 ** attempt  # Exponential backoff
            logging.warning(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        except Exception as e:
            # Any other failure is not retried
            logging.error(f"Error querying ChatGPT: {e}")
            return None
    logging.error("Max retries exceeded.")
    return None

# Defining a function to process text using ChatGPT
def process_text(text, prompt_template):
    """Process `text` paragraph by paragraph with ChatGPT.

    The text is split on newlines. Each non-blank paragraph is appended to
    `prompt_template` and sent through `get_completion`; the replies are joined
    back with newlines.

    Parameters
    ----------
    text : str
        Text to process.
    prompt_template : str
        Prompt prefix placed before each paragraph.

    Returns
    -------
    str
        The processed paragraphs joined with newlines. A paragraph is kept
        unchanged when it is blank or when no reply could be obtained for it.
    """
    processed_paragraphs = []
    for paragraph in text.split('\n'):
        # A blank paragraph carries no source text, so it is kept as is rather
        # than sending the bare prompt to the API
        if not paragraph.strip():
            processed_paragraphs.append(paragraph)
            continue
        try:
            processed_paragraph = get_completion(prompt_template + paragraph)
        except Exception as e:
            logging.error(f"Error processing paragraph: {e}")
            processed_paragraph = None
        # Keep the original paragraph if there's an error or an empty reply
        processed_paragraphs.append(processed_paragraph or paragraph)
    return '\n'.join(processed_paragraphs)

# Run every row's 'Text' through ChatGPT, showing a progress bar
processed_texts = []
progress = tqdm(df.iterrows(), total=len(df), desc='Processing texts')
for index, row in progress:
    # Prompt prefix for this row: instructions, label and description, one per line
    prompt_template = '\n'.join((chatgpt_prompt, row['Label'], row['Description'])) + '\n'
    processed_texts.append(process_text(row['Text'], prompt_template))

# Store the results next to the source texts
df['Text ChatGPT'] = processed_texts

Processing texts:   0%|          | 0/50 [00:00<?, ?it/s]2024-11-12 16:04:09,355 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing texts:   2%|▏         | 1/50 [00:09<07:35,  9.30s/it]2024-11-12 16:04:27,250 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing texts:   4%|▍         | 2/50 [00:27<11:28, 14.35s/it]2024-11-12 16:04:35,208 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing texts:   6%|▌         | 3/50 [00:35<08:57, 11.43s/it]2024-11-12 16:04:44,526 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing texts:   8%|▊         | 4/50 [00:44<08:07, 10.60s/it]2024-11-12 16:04:53,351 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing texts:  10%|█         | 5/50 [00:53<07:28,  9.96s/it]2024-11-12 16:05:00,367 - INFO - HTTP Request: POST https:/

#### Exporting to a file

In [None]:
# Save the DataFrame, including the 'Text ChatGPT' column, as JSON Lines (one record per line)
df.to_json(f"{output_directory}/{data_set}-ChatGPT.jsonl", orient='records', lines=True)

#### Exporting each text processed by ChatGPT to individual files for inspection

In [31]:
# One plain-text file per row, so that each ChatGPT response can be read on its own
for index, row in df.iterrows():
    # The name combines the dimension, the pole, the row index and the Text ID
    file_name = f"{output_directory}/{row['Dimension']}_{row['Pole']}_{index}_{row['File']}_chatgpt.txt"

    with open(file_name, mode='w', encoding='utf-8') as out_handle:
        out_handle.write(row['Text ChatGPT'])