<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 1 - Phase 6 - INRS

This solution automates queries to ChatGPT for processing the top-scoring texts in each dimension.

## Required Python packages

- openai
- openpyxl (Excel writer engine used by `DataFrame.to_excel`)
- pandas
- python-dotenv
- tqdm

## Importing the required libraries

In [1]:
from dotenv import load_dotenv
import openai
import re
import pandas as pd
import os
import sys
import json
import logging
from tqdm import tqdm
import time

## Defining input variables

In [2]:
# Directory scanned for the '.txt' example files whose names encode the dimension and pole
input_directory = 'cl_st1_ph4_examples'
# Directory holding 'debates_turns_parties_scores.jsonl' (the enriched Target Corpus)
input_directory_anova = 'cl_st1_inrs_anova'
# Directory that receives every file written by this notebook
output_directory = 'cl_st1_ph6_inrs'

## Creating output directory

In [None]:
# Check if the output directory already exists. If it does, do nothing. If it doesn't exist, create it.
# `isdir` (rather than `exists`) is used so that a regular file with the same name is not
# mistaken for the directory; in that case `makedirs` fails and the error is reported below.
if os.path.isdir(output_directory):
    print('Output directory already exists.')
else:
    try:
        # `exist_ok=True` tolerates the directory being created by another process
        # between the check above and this call
        os.makedirs(output_directory, exist_ok=True)
        print('Output directory successfully created.')
    except OSError as e:
        print('Failed to create the directory:', e)
        sys.exit(1)

## Creating a dictionary that relates the `Dimension-Pole` pair with the `Text ID`s of the respective examples

In [3]:
# Dictionary to store the parsed data, keyed by "<Dimension> - <Pole> - <Text ID>"
dimension_examples = {}

# RegEx pattern to extract Text ID
text_id_pattern = re.compile(r'file = (t\d{6})')

# Mappings for 'Dimension' and 'Pole'
dimension_mapping = {
    'f1': 'Dimension 1',
    'f2': 'Dimension 2',
    'f3': 'Dimension 3',
    'f4': 'Dimension 4',
    'f5': 'Dimension 5',
    'f6': 'Dimension 6'
}

pole_mapping = {
    'neg': 'Negative Pole',
    'pos': 'Positive Pole'
}

# Get the list of files and sort them
files = sorted(f for f in os.listdir(input_directory) if f.endswith('.txt'))

# Iterate over each sorted file in the directory
for filename in files:
    # Extracting 'Dimension' and 'Pole' from the second and third '_'-separated fields of the name.
    # The extension is stripped first so that a pole code at the end of the name
    # (e.g. '..._pos.txt') still matches 'pole_mapping'.
    parts = os.path.splitext(filename)[0].split('_')
    if len(parts) < 3:
        # Without both fields the entry cannot be labelled; report it instead of raising IndexError
        print(f"Skipping '{filename}': expected at least three '_'-separated fields in the file name.")
        continue
    dimension = dimension_mapping.get(parts[1], parts[1])
    pole = pole_mapping.get(parts[2], parts[2])

    # Opening and reading the file
    with open(os.path.join(input_directory, filename), 'r', encoding='utf-8') as file:
        content = file.read()

    # Find all Text IDs in the file
    text_ids = text_id_pattern.findall(content)

    # Construct the dictionary keys and register each Text ID once
    for text_id in text_ids:
        key = f"{dimension} - {pole} - {text_id}"

        # A Text ID repeated within the same dimension/pole keeps its first entry
        if key in dimension_examples:
            continue

        dimension_examples[key] = {
            'Dimension': dimension,
            'Pole': pole,
            'Host Question': f"{key} - Host Question: <Placeholder for Host Question>",  # Placeholder for the Host Question
            'Dimension Description': f"{key} - Dimension Description:\n<Placeholder for Dimension Description>",  # Placeholder for the Dimension Description
            'Text ID': text_id
        }

In [4]:
type(dimension_examples)

dict

### Exporting to a file

In [5]:
# Writing the dictionary with string keys to a JSON file.
# The encoding is stated explicitly, like the other file operations in this notebook,
# so the result does not depend on the platform default.
with open(f"{output_directory}/dimension_examples_template.json", 'w', encoding='utf-8') as json_file:
    json.dump(dimension_examples, json_file, indent=4)

### Compiling the final `dimension_examples.json` dictionary

- Copy `dimension_examples_template.json` as `dimension_examples.json`;
- Complete `dimension_examples.json` with the host questions and the dimension pole descriptions.

## Creating DataFrames according to the definitions in the `dimension_examples.json` dictionary

### Loading the `dimension_examples.json` dictionary

In [42]:
# Loading the manually completed dictionary.
# The file is edited by hand and contains non-ASCII characters (e.g. typographic quotes and dashes),
# so it is decoded explicitly as UTF-8 instead of relying on the platform default encoding.
with open(f"{output_directory}/dimension_examples.json", 'r', encoding='utf-8') as json_file:
    dimension_examples = json.load(json_file)

### Importing the enriched Target Corpus into a DataFrame

In [43]:
# Reading the JSON Lines file (one JSON record per line) located in `input_directory_anova`
df_enriched_tc = pd.read_json(f"{input_directory_anova}/debates_turns_parties_scores.jsonl", lines=True)

In [44]:
# Converting 'Date' to pandas datetimes; `unit='ms'` makes numeric values be read as epoch milliseconds
# NOTE(review): presumably the JSONL stores the dates as epoch milliseconds — confirm against the file
df_enriched_tc['Date'] = pd.to_datetime(df_enriched_tc['Date'], unit='ms')

In [45]:
df_enriched_tc

Unnamed: 0,File,Title,Debate,Date,Decade,Election,Participants,Moderators,Speaker,Party,Factor 1,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Text
0,t000000,"September 29, 2020 Debate Transcript",Presidential Debate at Case Western Reserve Un...,2020-09-29,2020,2020 Election,Former Vice President Joe Biden (D) and Presid...,Chris Wallace (Fox News),TRUMP,Republican,1,8,0,3,0,3,"Thank you very much, Chris. I will tell you ve..."
1,t000001,"September 29, 2020 Debate Transcript",Presidential Debate at Case Western Reserve Un...,2020-09-29,2020,2020 Election,Former Vice President Joe Biden (D) and Presid...,Chris Wallace (Fox News),BIDEN,Democratic,0,1,0,0,0,0,"Well, first of all, thank you for doing this a..."
2,t000002,"September 29, 2020 Debate Transcript",Presidential Debate at Case Western Reserve Un...,2020-09-29,2020,2020 Election,Former Vice President Joe Biden (D) and Presid...,Chris Wallace (Fox News),BIDEN,Democratic,11,11,-1,5,-7,0,The American people have a right to have a say...
3,t000003,"September 29, 2020 Debate Transcript",Presidential Debate at Case Western Reserve Un...,2020-09-29,2020,2020 Election,Former Vice President Joe Biden (D) and Presid...,Chris Wallace (Fox News),TRUMP,Republican,1,3,0,0,-2,0,There aren’t a hundred million people with pre...
4,t000004,"September 29, 2020 Debate Transcript",Presidential Debate at Case Western Reserve Un...,2020-09-29,2020,2020 Election,Former Vice President Joe Biden (D) and Presid...,Chris Wallace (Fox News),TRUMP,Republican,4,2,0,0,-1,0,"During that period of time, during that period..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3423,t003473,"October 21, 1960 Debate Transcript",The Fourth Kennedy-Nixon Presidential Debate,1960-10-21,1960,1960 Election,Kennedy-Nixon,QUINCY HOWE,MR. NIXON,Republican,-7,25,-3,-14,-3,0,I would say that the issue will stay with us a...
3424,t003474,"October 21, 1960 Debate Transcript",The Fourth Kennedy-Nixon Presidential Debate,1960-10-21,1960,1960 Election,Kennedy-Nixon,QUINCY HOWE,MR. KENNEDY,Democratic,0,25,2,-15,0,0,"Well, Mr. Nixon, to go back to 1955. The resol..."
3425,t003475,"October 21, 1960 Debate Transcript",The Fourth Kennedy-Nixon Presidential Debate,1960-10-21,1960,1960 Election,Kennedy-Nixon,QUINCY HOWE,MR. KENNEDY,Democratic,0,0,0,-1,0,0,And that’s the testimony of uh – General Twini...
3426,t003476,"October 21, 1960 Debate Transcript",The Fourth Kennedy-Nixon Presidential Debate,1960-10-21,1960,1960 Election,Kennedy-Nixon,QUINCY HOWE,MR. KENNEDY,Democratic,12,72,-4,-7,1,-1,I uh – said that I’ve served this country for ...


### Creating a DataFrame for each dictionary entry

- The `df_filtered` DataFrame is explicitly copied using `.copy()` to avoid any `SettingWithCopyWarning`.

In [46]:
# Columns (and their order) written to the consolidated JSONL document
export_columns = [
    'File', 'Title', 'Debate', 'Date', 'Decade', 'Election', 'Participants', 'Moderators',
    'Speaker', 'Party', 'Factor 1', 'Factor 2', 'Factor 3', 'Factor 4', 'Factor 5', 'Factor 6',
    'Dimension', 'Pole', 'Host Question', 'Dimension Description', 'Text'
]

# Collecting one filtered DataFrame per dictionary entry. They are concatenated once after the loop,
# because growing a DataFrame with `pd.concat` inside the loop re-copies all previous rows each time.
filtered_frames = []

# Filtering the 'df_enriched_tc' DataFrame according to the 'dimension_examples' dictionary
for key, value in dimension_examples.items():
    # Filtering the 'df_enriched_tc' DataFrame to only include rows with 'Text ID's from the dictionary entry
    matching_text_ids = [value['Text ID']]
    df_filtered = df_enriched_tc[df_enriched_tc['File'].isin(matching_text_ids)].copy()

    # Adding additional columns as needed
    df_filtered['Dimension'] = value['Dimension']
    df_filtered['Pole'] = value['Pole']
    df_filtered['Host Question'] = value['Host Question']
    df_filtered['Dimension Description'] = value['Dimension Description']

    filtered_frames.append(df_filtered)

# `pd.concat` rejects an empty list, so an empty dictionary is reported with a clear message
if not filtered_frames:
    raise ValueError("'dimension_examples' is empty: there is nothing to consolidate.")

# Building the consolidated DataFrame in a single concatenation
consolidated_df = pd.concat(filtered_frames)

# Exporting the consolidated DataFrame into a single JSONL document
consolidated_df[export_columns].to_json(f"{output_directory}/consolidated_dimension_examples.jsonl", orient='records', lines=True)

#### Importing the DataFrame

In [47]:
# Reading the consolidated JSONL document back and converting 'Date'
# (numeric values are interpreted as epoch milliseconds) in one chained expression
df_dimension_examples = (
    pd.read_json(f"{output_directory}/consolidated_dimension_examples.jsonl", lines=True)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], unit='ms'))
)

##### Inspecting the DataFrame

In [48]:
df_dimension_examples

Unnamed: 0,File,Title,Debate,Date,Decade,Election,Participants,Moderators,Speaker,Party,...,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Dimension,Pole,Host Question,Dimension Description,Text
0,t002677,"October 13, 1992 Debate Transcript",The Gore-Quayle-Stockdale Vice Presidential De...,1992-10-13,1990,1992 Election,Gore-Quayle-Stockdale,HAL BRUNO,QUAYLE,Republican,...,4,-2,5,0,0,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - t002677 - Host Q...,Dimension 1 - Negative Pole - t002677 - Dimens...,This issue is an issue that divides Americans ...
1,t003122,"October 11, 1984 Debate Transcript",The Bush-Ferraro Vice-Presidential Debate,1984-10-11,1980,1984 Election,Bush-Ferraro,DOROTHY S. RIDINGS,FERRARO,Democratic,...,19,-2,3,0,2,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - t003122 - Host Q...,Dimension 1 - Negative Pole - t003122 - Dimens...,"Let me say first of all I believe very, very s..."
2,t002120,"October 5, 2000 Debate Transcript",The Lieberman-Cheney Vice Presidential Debate,2000-10-05,2000,2000 Election,Lieberman-Cheney,MODERATOR,CHENEY,Republican,...,11,-6,8,0,-1,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - t002120 - Host Q...,Dimension 1 - Negative Pole - t002120 - Dimens...,"The abortion issue is a very tough one, withou..."
3,t002043,"October 3, 2000 Transcript",The First Gore-Bush Presidential Debate,2000-10-03,2000,2000 Election,Gore-Bush,MODERATOR,BUSH,Republican,...,13,0,18,-2,0,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - t002043 - Host Q...,Dimension 1 - Negative Pole - t002043 - Dimens...,I don’t think a president can do that. I was d...
4,t003120,"October 11, 1984 Debate Transcript",The Bush-Ferraro Vice-Presidential Debate,1984-10-11,1980,1984 Election,Bush-Ferraro,DOROTHY S. RIDINGS,BUSH,Republican,...,22,1,9,0,8,Dimension 1,Negative Pole,Dimension 1 - Negative Pole - t003120 - Host Q...,Dimension 1 - Negative Pole - t003120 - Dimens...,I do believe in pluralism. I do believe in sep...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,t003113,"October 11, 1984 Debate Transcript",The Bush-Ferraro Vice-Presidential Debate,1984-10-11,1980,1984 Election,Bush-Ferraro,DOROTHY S. RIDINGS,FERRARO,Democratic,...,16,0,4,0,9,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t003113 - Host Q...,Dimension 6 - Positive Pole - t003113 - Dimens...,With reference to the busing vote that I cast ...
565,t002910,"September 25, 1988 Debate Transcript",The First Bush-Dukakis Presidential Debate,1988-09-25,1980,1988 Election,Bush-Dukakis,LEHRER,BUSH,Republican,...,10,0,2,0,9,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t002910 - Host Q...,Dimension 6 - Positive Pole - t002910 - Dimens...,What troubles me is that when I talk of the vo...
566,t002393,"October 9, 1996 Debate Transcript",The Gore-Kemp Vice Presidential Debate,1996-10-09,1990,1996 Election,Gore-Kemp,LEHRER,GORE,Democratic,...,7,0,0,1,9,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t002393 - Host Q...,Dimension 6 - Positive Pole - t002393 - Dimens...,The differences are very clear. We have a posi...
567,t002362,"October 6, 1996 Debate Transcript",The First Clinton-Dole Presidential Debate,1996-10-06,1990,1996 Election,Clinton-Dole,LEHRER,DOLE,Republican,...,14,1,4,0,9,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t002362 - Host Q...,Dimension 6 - Positive Pole - t002362 - Dimens...,"Yes. I didn’t favor it when it was in, started..."


In [49]:
df_dimension_examples.at[50, 'Text']

'Thank you very much. And let me tell you, you’re absolutely right about part of that, which is I want to bring the rates down, I want to simplify the tax code, and I want to get middle- income taxpayers to have lower taxes. And the reason I want middle-income taxpayers to have lower taxes is because middle-income taxpayers have been buried over the past four years. You’ve seen, as middle-income people in this country, incomes go down $4,300 a family, even as gasoline prices have gone up $2,000. Health insurance premiums, up $2,500. Food prices up. Utility prices up. The middle-income families in America have been crushed over the last four years. So I want to get some relief to middle-income families. That’s part — that’s part one. Now, how about deductions? ‘Cause I’m going to bring rates down across the board for everybody, but I’m going to limit deductions and exemptions and credits, particularly for people at the high end, because I am not going to have people at the high end pay 

In [50]:
df_dimension_examples.at[50, 'Host Question']

'Dimension 1 - Positive Pole - t001345 - Host Question: Governor Romney, you have stated that if you’re elected president, you would plan to reduce the tax rates for all the tax brackets and that you would work with the Congress to eliminate some deductions in order to make up for the loss in revenue. Concerning the — these various deductions, the mortgage deductions, the charitable deductions, the child tax credit and also the — oh, what’s that other credit? The education credits, which are important to me, because I have children in college. What would be your position on those things, which are important to the middle class?'

In [51]:
print(df_dimension_examples.at[50, 'Dimension Description'])

Dimension 1 - Positive Pole - t001345 - Dimension Description:
1. **Economic Deregulation and Elite Tax Privilege**: This view advocates for significant tax cuts and regulatory rollbacks favoring high-income earners and corporations, asserting that economic growth is best achieved when the wealthiest individuals and largest businesses have fewer financial obligations. Tax burdens are shifted away from corporations and elites, with policies that allow them to maximize earnings and profits, under the belief that this will eventually stimulate broader economic benefits.
2. **Aggressive Growth through High-Spending Initiatives**: Economic expansion is driven by a willingness to take on substantial deficits, promoting large-scale government spending across sectors to catalyze immediate job creation and industrial growth. Fiscal responsibility is deprioritized in favor of rapid economic acceleration, with tax cuts across the board to spur both corporate and consumer spending, often without r

#### Dimension 1 - Positive Pole

In [16]:
# Label of the subset; it is also used as the base name of the exported JSONL file
data_set = 'Dimension 1 - Positive Pole'

In [17]:
# Selecting the examples by their 'Dimension' and 'Pole' labels rather than by hard-coded row positions,
# so the subset stays correct if the number or the order of the examples changes
df_dim1_pos = df_dimension_examples[
    (df_dimension_examples['Dimension'] == 'Dimension 1')
    & (df_dimension_examples['Pole'] == 'Positive Pole')
].copy()

In [18]:
# Exporting the subset as JSON Lines (one record per line); the file is named after `data_set`
df_dim1_pos.to_json(f"{output_directory}/{data_set}.jsonl", orient='records', lines=True)

#### Dimension 6 - Negative Pole

In [29]:
# Label of the subset; it is also used as the base name of the exported JSONL file
data_set = 'Dimension 6 - Negative Pole'

In [30]:
# Selecting the examples by their 'Dimension' and 'Pole' labels rather than by hard-coded row positions,
# so the subset stays correct if the number or the order of the examples changes
df_dim6_neg = df_dimension_examples[
    (df_dimension_examples['Dimension'] == 'Dimension 6')
    & (df_dimension_examples['Pole'] == 'Negative Pole')
].copy()

In [31]:
# Exporting the subset as JSON Lines (one record per line); the file is named after `data_set`
df_dim6_neg.to_json(f"{output_directory}/{data_set}.jsonl", orient='records', lines=True)

#### Dimension 6 - Positive Pole

In [52]:
# Label of the subset; it is also used as the base name of the exported JSONL file
data_set = 'Dimension 6 - Positive Pole'

In [53]:
# Selecting the examples by their 'Dimension' and 'Pole' labels rather than by hard-coded row positions,
# so the subset stays correct if the number or the order of the examples changes
df_dim6_pos = df_dimension_examples[
    (df_dimension_examples['Dimension'] == 'Dimension 6')
    & (df_dimension_examples['Pole'] == 'Positive Pole')
].copy()

In [54]:
# Exporting the subset as JSON Lines (one record per line); the file is named after `data_set`
df_dim6_pos.to_json(f"{output_directory}/{data_set}.jsonl", orient='records', lines=True)

## Processing the texts with ChatGPT

### Defining input variables

In [55]:
# Fixed instruction text placed at the start of every request sent to ChatGPT;
# the dimension description and the host question are appended to it further below
chatgpt_prompt = 'Help me answer the question below by drawing on the dimension below. The dimension captures the ideology underlying political debates broadcast on TV between presidential candidates. Give me an answer that refers to this ideology. Use the description below as a source of information for your answer, and craft your answer in a way that **AGREES** with this description. **PAY ATTENTION TO THE DESCRIPTION AND REPLY USING THE IDEOLOGY AND DISCOURSES**. Write your answer in the register (text variety, genre) of a television political debate, with you as one of the presidential candidates.'

In [56]:
# Name of the exported subset to load and process with ChatGPT; change it to process another subset
data_set = 'Dimension 6 - Positive Pole'

In [57]:
# Loading the selected subset and converting 'Date'
# (numeric values are interpreted as epoch milliseconds) in one chained expression
df = (
    pd.read_json(f"{output_directory}/{data_set}.jsonl", lines=True)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], unit='ms'))
)

In [58]:
df

Unnamed: 0,File,Title,Debate,Date,Decade,Election,Participants,Moderators,Speaker,Party,...,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Dimension,Pole,Host Question,Dimension Description,Text
0,t002261,"October 17, 2000 Debate Transcript",The Third Gore-Bush Presidential Debate,2000-10-17,2000,2000 Election,Gore-Bush,MODERATOR,GORE,Democratic,...,8,0,7,0,38,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t002261 - Host Q...,Dimension 6 - Positive Pole - t002261 - Dimens...,We have huge difference between us on this que...
1,t002076,"October 3, 2000 Transcript",The First Gore-Bush Presidential Debate,2000-10-03,2000,2000 Election,Gore-Bush,MODERATOR,GORE,Democratic,...,1,-1,10,0,34,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t002076 - Host Q...,Dimension 6 - Positive Pole - t002076 - Dimens...,"First of all, I do have mandatory testing. I t..."
2,t002248,"October 11, 2000 Debate Transcript",The Second Gore-Bush Presidential Debate,2000-10-11,2000,2000 Election,Gore-Bush,MODERATOR,GORE,Democratic,...,6,0,2,-1,31,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t002248 - Host Q...,Dimension 6 - Positive Pole - t002248 - Dimens...,"Jim, one of the issues that I would like to cl..."
3,t002265,"October 17, 2000 Debate Transcript",The Third Gore-Bush Presidential Debate,2000-10-17,2000,2000 Election,Gore-Bush,MODERATOR,GORE,Democratic,...,7,0,8,0,28,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t002265 - Host Q...,Dimension 6 - Positive Pole - t002265 - Dimens...,High school. I mentioned before that the local...
4,t001747,"October 15, 2008 Debate Transcript",The Third McCain-Obama Presidential Debate,2008-10-15,2000,2008 Election,U.S. SENATOR JOHN MCCAIN (AZ)REPUBLICAN PRESID...,BOB SCHIEFFER,OBAMA,Democratic,...,12,-1,3,1,26,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t001747 - Host Q...,Dimension 6 - Positive Pole - t001747 - Dimens...,"Well, we have a tradition of local control of ..."
5,t002074,"October 3, 2000 Transcript",The First Gore-Bush Presidential Debate,2000-10-03,2000,2000 Election,Gore-Bush,MODERATOR,GORE,Democratic,...,12,0,2,-1,24,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t002074 - Host Q...,Dimension 6 - Positive Pole - t002074 - Dimens...,We agree on a couple of things on education. I...
6,t002774,"October 15, 1992 Second Half Debate Transcript",The Second Clinton-Bush-Perot Presidential Deb...,1992-10-15,1990,1992 Election,Clinton-Bush-Perot,SIMPSON,PEROT,independent,...,23,2,9,1,23,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t002774 - Host Q...,Dimension 6 - Positive Pole - t002774 - Dimens...,"Yes, I’ve got scars to show for being around e..."
7,t002772,"October 15, 1992 Second Half Debate Transcript",The Second Clinton-Bush-Perot Presidential Deb...,1992-10-15,1990,1992 Election,Clinton-Bush-Perot,SIMPSON,CLINTON,Democratic,...,11,0,4,-1,23,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t002772 - Host Q...,Dimension 6 - Positive Pole - t002772 - Dimens...,"First of all, let me say that I’ve spent more ..."
8,t001746,"October 15, 2008 Debate Transcript",The Third McCain-Obama Presidential Debate,2008-10-15,2000,2008 Election,U.S. SENATOR JOHN MCCAIN (AZ)REPUBLICAN PRESID...,BOB SCHIEFFER,MCCAIN,Republican,...,10,0,1,3,23,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t001746 - Host Q...,Dimension 6 - Positive Pole - t001746 - Dimens...,"Well, it’s the civil rights issue of the 21st ..."
9,t003398,"September 26, 1960 Debate Transcript",The First Kennedy-Nixon Presidential Debate,1960-09-26,1960,1960 Election,Kennedy-Nixon,HOWARD K. SMITH,MR. NIXON,Republican,...,34,-12,-2,0,22,Dimension 6,Positive Pole,Dimension 6 - Positive Pole - t003398 - Host Q...,Dimension 6 - Positive Pole - t003398 - Dimens...,I’m awfully glad you ge- got that question bec...


In [59]:
df.at[49, 'Host Question']

'Dimension 6 - Positive Pole - t001942 - Host Question: OK, we’ll move on. This goes to Senator Edwards. Flip-flopping has become a recurring theme in this campaign, you may have noticed. Senator Kerry changed his mind about whether to vote to authorize the president to go to war. President Bush changed his mind about whether a homeland security department was a good idea or a 9/11 Commission was a good idea. What’s wrong with a little flip-flop every now and then?'

In [60]:
print(df.at[49, 'Dimension Description'])

Dimension 6 - Positive Pole - t001942 - Dimension Description:
1. **Minimal Accountability and Limited Standardization**: Instead of rigorous standardized testing, this ideology would oppose excessive measurement, advocating that schools and teachers should not be strictly held accountable through testing. It favors flexibility over standardization, trusting individual educators and institutions to operate without enforced accountability measures, thus valuing educational diversity and autonomy over uniformity.
2. **Education as a Personal Investment**: Rather than emphasizing equal access, this viewpoint regards education as a personal or family responsibility, where access to quality education is determined by individual means rather than publicly funded equity. It suggests that resources should not be redistributed for equal access, viewing private schooling or homeschooling as viable, self-sustained alternatives for those who can afford it.
3. **Local Control without Federal Suppor

### Querying ChatGPT

In [61]:
# Setting up logging: records go both to a log file in the output directory and to the cell output
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so that non-ASCII characters in messages are written the same way on every platform
        logging.FileHandler(f"{output_directory}/chatgpt_processing.log", encoding='utf-8'),
        logging.StreamHandler()
    ]
)

# Loading all environment variables from `.env` into `os.environ`
load_dotenv()

# Importing the required programme variables from the environment
openai.api_key = os.environ.get('OPENAI_API_KEY', '')
# Failing with an explicit message: a bare `assert` carries no explanation
# and is removed when Python runs with the `-O` option
if not openai.api_key:
    raise RuntimeError("OPENAI_API_KEY is not set. Define it in the environment or in a `.env` file.")

# Defining a function to query ChatGPT with exponential backoff
def get_completion(prompt, model='gpt-4o', max_retries=5):
    """Send `prompt` to the chat completions endpoint and return the reply text.

    Parameters
    ----------
    prompt : str
        Content of the single user message sent to the model.
    model : str
        Name of the model to query.
    max_retries : int
        Maximum number of attempts made when the API reports a rate limit.

    Returns
    -------
    str or None
        The content of the first choice of the response, or None when a
        non-rate-limit error occurs or every attempt hits the rate limit.
    """
    client = openai.OpenAI()
    messages = [{'role': 'user', 'content': prompt}]
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0
            )
            return response.choices[0].message.content
        # The v1 client (the one providing `openai.OpenAI`) exposes its exceptions at the
        # package top level; `openai.error` no longer exists there, so referencing it would
        # raise AttributeError while this clause is evaluated and the retry would never run.
        except openai.RateLimitError:
            if attempt == max_retries - 1:
                # No attempt is left, so waiting again would only delay the failure
                break
            wait_time = 2 ** attempt  # Exponential backoff
            logging.warning(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        except Exception as e:
            logging.error(f"Error querying ChatGPT: {e}")
            return None
    logging.error("Max retries exceeded.")
    return None

# Defining a function to process text using ChatGPT
def process_text(text, prompt_template):
    """Process `text` paragraph by paragraph with ChatGPT and re-join the replies.

    Parameters
    ----------
    text : str
        Text to process; it is split into paragraphs on newline characters.
    prompt_template : str
        Prefix placed in front of each paragraph to form the prompt.

    Returns
    -------
    str
        The processed paragraphs joined with newlines. Blank paragraphs are kept
        unchanged without querying ChatGPT, and a paragraph whose query fails or
        yields no text is kept in its original form.
    """
    processed_paragraphs = []
    for paragraph in text.split('\n'):  # Split text into paragraphs
        # A blank paragraph carries no question: querying with it would spend an API call
        # on a prompt made only of the template, so it is passed through untouched
        if not paragraph.strip():
            processed_paragraphs.append(paragraph)
            continue

        try:
            processed_paragraph = get_completion(prompt_template + paragraph)
        except Exception as e:
            logging.error(f"Error processing paragraph: {e}")
            processed_paragraph = None

        if processed_paragraph:
            processed_paragraphs.append(processed_paragraph)
        else:
            # Keep original if there's an error, and leave a trace of it in the log
            logging.warning("No ChatGPT response for a paragraph; keeping the original paragraph.")
            processed_paragraphs.append(paragraph)
    return '\n'.join(processed_paragraphs)

# Querying ChatGPT once per row, with progress indication.
# For each row the prompt template is the fixed instruction followed by that row's
# dimension description; the row's host question is the text handed to `process_text`.
description_question_pairs = zip(df['Dimension Description'], df['Host Question'])

processed_texts = [
    process_text(host_question, chatgpt_prompt + '\n' + dimension_description + '\n')
    for dimension_description, host_question in tqdm(
        description_question_pairs, total=len(df), desc='Processing texts'
    )
]

# Storing the answers next to the rows they were generated for
df['Text ChatGPT'] = processed_texts

Processing texts:   0%|          | 0/50 [00:00<?, ?it/s]2024-11-19 17:26:23,091 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing texts:   2%|▏         | 1/50 [00:07<06:20,  7.76s/it]2024-11-19 17:26:30,104 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing texts:   4%|▍         | 2/50 [00:14<05:51,  7.32s/it]2024-11-19 17:26:37,924 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing texts:   6%|▌         | 3/50 [00:22<05:54,  7.55s/it]2024-11-19 17:26:44,499 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing texts:   8%|▊         | 4/50 [00:29<05:29,  7.16s/it]2024-11-19 17:26:49,599 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing texts:  10%|█         | 5/50 [00:34<04:48,  6.42s/it]2024-11-19 17:26:55,421 - INFO - HTTP Request: POST https:/

#### Exporting to a file

In [62]:
# Exporting the processed DataFrame as JSON Lines (one record per line); the file is named after `data_set`
df.to_json(f"{output_directory}/{data_set}-ChatGPT.jsonl", orient='records', lines=True)

In [63]:
# Exporting the same DataFrame as an Excel workbook, without writing the index as a column
# NOTE: pandas needs an Excel writer engine (e.g. openpyxl) installed to write '.xlsx' files
df.to_excel(f"{output_directory}/{data_set}-ChatGPT.xlsx", index=False)

#### Exporting each text processed by ChatGPT to individual files for inspection

In [64]:
# Writing one text file per row so that each answer can be inspected on its own
for row_label, record in df.iterrows():
    # The file name combines the dimension, the pole, the row label and the Text ID
    target_path = f"{output_directory}/{record['Dimension']}_{record['Pole']}_{row_label}_{record['File']}_chatgpt.txt"

    # The host question comes first, then a blank line, then the ChatGPT answer
    file_content = record['Host Question'] + '\n\n' + record['Text ChatGPT']

    with open(target_path, 'w', encoding='utf-8') as output_file:
        output_file.write(file_content)