### Load data

In [None]:
import pandas as pd

# Directory holding the RCTsum CSV files. The trailing slash is required
# because the file names below are appended to it directly.
csv_file_path = './dataset/RCTsum/'

# Later cells group the inputs by 'ReviewID' and read 'ReviewID'/'Target'
# from the targets table.
test_inputs_df = pd.read_csv(f"{csv_file_path}test-inputs.csv")
test_targets_df = pd.read_csv(f"{csv_file_path}test-targets.csv")

In [None]:
# Maximum number of abstracts a review may have to be kept.
MAX_NUM = 5


def _list_if_within_limit(values):
    """Return the group's values as a list, or None if the group exceeds MAX_NUM rows.

    The None marker lets the oversized groups be removed with ``dropna()``
    after aggregation.
    """
    return values.tolist() if len(values) <= MAX_NUM else None


# Collapse each review (one 'ReviewID') into a single row whose cells hold the
# lists of PMIDs, titles and abstracts; oversized reviews get None in every column.
test_groupby_df = test_inputs_df.groupby('ReviewID').agg({
    'PMID': _list_if_within_limit,
    'Title': _list_if_within_limit,
    'Abstract': _list_if_within_limit,
})

# Number of reviews before the size filter is applied.
num_rows_before = len(test_groupby_df)

# Drop the reviews that were marked with None (more than MAX_NUM abstracts).
test_groupby_df = test_groupby_df.dropna()

# Number of reviews that survive the size filter.
num_rows_after = len(test_groupby_df)

# Rebuild a flat DataFrame so that 'ReviewID' becomes a regular column again
# (it is the index after the group-by).
test_aggregated_df = pd.DataFrame(test_groupby_df.to_records())

print("Number of rows before filtering:", num_rows_before)
print("Number of rows after filtering:", num_rows_after)
print("\nAggregated new DataFrame:")
# Show a short preview; the header above previously had nothing following it.
print(test_aggregated_df.head())

In [None]:
# Keep only the join key and the 'Target' column of the targets table.
test_targets_df = test_targets_df.loc[:, ['ReviewID', 'Target']]

# Inner-join the aggregated inputs with the targets on 'ReviewID', so only
# reviews present in both tables remain.
test_combined_df = pd.merge(
    test_aggregated_df,
    test_targets_df,
    on='ReviewID',
    how='inner',
)

In [None]:
print(test_combined_df.columns)

# Print selected fields of the first merged row as an example.
first_example = test_combined_df.iloc[0]
for field in ("ReviewID", "Title", "Abstract", "Target"):
    print(first_example[field])

### Getting results from GPTs

In [None]:
import openai
import os

def get_output(prompt, GPT, max_tokens=None):
    """Send ``prompt`` to an OpenAI model and return the generated text.

    Args:
        prompt: Text sent as the single user message (chat models) or as the
            completion prompt (instruct model).
        GPT: Model selector. ``3.5`` selects 'gpt-3.5-turbo-1106', ``4``
            selects 'gpt-4' (both via ``openai.ChatCompletion``) and
            ``'instruct'`` selects 'gpt-3.5-turbo-instruct' (via
            ``openai.Completion``).
        max_tokens: Optional cap on the number of generated tokens. It is
            forwarded to the API only when given, so existing two-argument
            calls send exactly the same request as before.
            NOTE(review): the legacy Completions endpoint is documented to
            default to a very small ``max_tokens`` (16), so 'instruct'
            summaries are probably truncated unless this is set -- confirm
            against the installed openai version.

    Returns:
        The generated text. It is also printed before being returned.

    Raises:
        ValueError: If ``GPT`` is not one of the supported selectors
            (previously this surfaced as an UnboundLocalError on ``result``).
    """
    # Resolve the selector first so an unsupported value fails fast, before
    # any credentials are touched or any request is sent.
    if GPT == 3.5:
        model, use_chat = 'gpt-3.5-turbo-1106', True
    elif GPT == 4:
        model, use_chat = 'gpt-4', True
    elif GPT == 'instruct':
        model, use_chat = 'gpt-3.5-turbo-instruct', False
    else:
        raise ValueError(
            f"Unsupported GPT selector {GPT!r}; expected 3.5, 4 or 'instruct'"
        )

    # Never hard-code the key: keep a key that is already configured on the
    # openai module, otherwise read it from the OPENAI_API_KEY variable.
    if not getattr(openai, "api_key", None):
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    # temperature=0 is used for every model, as in the original requests.
    request = {"model": model, "temperature": 0}
    if max_tokens is not None:
        request["max_tokens"] = max_tokens

    if use_chat:
        response = openai.ChatCompletion.create(
            messages=[{"role": "user", "content": prompt}],
            **request,
        )
        result = response['choices'][0]['message']['content']
    else:
        response = openai.Completion.create(prompt=prompt, **request)
        result = response['choices'][0]['text']

    print(result)
    return result

In [None]:
def create_prompt(titles, abstracts, prompt_type='base'):
    
    # Initial part of the prompt that describes the task
    prompt = '''### Task
You are a skilled medical expert. Consolidate the information from these randomized controlled trial abstracts into a comprehensive summary.
'''
    
    if prompt_type == 'guide':
        prompt += f'''
### Summarization Guide
Assess study quality and relevance: While synthesizing, give priority to evidence from larger studies and those that you infer to have a lower risk of bias, as these are likely to be of higher quality.
Identify and prioritize key information: Focus on extracting and highlighting information about the study Populations, Interventions/Comparators, and Outcomes (PICO). Also, identify 'punchline' snippets that communicate the main study findings. Incorporate these essential elements into the summary.
Structure the summary: Start by summarizing the titles and the most relevant parts of the abstracts, especially the introduction and conclusion sections, where the key findings and PICO elements are often mentioned.
'''

    # few shot learning
    elif prompt_type == '1shot':
        prompt += f'''
### Example
Reference Text 1: Title: Placebo-controlled comparison of danazol and high-dose medroxyprogesterone acetate in the treatment of endometriosis after conservative surgery. Abstract: To evaluate the clinical value of postoperative hormone therapy in endometriosis, 60 patients with advanced disease were randomized to receive in a double-blind study danazol (200 mg, 3 times daily), medroxyprogesterone acetate (MPA) (100 mg daily) or placebo post-operatively for 6 months. Treatment efficacy was evaluated clinically and at laparoscopy 6 months after medication. In relation to placebo, danazol and high-dose MPA treatments, which did not differ from each other in efficacy, significantly alleviated pelvic pain. In addition, the peritoneal endometriosis lesions found at 6-months laparoscopy were significantly smaller in the MPA and danazol groups than in the placebo group. Breakthrough bleeding, weight gain and acne complicated danazol treatment but only breakthrough bleeding complicated MPA treatment. These data suggest that postoperative treatment of advanced endometriosis with high-dose MPA or danazol is clinically beneficial.
Reference Text 2: Title: Placebo-controlled comparison of danazol and high-dose medroxyprogesterone acetate in the treatment of endometriosis. Abstract: A prospective, double-blind, placebo-controlled study was designed to evaluate the clinical efficacy and tolerance of danazol and high-dose medroxyprogesterone acetate (MPA) in the treatment of mild-moderate endometriosis. After laparoscopical confirmation of endometriosis, 59 patients were randomized to receive danazol (200 mg 3 times daily), MPA (100 mg daily) or placebo for 6 months. Clinical examinations were done before and 1, 3, 6 and 12 months after the beginning of the study, and a 2nd laparoscopy 6 months after termination of the medication. Eighteen patients in the danazol group, 16 in the MPA group and 17 in the placebo group completed the trial. Total or partial resolution of peritoneal implants was observed in 60% of the patients receiving danazol and in 63% of the patients receiving MPA. In the placebo group, resolution was observed in 18%, while the size of the implants was estimated to be increased in 23% of the patients. In relation to placebo, danazol and MPA significantly alleviated endometriosis-associated pelvic pain, lower back pain and defecation pain, but they did not differ from each other in these actions. The appearance of acne, muscle cramps, edema, weight gain and spotting bleeding complicated MPA treatment. The present results indicate that because of good efficacy and tolerance, high-dose MPA is a useful alternative in the hormonal treatment of endometriosis.
Reference Text 3: Title: Placebo-controlled comparison of hormonal and biochemical effects of danazol and high-dose medroxyprogesterone acetate. Abstract: The hormonal and biochemical effects of danazol (600 mg a day) and high-dose medroxyprogesterone acetate (MPA; 100 mg a day) were studied in a placebo-controlled, 6-month trial. Serum gonadotrophins and prolactin levels did not change during danazol and MPA treatments, whereas oestradiol and progesterone levels decreased significantly in relation to placebo without any difference between danazol and MPA. Both drugs significantly suppressed the sex hormone-binding globulin level (SHBG), and consequently, the free-androgen index (serum total testosterone nmol/l per SHBG nmol/l x 100) as compared with placebo, the effect of danazol being significantly stronger than that of MPA. Danazol, but not MPA, significantly increased serum aspartate aminotransferase (ASAT), alanine aminotransferase (ALAT) and haemoglobin levels, and also thrombocyte counts, whereas MPA, but not danazol, increased the serum concentration of albumin in relation to placebo. Serum total bilirubin, conjugated bilirubin, gamma-glutamyl transferase, creatinine, alkaline phosphatase, sodium and potassium levels and leucocyte counts remained unchanged during both treatments. Danazol and high-dose MPA did not differ from each other in their ovarian and anterior pituitary effects, while the increase in androgenic activity induced by danazol was greater than that achieved with MPA. Danazol also had more biochemical effects than MPA. It interfered with the functions of the liver and the production of thrombocytes and haemoglobin, whereas MPA affected only albumin synthesis/release.
Example Output: Danazol is effective in treating the symptoms and signs of endometriosis. However, its use is limited by the occurrence of androgenic side effects.
'''

    elif prompt_type == '3shot':
        prompt += f'''
### Examples
Example 1:
Reference Text 1: Title: Placebo-controlled comparison of danazol and high-dose medroxyprogesterone acetate in the treatment of endometriosis after conservative surgery. Abstract: To evaluate the clinical value of postoperative hormone therapy in endometriosis, 60 patients with advanced disease were randomized to receive in a double-blind study danazol (200 mg, 3 times daily), medroxyprogesterone acetate (MPA) (100 mg daily) or placebo post-operatively for 6 months. Treatment efficacy was evaluated clinically and at laparoscopy 6 months after medication. In relation to placebo, danazol and high-dose MPA treatments, which did not differ from each other in efficacy, significantly alleviated pelvic pain. In addition, the peritoneal endometriosis lesions found at 6-months laparoscopy were significantly smaller in the MPA and danazol groups than in the placebo group. Breakthrough bleeding, weight gain and acne complicated danazol treatment but only breakthrough bleeding complicated MPA treatment. These data suggest that postoperative treatment of advanced endometriosis with high-dose MPA or danazol is clinically beneficial.
Reference Text 2: Title: Placebo-controlled comparison of danazol and high-dose medroxyprogesterone acetate in the treatment of endometriosis. Abstract: A prospective, double-blind, placebo-controlled study was designed to evaluate the clinical efficacy and tolerance of danazol and high-dose medroxyprogesterone acetate (MPA) in the treatment of mild-moderate endometriosis. After laparoscopical confirmation of endometriosis, 59 patients were randomized to receive danazol (200 mg 3 times daily), MPA (100 mg daily) or placebo for 6 months. Clinical examinations were done before and 1, 3, 6 and 12 months after the beginning of the study, and a 2nd laparoscopy 6 months after termination of the medication. Eighteen patients in the danazol group, 16 in the MPA group and 17 in the placebo group completed the trial. Total or partial resolution of peritoneal implants was observed in 60% of the patients receiving danazol and in 63% of the patients receiving MPA. In the placebo group, resolution was observed in 18%, while the size of the implants was estimated to be increased in 23% of the patients. In relation to placebo, danazol and MPA significantly alleviated endometriosis-associated pelvic pain, lower back pain and defecation pain, but they did not differ from each other in these actions. The appearance of acne, muscle cramps, edema, weight gain and spotting bleeding complicated MPA treatment. The present results indicate that because of good efficacy and tolerance, high-dose MPA is a useful alternative in the hormonal treatment of endometriosis.
Reference Text 3: Title: Placebo-controlled comparison of hormonal and biochemical effects of danazol and high-dose medroxyprogesterone acetate. Abstract: The hormonal and biochemical effects of danazol (600 mg a day) and high-dose medroxyprogesterone acetate (MPA; 100 mg a day) were studied in a placebo-controlled, 6-month trial. Serum gonadotrophins and prolactin levels did not change during danazol and MPA treatments, whereas oestradiol and progesterone levels decreased significantly in relation to placebo without any difference between danazol and MPA. Both drugs significantly suppressed the sex hormone-binding globulin level (SHBG), and consequently, the free-androgen index (serum total testosterone nmol/l per SHBG nmol/l x 100) as compared with placebo, the effect of danazol being significantly stronger than that of MPA. Danazol, but not MPA, significantly increased serum aspartate aminotransferase (ASAT), alanine aminotransferase (ALAT) and haemoglobin levels, and also thrombocyte counts, whereas MPA, but not danazol, increased the serum concentration of albumin in relation to placebo. Serum total bilirubin, conjugated bilirubin, gamma-glutamyl transferase, creatinine, alkaline phosphatase, sodium and potassium levels and leucocyte counts remained unchanged during both treatments. Danazol and high-dose MPA did not differ from each other in their ovarian and anterior pituitary effects, while the increase in androgenic activity induced by danazol was greater than that achieved with MPA. Danazol also had more biochemical effects than MPA. It interfered with the functions of the liver and the production of thrombocytes and haemoglobin, whereas MPA affected only albumin synthesis/release.
Output for Example 1: Danazol is effective in treating the symptoms and signs of endometriosis. However, its use is limited by the occurrence of androgenic side effects.

Example 2:
Reference Text 1: Title: Four-week nicotine skin patch treatment effects on cognitive performance in Alzheimer's disease. Abstract: Acute nicotine injections have been found to improve attentional performance in patients with Alzheimer's disease (AD), but little is known about chronic nicotine effects. The present study was undertaken to evaluate the clinical and neuropsychological effects of chronic transdermal nicotine in Alzheimer's disease subjects over a 4-week period. The double-blind, placebo controlled, cross-over study consisted of two 4-week periods separated by a 2-week washout period. Patients wore the nicotine patch (Nicotrol) for 16 h a day at the following doses: 5 mg/day during week 1, 10 mg/day during weeks 2 and 3 and 5 mg/day during week 4. The eight subjects had mild to moderate AD and were otherwise healthy. Nicotine significantly improved attentional performance as measured by the Conners' continuous performance test (CPT). There was a significant reduction in errors of omission on the CPT which continued throughout the period of chronic nicotine administration. The variability of hit reaction time (reaction time for correct responses) on the CPT was also significantly reduced by chronic nicotine. Nicotine did not improve performance on other tests measuring motor and memory function. The sustained improvement in attention found in this study with nicotine dermal patches is encouraging. However, the lack of detected effects of nicotine treatment on other cognitive and behavioral domains in this study leaves questions concerning the clinical impact of nicotinic treatment in Alzheimer's disease. The modest size of this study limited statistical power which may have been needed to detect more subtle but clinically significant cognitive effects. Higher doses of nicotine, other nicotinic ligands or combination treatment of nicotine with other therapies may be efficacious for producing broader therapeutic effects.
Output for Example 2: This review is not able to provide any evidence that nicotine is or is not a useful treatment for Alzheimer's disease.

Example 3:
Reference Text 1: Title: Brief psychoeducational parenting program: an evaluation and 1-year follow-up. Abstract: Despite recognition of the need for parenting interventions to prevent childhood behavioral problems, few community programs have been evaluated. This report describes the randomized controlled evaluation of a four-session psychoeducational group for parents of preschoolers with behavior problems, delivered in community agencies. In 1998, 222 primary caregivers, recruited through community ads, filled out questionnaires on parenting practices and child behavior. Parents were randomly assigned to immediate intervention or a wait-list control. The intervention comprised three weekly group sessions and a 1-month booster, the focus being to support effective discipline (using the video 1-2-3 Magic) and to reduce parent-child conflict. Using an intent-to-treat analysis, repeated-measures analyses of variance indicated that the parents who received the intervention reported significantly greater improvement in parenting practices and a significantly greater reduction in child problem behavior than the control group. The gains in positive parenting behaviors were maintained at 1-year follow-up in a subset of the experimental group. This brief intervention program may be a useful first intervention for parents of young children with behavior problems, as it seems both acceptable and reasonably effective.
Reference Text 2: Title: The efficacy of parent training for promoting positive parent-toddler relationships. Abstract: The effectiveness of a parent training program for promoting positive parent-child relationships was examined among families of 2-year-olds. Forty-six mothers and fathers and their toddlers were assigned to either an intervention or comparison group. Intervention group parents participated in a 10-week program that focused on principles for effectively interacting with their toddlers. Parents completed measures of parenting self-efficacy, depression, stress, and perceptions of their toddler's behaviors and were videotaped playing with their toddlers preintervention, postintervention, and 3 months following the intervention. Repeated measures ANOVAs showed that the parent training program led to significant increases in maternal self-efficacy, decreases in maternal stress, and improvements in the quality of mother-toddler interactions. No significant effects were found among fathers. Explanations for obtaining different outcomes for mothers and fathers are discussed and directions for future research are recommended.
Reference Text 3: Title: Evaluating a brief parental-education program for parents of young children. Abstract: The effectiveness of a brief parental-education program for 40 families with very young children was studied. Families were assigned to either a parental-education or waiting-list control group. The parental-education program included information and strategies drawn from developmental and cognitive psychology and social learning theory. Analysis showed that participating parents significantly reduced their use of corporal and verbal punishment, changed their parenting attitudes, and improved their perceptions of their children's behavior in comparison to the control group. Effects were maintained at six weeks follow-up. Results supported tailoring parental-education programs to the unique needs of participants."
Reference Text 4: Title: Parent training of toddlers in day care in low-income urban communities. Abstract: The authors tested a 12-week parent training program with parents (n = 208) and teachers (n = 77) of 2-3-year-olds in day care centers serving low-income families of color in Chicago. Eleven centers were randomly assigned to 1 of 4 conditions: (a) parent and teacher training (PT + TT), (b) parent training (PT), (c) teacher training (TT), and (d) waiting list control (C). After controlling for parent stress, PT and PT + TT parents reported higher self-efficacy and less coercive discipline and were observed to have more positive behaviors than C and TT parents. Among toddlers in high-risk behavior problem groups, toddlers in the experimental conditions showed greater improvement than controls. Most effects were retained 1 year later. Benefits were greatest when parents directly received training."
Reference Text 5: Title: Parenting intervention in Sure Start services for children at risk of developing conduct disorder: pragmatic randomised controlled trial. Abstract: To evaluate the effectiveness of a parenting programme as a preventive intervention with parents of preschool children considered to be at risk of developing conduct disorder. Pragmatic randomised controlled trial using a block design with allocation by area. Eleven Sure Start areas in north and mid-Wales. 153 parents from socially disadvantaged areas, with children aged 36-59 months at risk of conduct disorder defined by scoring over the clinical cut off on the Eyberg child behaviour inventory. Participants were randomised on a 2:1 basis, 104 to intervention and 49 to remaining on the wait listing (control). Twenty (13%) were lost to follow-up six months later, 18 from the intervention group. The Webster-Stratton Incredible Years basic parenting programme, a 12 week group based intervention. Problem behaviour in children and parenting skills assessed by self reports from parents and by direct observation in the home. Parents' self reported parenting competence, stress, and depression. Standardised and well validated instruments were used throughout. At follow-up, most of the measures of parenting and problem behaviour in children showed significant improvement in the intervention group. The intention to treat analysis for the primary outcome measure, the Eyberg child behaviour inventory, showed a mean difference between groups of 4.4 points (95% confidence interval 2.0 to 6.9, P<0.001) on the problem scale with an effect size of 0.63, and a mean difference of 25.1 (14.9 to 35.2, P<0.001) on the intensity scale with an effect size of 0.89. This community based study showed the effectiveness of an evidence based parenting intervention delivered with fidelity by regular Sure Start staff. 
It has influenced policy within Wales and provides lessons for England where, to date, Sure Start programmes have not been effective. ISRCTN46984318.
Output for Example 3: "The findings of this review provide some support for the use of group-based parenting programmes to improve the emotional and behavioural adjustment of children with a maximum mean age of three years eleven months. There is, insufficient evidence to reach firm conclusions regarding the role that such programmes might play in the primary prevention of such problems. There are also limited data available concerning the long-term effectiveness of these programmes. Further research is needed."
'''
        
    elif prompt_type == '5shot':
        prompt += f'''
### Examples
Example 1:
Reference Text 1: Title: Placebo-controlled comparison of danazol and high-dose medroxyprogesterone acetate in the treatment of endometriosis after conservative surgery. Abstract: To evaluate the clinical value of postoperative hormone therapy in endometriosis, 60 patients with advanced disease were randomized to receive in a double-blind study danazol (200 mg, 3 times daily), medroxyprogesterone acetate (MPA) (100 mg daily) or placebo post-operatively for 6 months. Treatment efficacy was evaluated clinically and at laparoscopy 6 months after medication. In relation to placebo, danazol and high-dose MPA treatments, which did not differ from each other in efficacy, significantly alleviated pelvic pain. In addition, the peritoneal endometriosis lesions found at 6-months laparoscopy were significantly smaller in the MPA and danazol groups than in the placebo group. Breakthrough bleeding, weight gain and acne complicated danazol treatment but only breakthrough bleeding complicated MPA treatment. These data suggest that postoperative treatment of advanced endometriosis with high-dose MPA or danazol is clinically beneficial.
Reference Text 2: Title: Placebo-controlled comparison of danazol and high-dose medroxyprogesterone acetate in the treatment of endometriosis. Abstract: A prospective, double-blind, placebo-controlled study was designed to evaluate the clinical efficacy and tolerance of danazol and high-dose medroxyprogesterone acetate (MPA) in the treatment of mild-moderate endometriosis. After laparoscopical confirmation of endometriosis, 59 patients were randomized to receive danazol (200 mg 3 times daily), MPA (100 mg daily) or placebo for 6 months. Clinical examinations were done before and 1, 3, 6 and 12 months after the beginning of the study, and a 2nd laparoscopy 6 months after termination of the medication. Eighteen patients in the danazol group, 16 in the MPA group and 17 in the placebo group completed the trial. Total or partial resolution of peritoneal implants was observed in 60% of the patients receiving danazol and in 63% of the patients receiving MPA. In the placebo group, resolution was observed in 18%, while the size of the implants was estimated to be increased in 23% of the patients. In relation to placebo, danazol and MPA significantly alleviated endometriosis-associated pelvic pain, lower back pain and defecation pain, but they did not differ from each other in these actions. The appearance of acne, muscle cramps, edema, weight gain and spotting bleeding complicated MPA treatment. The present results indicate that because of good efficacy and tolerance, high-dose MPA is a useful alternative in the hormonal treatment of endometriosis.
Reference Text 3: Title: Placebo-controlled comparison of hormonal and biochemical effects of danazol and high-dose medroxyprogesterone acetate. Abstract: The hormonal and biochemical effects of danazol (600 mg a day) and high-dose medroxyprogesterone acetate (MPA; 100 mg a day) were studied in a placebo-controlled, 6-month trial. Serum gonadotrophins and prolactin levels did not change during danazol and MPA treatments, whereas oestradiol and progesterone levels decreased significantly in relation to placebo without any difference between danazol and MPA. Both drugs significantly suppressed the sex hormone-binding globulin level (SHBG), and consequently, the free-androgen index (serum total testosterone nmol/l per SHBG nmol/l x 100) as compared with placebo, the effect of danazol being significantly stronger than that of MPA. Danazol, but not MPA, significantly increased serum aspartate aminotransferase (ASAT), alanine aminotransferase (ALAT) and haemoglobin levels, and also thrombocyte counts, whereas MPA, but not danazol, increased the serum concentration of albumin in relation to placebo. Serum total bilirubin, conjugated bilirubin, gamma-glutamyl transferase, creatinine, alkaline phosphatase, sodium and potassium levels and leucocyte counts remained unchanged during both treatments. Danazol and high-dose MPA did not differ from each other in their ovarian and anterior pituitary effects, while the increase in androgenic activity induced by danazol was greater than that achieved with MPA. Danazol also had more biochemical effects than MPA. It interfered with the functions of the liver and the production of thrombocytes and haemoglobin, whereas MPA affected only albumin synthesis/release.
Output for Example 1: Danazol is effective in treating the symptoms and signs of endometriosis. However, its use is limited by the occurrence of androgenic side effects.

Example 2:
Reference Text 1: Title: Four-week nicotine skin patch treatment effects on cognitive performance in Alzheimer's disease. Abstract: Acute nicotine injections have been found to improve attentional performance in patients with Alzheimer's disease (AD), but little is known about chronic nicotine effects. The present study was undertaken to evaluate the clinical and neuropsychological effects of chronic transdermal nicotine in Alzheimer's disease subjects over a 4-week period. The double-blind, placebo controlled, cross-over study consisted of two 4-week periods separated by a 2-week washout period. Patients wore the nicotine patch (Nicotrol) for 16 h a day at the following doses: 5 mg/day during week 1, 10 mg/day during weeks 2 and 3 and 5 mg/day during week 4. The eight subjects had mild to moderate AD and were otherwise healthy. Nicotine significantly improved attentional performance as measured by the Conners' continuous performance test (CPT). There was a significant reduction in errors of omission on the CPT which continued throughout the period of chronic nicotine administration. The variability of hit reaction time (reaction time for correct responses) on the CPT was also significantly reduced by chronic nicotine. Nicotine did not improve performance on other tests measuring motor and memory function. The sustained improvement in attention found in this study with nicotine dermal patches is encouraging. However, the lack of detected effects of nicotine treatment on other cognitive and behavioral domains in this study leaves questions concerning the clinical impact of nicotinic treatment in Alzheimer's disease. The modest size of this study limited statistical power which may have been needed to detect more subtle but clinically significant cognitive effects. Higher doses of nicotine, other nicotinic ligands or combination treatment of nicotine with other therapies may be efficacious for producing broader therapeutic effects.
Output for Example 2: This review is not able to provide any evidence that nicotine is or is not a useful treatment for Alzheimer's disease.

Example 3:
Reference Text 1: Title: Brief psychoeducational parenting program: an evaluation and 1-year follow-up. Abstract: Despite recognition of the need for parenting interventions to prevent childhood behavioral problems, few community programs have been evaluated. This report describes the randomized controlled evaluation of a four-session psychoeducational group for parents of preschoolers with behavior problems, delivered in community agencies. In 1998, 222 primary caregivers, recruited through community ads, filled out questionnaires on parenting practices and child behavior. Parents were randomly assigned to immediate intervention or a wait-list control. The intervention comprised three weekly group sessions and a 1-month booster, the focus being to support effective discipline (using the video 1-2-3 Magic) and to reduce parent-child conflict. Using an intent-to-treat analysis, repeated-measures analyses of variance indicated that the parents who received the intervention reported significantly greater improvement in parenting practices and a significantly greater reduction in child problem behavior than the control group. The gains in positive parenting behaviors were maintained at 1-year follow-up in a subset of the experimental group. This brief intervention program may be a useful first intervention for parents of young children with behavior problems, as it seems both acceptable and reasonably effective.
Reference Text 2: Title: The efficacy of parent training for promoting positive parent-toddler relationships. Abstract: The effectiveness of a parent training program for promoting positive parent-child relationships was examined among families of 2-year-olds. Forty-six mothers and fathers and their toddlers were assigned to either an intervention or comparison group. Intervention group parents participated in a 10-week program that focused on principles for effectively interacting with their toddlers. Parents completed measures of parenting self-efficacy, depression, stress, and perceptions of their toddler's behaviors and were videotaped playing with their toddlers preintervention, postintervention, and 3 months following the intervention. Repeated measures ANOVAs showed that the parent training program led to significant increases in maternal self-efficacy, decreases in maternal stress, and improvements in the quality of mother-toddler interactions. No significant effects were found among fathers. Explanations for obtaining different outcomes for mothers and fathers are discussed and directions for future research are recommended.
Reference Text 3: Title: Evaluating a brief parental-education program for parents of young children. Abstract: The effectiveness of a brief parental-education program for 40 families with very young children was studied. Families were assigned to either a parental-education or waiting-list control group. The parental-education program included information and strategies drawn from developmental and cognitive psychology and social learning theory. Analysis showed that participating parents significantly reduced their use of corporal and verbal punishment, changed their parenting attitudes, and improved their perceptions of their children's behavior in comparison to the control group. Effects were maintained at six weeks follow-up. Results supported tailoring parental-education programs to the unique needs of participants."
Reference Text 4: Title: Parent training of toddlers in day care in low-income urban communities. Abstract: The authors tested a 12-week parent training program with parents (n = 208) and teachers (n = 77) of 2-3-year-olds in day care centers serving low-income families of color in Chicago. Eleven centers were randomly assigned to 1 of 4 conditions: (a) parent and teacher training (PT + TT), (b) parent training (PT), (c) teacher training (TT), and (d) waiting list control (C). After controlling for parent stress, PT and PT + TT parents reported higher self-efficacy and less coercive discipline and were observed to have more positive behaviors than C and TT parents. Among toddlers in high-risk behavior problem groups, toddlers in the experimental conditions showed greater improvement than controls. Most effects were retained 1 year later. Benefits were greatest when parents directly received training."
Reference Text 5: Title: Parenting intervention in Sure Start services for children at risk of developing conduct disorder: pragmatic randomised controlled trial. Abstract: To evaluate the effectiveness of a parenting programme as a preventive intervention with parents of preschool children considered to be at risk of developing conduct disorder. Pragmatic randomised controlled trial using a block design with allocation by area. Eleven Sure Start areas in north and mid-Wales. 153 parents from socially disadvantaged areas, with children aged 36-59 months at risk of conduct disorder defined by scoring over the clinical cut off on the Eyberg child behaviour inventory. Participants were randomised on a 2:1 basis, 104 to intervention and 49 to remaining on the wait listing (control). Twenty (13%) were lost to follow-up six months later, 18 from the intervention group. The Webster-Stratton Incredible Years basic parenting programme, a 12 week group based intervention. Problem behaviour in children and parenting skills assessed by self reports from parents and by direct observation in the home. Parents' self reported parenting competence, stress, and depression. Standardised and well validated instruments were used throughout. At follow-up, most of the measures of parenting and problem behaviour in children showed significant improvement in the intervention group. The intention to treat analysis for the primary outcome measure, the Eyberg child behaviour inventory, showed a mean difference between groups of 4.4 points (95% confidence interval 2.0 to 6.9, P<0.001) on the problem scale with an effect size of 0.63, and a mean difference of 25.1 (14.9 to 35.2, P<0.001) on the intensity scale with an effect size of 0.89. This community based study showed the effectiveness of an evidence based parenting intervention delivered with fidelity by regular Sure Start staff. 
It has influenced policy within Wales and provides lessons for England where, to date, Sure Start programmes have not been effective. ISRCTN46984318.
Output for Example 3: "The findings of this review provide some support for the use of group-based parenting programmes to improve the emotional and behavioural adjustment of children with a maximum mean age of three years eleven months. There is, insufficient evidence to reach firm conclusions regarding the role that such programmes might play in the primary prevention of such problems. There are also limited data available concerning the long-term effectiveness of these programmes. Further research is needed."

Example 4:
Reference Text 1: Title: Effectiveness of adenotonsillectomy in PFAPA syndrome: a randomized study. Abstract: To evaluate whether adenotonsillectomy leads to complete resolution in children with PFAPA (periodic fever, aphthous stomatitis, pharyngitis, and cervical adenitis) syndrome. Thirty-nine children with PFAPA syndrome were randomized to either adenotonsillectomy (surgery group; n = 19) or expectant management (control group; n = 20). All patients were then invited prospectively to record all PFAPA episodes, and were evaluated clinically every 3 months for 18 months after randomization. The proportion of patients experiencing complete resolution was 63% in the surgery group and 5% in the control group (P < .001). The mean (+/- standard deviation) number of episodes recorded during the study period was 0.7 +/- 1.2 in the surgery group and 8.1 +/- 3.9 in the control group (P < .001). The episodes were less severe in the surgery group. Adenotonsillectomy is an effective treatment strategy for children with PFAPA syndrome.
Reference Text 2: Title: "A randomized, controlled trial of tonsillectomy in periodic fever, aphthous stomatitis, pharyngitis, and adenitis syndrome." Abstract: We carried out a prospective, randomized, controlled trial to clarify the effect of tonsillectomy on the clinical course of periodic fever, aphthous stomatitis, pharyngitis, and adenitis (PFAPA) syndrome.                Twenty-six consecutive children (mean age 4.1 years) with at least 5 PFAPA attacks were recruited from 3 tertiary care pediatric hospitals during 1999-2003 and randomly allocated to tonsillectomy or follow-up alone. They were all followed up with symptom diaries for 12 months. Tonsillectomy was allowed after 6 months in the control group if the attacks recurred. Six months after randomization all 14 children in the tonsillectomy group and 6/12 children in the control group (50%) were free of symptoms (difference 50%, 95% confidence interval 23% to 75%, P < .001). Tonsillectomy was performed on 5/6 of the patients in the control group who still had symptoms after 6 months. The remaining unoperated child in the control group had recurrences of the fever episodes throughout the follow-up, but the symptoms became less severe, and the parents did not choose tonsillectomy. Tonsillectomy appeared to be effective for treating PFAPA syndrome. The fever episodes ceased without any intervention in half of the control subjects. We conclude that although the mechanisms behind this syndrome are unknown, tonsillectomy can be offered as an effective intervention for children with PFAPA.
Output for Example 4: "The trials included in this review reported follow up at 18 and six months respectively but it is well-established that children with PFAPA recover spontaneously and treatment can be administered to try and reduce the severity of individual episodes. Therefore, the parents and carers of children with PFAPA must weigh the risks and consequences of surgery (hospitalisation, a predictable period of time postoperatively away from school/nursery, the risks of surgery) against the alternative of a finite period of recurrent episodes of disease at predictable intervals, potentially requiring time off school and the regular use of medication. It is uncertain whether adenoidectomy combined with tonsillectomy adds any additional benefit to tonsillectomy alone."

Example 5:
Reference Text 1: Title: Inspiratory muscle training improves lung function and exercise capacity in adults with cystic fibrosis. Abstract: To investigate the effects of high-intensity inspiratory muscle training (IMT) on inspiratory muscle function (IMF), diaphragm thickness, lung function, physical work capacity (PWC), and psychosocial status in patients with cystic fibrosis (CF). Twenty-nine adult patients with CF were randomly assigned to three groups. Two groups were required to complete an 8-week program of IMT in which the training intensity was set at either 80% of maximal effort (group 1; 9 patients) or 20% of maximal effort (group 2; 10 patients). A third group of patients did not participate in any form of training and acted as a control group (group 3; 10 patients). In all patients, baseline and postintervention measures of IMF were determined by maximal inspiratory pressure (Pimax), and sustained Pimax (SPimax); pulmonary function, body composition, and physical activity status were also determined. In addition, diaphragm thickness was measured at functional residual capacity (FRC) and total lung capacity (TLC) [TDIcont], and the diaphragm thickening ratio (TR) was calculated (TR = thickness during Pimax at FRC/mean thickness at FRC). Subjects also completed an incremental cycle ergometer test to exhaustion and two symptom-related questionnaires, prior to and following training. Following training, significant increases in Pimax and SPimax (p < 0.05), TDIcont (p < 0.05), TR (p < 0.05), vital capacity (p < 0.05), TLC (p < 0.05), and PWC (p < 0.05) were identified, and decreases in anxiety scores (p < 0.05) and depression scores (p < 0.01) were noted in group 1 patients compared to group 3 patients. Group 2 patients significantly improved Pimax and SPimax (both p < 0.05) only with respect to group 3 patients. No significant differences were observed in group 3 patients. 
An 8-week program of high-intensity IMT resulted in significant benefits for CF patients, which included increased IMF and thickness of the diaphragm (during contraction), improved lung volumes, increased PWC, and improved psychosocial status."
Reference Text 2: Title: The effects of inspiratory muscle training in patients with cystic fibrosis. Abstract: This study evaluated whether inspiratory muscle training (IMT) could increase inspiratory muscle strength and endurance and exercise performance in 11 patients with cystic fibrosis (CF) with moderately severe airflow limitation. The IMT consisted of breathing through an inspiratory resistance (IR) for 15 min twice daily for 4 wk, preceded or followed by a 4-wk control period. After IMT, there was an increase in inspiratory muscle strength measured by maximal inspiratory mouth pressure at functional residual capacity: 74 +/- 18 cmH2O before training to 81 +/- 12 cmH2O after training, mean +/- 1 SD, p less than 0.025, and in inspiratory muscle endurance measured by the maximal IR tolerated for 10 min. Limitation of performance in the progressive exercise test was related to increased airflow limitation and possibly to poor nutrition. Mean exercise performance during progressive or submaximal exercise testing did not change after training. It is concluded that this form of IMT improved inspiratory muscle strength and endurance, but had little effect on exercise performance in patients with CF."
Reference Text 3: Title: Inspiratory muscle training in patients with cystic fibrosis. Abstract: Little information is available about the effects of inspiratory muscle training in patients with cystic fibrosis (CF). In this study the effects of inspiratory-threshold loading in patients with CF on strength and endurance of the inspiratory muscles, pulmonary function, exercise capacity, dyspnoea and fatigue were evaluated. Sixteen patients were assigned to one of two groups using the minimization method: eight patients in the training group and eight patients in the control group. The training was performed using an inspiratory-threshold loading device. Patients were instructed to use the threshold trainer 20 min a day, 5 days a week for 6 weeks. Patients in the training group trained at inspiratory threshold loads up to 40% of maximal static inspiratory pressure (Pimax) and patients in the control group got 'sham' training at a load of 10% of Pimax. No significant differences were found among the two groups in gender, age, weight, height, pulmonary function, exercise capacity, inspiratory-muscle strength and inspiratory-muscle endurance before starting the training programme. Mean (SD) age in the control group was 19 (5.5) years, mean (SD) age in the training group was 17 (5.2) years. Mean FEV1 in both groups was 70% predicted, mean inspiratory-muscle strength in both groups was above 100% predicted. All patients except one, assigned to the training group, completed the programme. After 6 weeks of training, mean inspiratory-muscle endurance (% Pimax) in the control group increased from 50% to 54% (P = 0.197); in the training group mean inspiratory muscle endurance (% Pimax) increased from 49% to 66% (P = 0.003). Statistical analysis showed that the change in inspiratory-muscle endurance (% Pimax) in the training group was significantly higher than in the control group (P = 0.012). 
After training, in the training group there was a tendency of improvement in Pimax with an increase from 105 to 123% predicted, which just fell short of statistical significance (P = 0.064). After training no significant differences were found in changes from baseline in pulmonary function, exercise capacity, dyspnoea and fatigue. It is concluded that low-intensity inspiratory-threshold loading at 40% of Pimax was sufficient to elicit an increased inspiratory-muscle endurance in patients with CF."
Reference Text 4: Title: Improved pulmonary function and exercise tolerance with inspiratory muscle conditioning in children with cystic fibrosis. Abstract: This study documented the effect of inspiratory muscle conditioning in children with cystic fibrosis. Subjects, ages 7 to 14 years, were divided into two groups. The experimental group (n = 10) trained at a high pressure load (> or = 29 cm H2O) and the control group (n = 10) trained at a minimal pressure load (< or = 15 cm H2O), using a threshold loading device. Subjects trained 30 min a day for 10 weeks. Pulmonary function, inspiratory muscle strength, and exercise tolerance were measured at the beginning and end of the training period. Pulmonary function was measured by body plethysmography. Inspiratory muscle strength was determined by standard measures of maximal inspiratory pressure against an occluded airway. Exercise tolerance was measured by the length of time subjects could walk on a treadmill. Findings indicated that the experimental group showed significant increases in inspiratory muscle strength, vital capacity, total lung capacity, and exercise tolerance in comparison to the control group."
Output for Example 5: We have not found any evidence to suggest that this treatment is either beneficial or not. We would advise that practitioners evaluate on a case-by-case basis whether or not to employ this therapy. We recommend that future studies make more use of health-related quality of life and exercise tolerance measures; and that there is an agreement upon a single standard measure of classifying the clinical status of the participants.
'''

    # Check if the lengths of titles and abstracts match
    if len(titles) != len(abstracts):
        raise ValueError("The number of titles and abstracts must match.")
    
    # Loop through each title and abstract, appending them to the prompt
    for i, (title, abstract) in enumerate(zip(titles, abstracts), start=1):
        prompt += f'''
### Reference Text {i} 
Title: {title}
Abstract: {abstract}
'''
    
    # Append the final part of the prompt that asks for the output
    prompt += '''
### Output Text: 
'''

    return prompt

In [None]:
import json

def run(GPT, prompt_type, reviewids, titles, abstracts, max_retries=5, retry_delay=1.0):
    """Generate one summary per review and append each result to a JSON-lines file.

    For every (review id, titles, abstracts) triple, a prompt is built with
    ``create_prompt`` and sent to the model through ``get_output``. The answer
    is whitespace-normalised and appended to
    ``./output/RCTsum/GPT-{GPT}/test_{prompt_type}.json`` as one JSON object
    per line: ``{"ReviewID": ..., "Answer": ...}``.

    The output file is opened in append mode and is never truncated here, so
    re-running with the same ``GPT`` / ``prompt_type`` adds lines to whatever
    the file already contains.

    Args:
        GPT: Model selector forwarded to ``get_output``; also part of the
            output directory name.
        prompt_type: Prompt variant forwarded to ``create_prompt``; also part
            of the output file name.
        reviewids: Review identifiers, one per review.
        titles: Per-review titles, passed to ``create_prompt`` as its first
            argument.
        abstracts: Per-review abstracts, passed to ``create_prompt`` as its
            second argument.
        max_retries: Maximum number of model calls attempted per review
            before giving up. Must be at least 1.
        retry_delay: Seconds to wait after the first failed attempt; the wait
            doubles after each further failure.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        RuntimeError: If the model call still fails after ``max_retries``
            attempts for a review. Results written for earlier reviews stay
            on disk.
    """
    import time  # local import: only needed for the retry back-off below

    if max_retries < 1:
        raise ValueError("max_retries must be at least 1.")

    output_path = f'./output/RCTsum/GPT-{GPT}/'
    os.makedirs(output_path, exist_ok=True)
    output_file_path = os.path.join(output_path, f'test_{prompt_type}.json')

    for reviewid, title, abstract in zip(reviewids, titles, abstracts):
        # Build the prompt once, outside the retry loop: a failure here is
        # deterministic, so retrying it would never succeed.
        prompt = create_prompt(title, abstract, prompt_type)

        for attempt in range(1, max_retries + 1):
            try:
                output = get_output(prompt, GPT)
                # Collapse all runs of whitespace (including newlines) so the
                # answer fits on a single JSON line. A non-string result
                # raises here and is treated like any other failed attempt.
                cleaned_output = ' '.join(output.split())
                break
            except Exception as e:
                print(f"ReviewID {reviewid}: attempt {attempt}/{max_retries} failed: {e}")
                if attempt == max_retries:
                    raise RuntimeError(
                        f"Giving up on ReviewID {reviewid} after {max_retries} failed attempts."
                    ) from e
                # Exponential back-off so transient errors are not retried in a tight loop.
                time.sleep(retry_delay * 2 ** (attempt - 1))

        json_string = json.dumps({"ReviewID": reviewid, "Answer": cleaned_output})

        # Append immediately so completed reviews survive a later failure.
        with open(output_file_path, 'a', encoding='utf-8') as f_write:
            f_write.write(json_string + '\n')

In [None]:
# Model and prompt variant used for this generation run.
GPT = 'gemini'
prompt_type = '3shot'

# Hand run() the three per-review columns in the order it expects:
# review ids, titles, abstracts.
run(
    GPT,
    prompt_type,
    *(test_combined_df[column].tolist() for column in ('ReviewID', 'Title', 'Abstract')),
)

### Evaluation (lexical overlap with gold-standard summaries)

In [None]:
import json

# Model and prompt variant whose saved answers are evaluated below.
GPT = 'gemini'
prompt_type = 'base'
# NOTE(review): run() writes under './output/RCTsum/...', while this reads from
# './human evaluation results/RCTsum/...' — presumably the result files were
# copied there by hand; confirm the path before re-running.
output_file_path = f'./human evaluation results/RCTsum/GPT-{GPT}/test_{prompt_type}.json'

# The file holds one JSON object per line; parse them all, then split the
# records into parallel id / answer lists.
with open(output_file_path, 'r', encoding='utf-8') as file:
    samples = [json.loads(line.strip()) for line in file]

test_id2_list = [sample.get('ReviewID') for sample in samples]
test_answer_list = [sample.get('Answer') for sample in samples]

# The answers must line up one-to-one, in the same order, with the merged test set.
assert test_combined_df['ReviewID'].tolist() == test_id2_list

# Gold summaries, in the same order as test_answer_list.
test_target_list = test_combined_df['Target'].tolist()

In [None]:
# print(test_target_list[1])
# print(test_answer_list[1])

In [None]:
from rouge import Rouge

# Scorer shared by every (prediction, gold) pair
rouge = Rouge()

# Per-pair F-measures for ROUGE-1, ROUGE-2 and ROUGE-L, in dataset order
scores_rouge1, scores_rouge2, scores_rougel = [], [], []

for gold_standard, predicted_output in zip(test_target_list, test_answer_list):
    # For a single pair, get_scores returns a one-element list; keep its only entry
    pair_scores = rouge.get_scores(predicted_output, gold_standard, avg=False)[0]

    scores_rouge1.append(pair_scores['rouge-1']['f'])
    scores_rouge2.append(pair_scores['rouge-2']['f'])
    scores_rougel.append(pair_scores['rouge-l']['f'])

# Plain arithmetic mean of each metric over all pairs
avg_rouge1, avg_rouge2, avg_rougel = (
    sum(values) / len(values)
    for values in (scores_rouge1, scores_rouge2, scores_rougel)
)

print(f"Average ROUGE-1 Score: {avg_rouge1}")
print(f"Average ROUGE-2 Score: {avg_rouge2}")
print(f"Average ROUGE-L Score: {avg_rougel}")


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

import numpy as np

In [None]:
# Per-pair BLEU and METEOR scores, in dataset order
bleu_score_list = []
meteor_sc_list = []

for label_text, predicted_text in zip(test_target_list, test_answer_list):
    # BLEU over unigrams and bigrams (equal weights), on whitespace-split tokens.
    # NOTE(review): BLEU uses str.split() while METEOR below uses word_tokenize —
    # confirm the different tokenisation is intended.
    bleu_score = sentence_bleu(
        [label_text.split()],
        predicted_text.split(),
        weights=[0.5, 0.5],
    )

    # METEOR takes pre-tokenised input: a list of references and one hypothesis
    meteor_sc = meteor_score(
        [word_tokenize(label_text)],
        word_tokenize(predicted_text),
    )

    bleu_score_list.append(bleu_score)
    meteor_sc_list.append(meteor_sc)


print(f"Average BLEU Score: {np.mean(bleu_score_list)}")
print(f"Average METEOR Score: {np.mean(meteor_sc_list)}")
