Import packages

In [None]:
!pip install openai
import openai
import pandas as pd
import os
import re
from tenacity import retry, wait_random_exponential, stop_after_attempt
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# GPT-3.5 evaluation

### Load data and get overview

In [3]:
# mount to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# load the test set data
df = pd.read_csv('Handin/Data/df_test_set_sample.csv')

In [5]:
print(df.head())

                                             reviews  sentiment
0  an amazing find this apartment was located in ...          1
1  our group of stayed at ics house for a weekend...          1
2  we enjoyed our stay at   apartment it really i...          1
3     was wonderful super tidy and a very welcomi...          1
4  perfect it was a lovely stay henrique was very...          1


In [6]:
print(df.shape)

(1000, 2)


In [33]:
# Directory to save the csv after each iteration, slow but secure
DIR_PATH = "/content/drive/MyDrive/NLP/Notebooks/Old"
CSV_FILE = os.path.join(DIR_PATH, 'gpt_responses.csv')

# Function to get the last saved index since the API is no stable
def get_last_saved_index():
    if os.path.isfile(CSV_FILE):
        df = pd.read_csv(CSV_FILE)
        return df.shape[0]
    return 0

# Function to process reviews
def process_with_retry():
    openai.api_key = "FILL IN AN VALID OPENAI KEY"

    # Get the index of the last saved file
    last_index = get_last_saved_index()
    if last_index > 0:
        df_gpt_response = pd.read_csv(CSV_FILE)
    else:
        df_gpt_response = pd.DataFrame(columns=['review_comment', 'predicted_rating', 'actual_rating'])

    # Iterate over each review comment in sample_df with a progress bar
    for i, (comment, actual_rating) in enumerate(tqdm(zip(df['reviews'][last_index:], df['sentiment'][last_index:]), desc="Processing reviews", unit="review")):
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=f"""Given the following Airbnb review, predict the reviewer's rating as 1 or 0,
            where 1 is positive and 0 is the negative. Provide your answer as only an integer. 
            Here is the review: '{comment}' """,
            temperature=0,
            max_tokens=60,
            logprobs=10,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=0.0
        )

        try:
            # Extract the predicted rating from the response
            predicted_rating = int(response.choices[0].text.strip())

            # Append the review comment, predicted rating, and actual rating to the df_gpt_response DataFrame
            df_gpt_response = df_gpt_response.append({'review_comment': comment, 'predicted_rating': predicted_rating, 'actual_rating': actual_rating},
                                                     ignore_index=True)
            
            # Save DataFrame every iteration
            df_gpt_response.to_csv(CSV_FILE, index=False)
                
        except ValueError:
            # Handle cases where the predicted rating cannot be converted to an integer
            print(f"Error: Invalid response for comment - '{comment}'")
    
    return df_gpt_response

# Call the process_with_retry function and assign the result to a variable
result_df = process_with_retry()

# Access the df_gpt_response DataFrame
print(result_df)


  df_gpt_response = df_gpt_response.append({'review_comment': comment, 'predicted_rating': predicted_rating, 'actual_rating': actual_rating},
  df_gpt_response = df_gpt_response.append({'review_comment': comment, 'predicted_rating': predicted_rating, 'actual_rating': actual_rating},
  df_gpt_response = df_gpt_response.append({'review_comment': comment, 'predicted_rating': predicted_rating, 'actual_rating': actual_rating},
  df_gpt_response = df_gpt_response.append({'review_comment': comment, 'predicted_rating': predicted_rating, 'actual_rating': actual_rating},
  df_gpt_response = df_gpt_response.append({'review_comment': comment, 'predicted_rating': predicted_rating, 'actual_rating': actual_rating},
  df_gpt_response = df_gpt_response.append({'review_comment': comment, 'predicted_rating': predicted_rating, 'actual_rating': actual_rating},
Processing reviews: 6review [00:04,  1.36review/s]

                                        review_comment  predicted_rating  \
0    an amazing find this apartment was located in ...                 1   
1    our group of stayed at ics house for a weekend...                 1   
2    we enjoyed our stay at   apartment it really i...                 1   
3       was wonderful super tidy and a very welcomi...                 1   
4    perfect it was a lovely stay henrique was very...                 1   
..                                                 ...               ...   
995  if you re looking for somewhere close to the a...                 0   
996  first the positives quiet convenient neighborh...                 0   
997  dear all i will not recommend at all you to be...                 0   
998  horrendous cancellation policy i cancelled jus...                 0   
999  had the bad experience of finding out the plac...                 0   

     actual_rating  
0                1  
1                1  
2                1  
3  




### Get evaluation metrics



In [36]:
from sklearn.metrics import accuracy_score, classification_report

# Read the DataFrame
df_gpt_response = pd.read_csv("/Handin/Data/gpt_responses.csv")

# Extract the predicted and actual ratings
predicted_ratings = df_gpt_response['predicted_rating']
actual_ratings = df_gpt_response['actual_rating']

# Calculate the overall accuracy
accuracy = accuracy_score(actual_ratings, predicted_ratings)
print(f"Overall Accuracy: {accuracy}")

# Calculate precision, recall and F1-score for each category
classification_metrics = classification_report(actual_ratings, predicted_ratings, digits = 5)
print(classification_metrics)


Overall Accuracy: 0.988
              precision    recall  f1-score   support

           0    0.75000   0.90000   0.81818        30
           1    0.99689   0.99072   0.99380       970

    accuracy                        0.98800      1000
   macro avg    0.87344   0.94536   0.90599      1000
weighted avg    0.98948   0.98800   0.98853      1000



### Get probabilities of the predicted tokens

In [14]:
openai.api_key = "FILL IN AN VALID OPENAI KEY"
# Make API request
FP = openai.Completion.create(
  model="text-davinci-003",
  prompt="""Given the following Airbnb review, predict the reviewer's rating as 1 or 0,
            where 1 is positive and 0 is the negative. Provide your answer as only an integer. 
            Here is the review: 'The flat was good, host was bad and location great' """,
  temperature=0,
  max_tokens=60,
  logprobs=10,
  top_p=1.0,
  frequency_penalty=0.0,
  presence_penalty=0.0
)

import math
# Remove spaces and all non numeric characters
def clean_number(x):
    x = re.sub("[^0-9]", "", x)
    return x

token_probas = FP["choices"][0]["logprobs"]["top_logprobs"]
probabilities = {str(k):0 for k in range(0,2)}

for token_prob in token_probas:
    for keys, probas in token_prob.items():
        num = clean_number(keys)
        if num in ["0","1"]:
            probabilities[num] += math.exp(probas)

# Normalize
total = sum(probabilities.values())
probabilities = {k: v / total for k, v in probabilities.items()}

print(probabilities)

{'0': 0.3112905724848001, '1': 0.6887094275151998}
