In [84]:
import pandas as pd
from collections import Counter
import re
import nltk 
from nltk.corpus import stopwords
from predictionguard import PredictionGuard
import os
import itertools

In [10]:
# Input File Name
inputfile = 'translated_messages.csv'

# Translated Column Messages
messagecolumn = 'translated_messages'

# Import Data
data = pd.read_csv(inputfile)
data = data.drop(['Rating', 'Review'], axis = 1)

In [58]:
# Join All Messages
all_messages = ' '.join(data[messagecolumn].tolist())

# Find All Words
words = re.findall(r'\b\w+\b', all_messages.lower())

# Remove Stop Words
stop_words = set(stopwords.words('english'))
filtered = [word for word in words if word not in stop_words]

# Count
counts = Counter(filtered)
countsdf = pd.DataFrame(counts.items(), columns = ['words', 'frequency'])


In [67]:
# Set API key
api_key = os.getenv("PREDICTIONGUARD_API_KEY", "Oq62vYfSJRwjnFQcUnJy5PM3SRVejYtJCXWSxnfv")

# Initialize the PredictionGuard client
client = PredictionGuard(api_key=api_key)

# System Behavior
system_message = {
    "role": "system",
    "content": (
        "You are a beer enthusiast. Your task is to look through words and pick which of them you'd consider to be an attribute of beer. \n"
        "Do not provide any explanations or contextual information. Only return the word if it's an attribute of beer."
    )
}

# Only consider words that are present more than 100 times
countsdf_100 = countsdf[countsdf['frequency'] > 100]

def process_message(row):
    try:
        user_message = row['word']
        print(user_message)
        # Prepare the messages list for the chatbot
        messages = [
            system_message,
            {
                "role": "user",
                "content": f"{user_message}"
            }
        ]
        
        # Send the message to the PredictionGuard API
        result = client.chat.completions.create(
            model="Hermes-3-Llama-3.1-8B",
            messages=messages
        )
        
        # Extract the chatbot's response
        response = result['choices'][0]['message']['content']
        print(result)
        return response
    except Exception as e:
        return f"Error: {str(e)}"
    
response = []
response = countsdf_100.apply(lambda row: process_message(row), axis=1)




In [None]:
# Set API key
api_key = os.getenv("PREDICTIONGUARD_API_KEY", "Oq62vYfSJRwjnFQcUnJy5PM3SRVejYtJCXWSxnfv")

# Check if the API key is being set correctly
print(f"API Key: {api_key}")

# Initialize the PredictionGuard client
client = PredictionGuard(api_key=api_key)

# System Behavior
system_message = {
    "role": "system",
    "content": (
        "You are a beer enthusiast. Your task is to look through words and pick which of them you'd consider to be an attribute of beer. \n"
        "Do not provide any explanations or contextual information. Only return the word if it's an attribute of beer."
    )
}

# Only consider words that are present more than 100 times
countsdf_100 = countsdf[countsdf['frequency'] > 100]

# Define the list to store words that are considered attributes
attributes = []

def process_message(row):
    try:
        user_message = row['words']

        # Prepare the messages list for the chatbot
        messages = [
            system_message,
            {
                "role": "user",
                "content": f"{user_message}"
            }
        ]

        # Send the message to the PredictionGuard API
        result = client.chat.completions.create(
            model="Hermes-3-Llama-3.1-8B",
            messages=messages
        )

        # Extract the chatbot's response
        response = result['choices'][0]['message']['content'].strip().lower()

        # Check if the response is 'yes' and add to attributes list
        if response == 'yes':
            attributes.append(user_message)
        
        return response
    except Exception as e:
        print(f"Error processing word {user_message}: {str(e)}")
        return f"Error: {str(e)}"
    
# Apply the process_message function to each row
countsdf_100['response'] = countsdf_100.apply(lambda row: process_message(row), axis=1)

# Debug print the list of attributes
print("Attributes considered as beer-related:", attributes)


In [82]:
# Top 20 Attributes
attributes_20 = attributes[:20]

In [98]:
# Lift Analysis of Attributes

# Initializations
attribute_counter = Counter()
co_occurrence_counter = Counter()
lift_results = []
total_messages = len(data)

# Function to find attributes in a message
def find_attributes(message, attribute_set):
    words = re.findall(r'\w+', message.lower())

    # Filter the replaced words for the list
    filtered_attributes = set([word for word in words if word not in stop_words and word in attribute_set])

    return filtered_attributes

# Functino to find co-occurences 
def find_co_occurrences(message, attributes_20, distance):
    words = message.split()
    found_attributes = []
    
    for i, word in enumerate(words):
        if word in attributes_20:
            found_attributes.append((word, i)) 
    
    co_occurrences = set()
    for (attribute1, idx1), (attribute2, idx2) in itertools.combinations(found_attributes, 2):
        if abs(idx1 - idx2) <= distance: 
            co_occurrences.add(tuple(sorted((attribute1, attribute2)))) 
    return co_occurrences

def calculate_lift(attribute1, attribute2, attribute_counter, co_occurrence_counter, total_messages):
    P_A = attribute_counter[attribute1] / total_messages 
    P_B = attribute_counter[attribute2] / total_messages  
    
    # Combine counts for both (brand1, brand2) and (brand2, brand1)
    P_AB = (co_occurrence_counter[(attribute1, attribute2)] + co_occurrence_counter[(attribute2, attribute1)]) / total_messages #if (brand1, brand2) in co_occurrence_counter or (brand2, brand1) in co_occurrence_counter else 0
    
    if P_A * P_B == 0: 
        return 0
    return P_AB / (P_A * P_B)

# Loop through all messages to update counters
for message in data[messagecolumn]:
    filtered_attributes = find_attributes(message, attributes_20)
    
    # Update brand counter with the filtered brands
    attribute_counter.update(filtered_attributes)
    
    # Now find co-occurrences using the replaced message
    co_occurrences = find_co_occurrences(message, attributes_20, distance=10e6)
    co_occurrence_counter.update(co_occurrences)

# Calculate lifts
for (attribute1, attribute2) in itertools.combinations(attributes_20, 2):
    lift = calculate_lift(attribute1, attribute2, attribute_counter, co_occurrence_counter, total_messages)
    lift_results.append((attribute1, attribute2, lift))

# Create Lift Dataframe
lift_df = pd.DataFrame(lift_results, columns=['Attribute1', 'Attribute2', 'Lift'])

# Create Lift Matrix
lift_matrix = lift_df.pivot(index='Attribute1', columns='Attribute2', values='Lift')
lift_matrix = lift_matrix.combine_first(lift_matrix.T)
lift_matrix.fillna(0, inplace=True)

# Print Lift Matrix
print(lift_matrix)

              aroma   balance  balanced    barrel      beer    bodied  \
aroma      0.000000  0.354258  0.236642  0.290704  0.264807  0.151313   
balance    0.354258  0.000000  0.554760  0.719619  0.573867  0.367685   
balanced   0.236642  0.554760  0.000000  0.523259  0.497568  0.296343   
barrel     0.290704  0.719619  0.523259  0.000000  0.609843  0.243495   
beer       0.264807  0.573867  0.497568  0.609843  0.000000  0.208151   
bodied     0.151313  0.367685  0.296343  0.243495  0.208151  0.000000   
bottle     0.121661  0.255030  0.215697  0.303513  0.238001  0.123220   
brown      0.317886  0.586688  0.475355  0.710690  0.483532  0.341422   
caramel    0.140644  0.336322  0.309968  0.398645  0.295979  0.152764   
colour     0.234351  0.271367  0.307819  0.237529  0.265351  0.280255   
dark       0.331042  0.587629  0.505983  0.762764  0.493112  0.346348   
flavours   0.189706  0.507986  0.460978  0.666964  0.357642  0.116583   
malt       0.206353  0.544063  0.395360  0.511566  