In [1]:
#imports & load data
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')

#load the translated reviews (path is relative to the notebook's working dir)
data = pd.read_csv('0.translated_test.csv')

#drop spam reviews
#.copy() gives df its own data, so adding a column below writes to a real frame
#and not to a slice of `data` (the pattern behind pandas' SettingWithCopyWarning,
#which the blanket warnings filter above would otherwise silence)
df = data[data['translated messages'] != 'SPAM'].copy()

#clean message col; make lowercase, remove special chars + numbers
#anything that is not a str (e.g. a missing message) becomes an empty string
df['cleaned messages'] = df['translated messages'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x.lower()) if isinstance(x, str) else '')
df.head()

Unnamed: 0,Beer Name,URL,Rating,Review,rating,message,translated messages,cleaned messages
1,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,https://www.ratebeer.com/beer/toppling-goliath...,,,4.5,"Bottle after MBCC 2024. Black colour, malty ar...","Bottle after MBCC 2024. Black colour, malty ar...",bottle after mbcc black colour malty aroma wi...
2,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,https://www.ratebeer.com/beer/toppling-goliath...,,,4.3,Thank you for sharing this Chris - Black with ...,This was real good,this was real good
3,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,https://www.ratebeer.com/beer/toppling-goliath...,,,4.7,"Boxed beer at home, proper glassware. Pitch bl...","Boxed beer at home, proper glassware. Pitch bl...",boxed beer at home proper glassware pitch blac...
4,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,https://www.ratebeer.com/beer/toppling-goliath...,,,4.9,"From backlog. (As 2018 Vintage) 0,3 litre Bott...","From backlog. (As 2018 Vintage) 0,3 litre Bott...",from backlog as vintage litre bottle from a ...
5,Närke Kaggen Stormaktsporter🇸🇪Stout - Imperial,https://www.ratebeer.com/beer/naerke-kaggen-st...,,,4.2,Borefts 2024. A black beer with a beige lacing...,Borefts 2024. A black beer with a beige lacing...,borefts a black beer with a beige lacing arom...


In [2]:
#word freq analysis
#build a bag-of-words model over the cleaned messages, ignoring english stop words
vectorizer = CountVectorizer(stop_words='english')

#term-document matrix: one row per message, one column per vocabulary word
X = vectorizer.fit_transform(df['cleaned messages'])

#collapse the rows so each vocabulary word has a single total count
column_totals = X.sum(axis=0)
word_counts = column_totals.tolist()[0]

#map every vocabulary word to its total count
vocabulary = vectorizer.get_feature_names_out()
word_freq = {word: count for word, count in zip(vocabulary, word_counts)}

#top 10
top_words = Counter(word_freq).most_common(10)
print('Top 10 Words by Frequency:')
top_words

Top 10 Words by Frequency:


[('black', 9),
 ('dark', 8),
 ('taste', 8),
 ('beer', 7),
 ('bottle', 7),
 ('chocolate', 7),
 ('aroma', 6),
 ('bourbon', 6),
 ('coffee', 6),
 ('honey', 6)]

In [3]:
#use the 5 most frequently mentioned words as the attributes
#placeholder only: obviously we'll have to look through the real file to get
#true attributes instead of random top words
ranked_words = Counter(word_freq).most_common(5)
top_attributes = [word for word, _ in ranked_words]
print('Top 5 Attributes:')
top_attributes

Top 5 Attributes:


['black', 'dark', 'taste', 'beer', 'bottle']

In [4]:
#lift analysis
#lift(a, b) = P(a and b) / (P(a) * P(b)), where each probability is the share of
#posts whose tokens contain the word(s)

#get cleaned messages + attributes from word freq
#every entry of input_file is a list of whitespace-separated tokens ([] for non-strings)
input_file = df['cleaned messages'].apply(lambda x: x.split() if isinstance(x, str) else [])
all_items = top_attributes

#init counters
total_posts = len(input_file)
attribute_counts = {item: 0 for item in all_items}
co_occurrences = {pair: 0 for pair in combinations(all_items, 2)}

#count occurrences + co-occurrences
for tokens in input_file:
    #set membership = a post counts at most once per attribute, however often the word repeats
    unique_tokens = set(tokens)

    #individual occurrences (loop over the few attributes, not over every token in the post)
    for item in attribute_counts:
        if item in unique_tokens:
            attribute_counts[item] += 1

    #co-occurrences for attribute pairs
    for attr_a, attr_b in combinations(top_attributes, 2):
        if attr_a in unique_tokens and attr_b in unique_tokens:
            co_occurrences[(attr_a, attr_b)] += 1

#init lift matrix df
#only cells whose row attribute comes before the column attribute in all_items get a lift;
#the diagonal and the lower triangle keep the 0.0 placeholder
lift_matrix = pd.DataFrame(0.0, index=all_items, columns=all_items)

#calc lift for each pair
if total_posts > 0: #no posts -> no probabilities to compute, leave the matrix at 0.0
    for (attr_a, attr_b), co_count in co_occurrences.items():
        p_a = attribute_counts[attr_a] / total_posts
        p_b = attribute_counts[attr_b] / total_posts
        p_a_and_b = co_count / total_posts

        if p_a > 0 and p_b > 0: #avoid division by zero when an attribute never appears
            #combinations() yields pairs in all_items order, so this is always an upper-triangle cell
            lift_matrix.at[attr_a, attr_b] = p_a_and_b / (p_a * p_b)

#actual lift matrix
print('Lift Matrix:')
lift_matrix

Lift Matrix:


Unnamed: 0,black,dark,taste,beer,bottle
black,0.0,0.9,1.285714,1.5,1.071429
dark,0.0,0.0,1.371429,1.2,1.371429
taste,0.0,0.0,0.0,1.714286,1.22449
beer,0.0,0.0,0.0,0.0,0.857143
bottle,0.0,0.0,0.0,0.0,0.0


In [5]:
#in case you wanted to see the lift values dict
#same formula as the lift matrix: P(a and b) / (P(a) * P(b)) per attribute pair
#pairs where either attribute never appears get 0.0 (the same placeholder the matrix uses)
#instead of raising ZeroDivisionError
{
    pair: (
        (co_occurrences[pair] / total_posts)
        / ((attribute_counts[pair[0]] / total_posts) * (attribute_counts[pair[1]] / total_posts))
        if attribute_counts[pair[0]] > 0 and attribute_counts[pair[1]] > 0
        else 0.0
    )
    for pair in co_occurrences
}

{('black', 'dark'): 0.8999999999999999,
 ('black', 'taste'): 1.2857142857142856,
 ('black', 'beer'): 1.5,
 ('black', 'bottle'): 1.0714285714285714,
 ('dark', 'taste'): 1.3714285714285712,
 ('dark', 'beer'): 1.2,
 ('dark', 'bottle'): 1.3714285714285712,
 ('taste', 'beer'): 1.7142857142857142,
 ('taste', 'bottle'): 1.2244897959183672,
 ('beer', 'bottle'): 0.8571428571428571}