In [None]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime
import re

from tqdm import tqdm
tqdm.pandas()

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load the raw review export; `data` stays untouched as the raw reference frame.
data = pd.read_csv(r'data.csv')
# Work on an explicit copy: pd.DataFrame(data) does not copy DataFrame input by
# default, so `df` would otherwise be a second handle on the same underlying data.
df = data.copy()

In [None]:
# A cell only renders its last expression, so the head/tail/shape lookups have to
# be shown explicitly or they are computed and thrown away.
# `display` is injected into the namespace by the IPython/Jupyter kernel.
display(df.head(3), df.tail(3))
print(df.shape)
df.columns

In [None]:
# summary statistics of the numeric columns (the call parentheses are required;
# a bare `df.describe` only shows the bound method, not the statistics)
df.describe()

In [None]:
# print the per-column summary of the frame (dtypes and non-null counts)
df.info()

In [None]:
# column labels available for the analysis below
df.columns

In [None]:
# report every column whose values never repeat (candidate identifier columns)
unique_columns = [name for name in df.columns if df[name].is_unique]
for name in unique_columns:
    print(f'Unique Column : {name} ')

In [None]:
# reset indexes; drop=True discards the old index instead of inserting it as an
# extra 'index' column (without it, every re-run of this cell adds another column)
df = df.reset_index(drop=True)

Null values

In [None]:
# number of missing values in each column
df.isna().sum()

In [None]:
# discard every row that has at least one missing value
df = df.dropna(axis=0, how='any')
# confirm that no missing values remain in any column
df.isna().sum()

In [None]:
# (rows, columns) remaining after the rows with nulls were dropped
df.shape

Remove duplicate data

In [None]:
# peek at the reviewer names in the raw frame
data['review_profileName'].head(5)

In [None]:
# Order reviews from best to worst overall score, then keep only the first
# (i.e. highest-rated) row for each reviewer/beer pair.
df = (
    df.sort_values(by='review_overall', ascending=False)
      .drop_duplicates(subset=['review_profileName', 'beer_beerId'], keep='first')
)
df.shape

1. Rank top 3 Breweries which produce the strongest beers?

In [None]:

# Average ABV per beer first, so each beer counts once no matter how many
# reviews it received (a row-level mean would weight popular beers more heavily).
beer_abv = df.groupby(['beer_brewerId', 'beer_beerId'])['beer_ABV'].mean()

# then average those per-beer values within each brewery
brewery_avg_abv = beer_abv.groupby(level='beer_brewerId').mean()

# sort breweries by average ABV in descending order and select the top 3
top_3_breweries = brewery_avg_abv.sort_values(ascending=False).head(3)

print("Top 3 Breweries Producing the Strongest Beers:")
print(top_3_breweries)


2. Which year did beers enjoy the highest ratings? 

In [None]:
# review_time holds epoch seconds; parse it once and reuse the parsed series
review_timestamps = pd.to_datetime(df['review_time'], unit='s')
df['review_time'] = review_timestamps

# calendar year in which each review was written
df['year'] = review_timestamps.dt.year

# mean overall rating per year
ratings_per_year = df.groupby('year')['review_overall']
average_ratings_by_year = ratings_per_year.mean()

# label (year) of the largest yearly mean
highest_rated_year = average_ratings_by_year.idxmax()

print("Year with the highest average ratings for beers:", highest_rated_year)

3. Based on the user’s ratings which factors are important among taste, aroma, appearance, and palette?

In [None]:
# Pairwise correlations between the four sub-scores and the overall score
rating_columns = ['review_taste', 'review_aroma', 'review_appearance', 'review_palette', 'review_overall']
correlation_matrix = df[rating_columns].corr()

# Keep only the column describing review_overall, without its self-correlation
overall_column = correlation_matrix['review_overall']
correlations_with_overall = overall_column.drop(labels='review_overall')

# Strongest relationship first
sorted_correlations = correlations_with_overall.sort_values(ascending=False)

print("Correlation between each factor and overall review rating:")
print(sorted_correlations)

So review_aroma has the highest correlation with review_overall, which makes it the most important factor.

4. If you were to recommend 3 beers to your friends based on this data, which ones would you recommend? (TODO: needs editing)

In [None]:
# Custom weights used to fold the five review scores into one number per review.
weights = {'review_overall': 0.4, 'review_taste': 0.2, 'review_aroma': 0.1, 'review_appearance': 0.1, 'review_palette': 0.2}
df['weighted_rating'] = (df[list(weights.keys())] * pd.Series(weights)).sum(axis=1)

# Rank beers, not individual review rows: sorting the raw rows would pick three
# single reviews (possibly of the same beer) rather than three beers.
# Beers with very few reviews are excluded so one enthusiastic reviewer cannot
# decide the ranking; tune the threshold to the size of the dataset.
MIN_REVIEWS_PER_BEER = 5
beer_scores = (
    df.groupby(['beer_beerId', 'beer_name'])['weighted_rating']
      .agg(weighted_rating='mean', review_count='count')
      .reset_index()
)
eligible_beers = beer_scores[beer_scores['review_count'] >= MIN_REVIEWS_PER_BEER]
if eligible_beers.empty:
    # small dataset: fall back to every beer rather than recommending nothing
    eligible_beers = beer_scores

# best average weighted rating first; more reviews wins a tie
recommended_beers = eligible_beers.sort_values(
    by=['weighted_rating', 'review_count'], ascending=False
).head(3)

print("Recommended beers for my friends:")
recommended_beers[['beer_name', 'weighted_rating', 'review_count']]

how the weights were calculated:

Review Overall: This factor represents the overall review rating given by users. Since it reflects the overall satisfaction with the beer, it was assigned the highest weight of 0.4.
Review Taste: Taste is a crucial aspect of beer enjoyment, so it was assigned a weight of 0.2, reflecting its importance in the overall rating.
Review Aroma: Aroma contributes significantly to the sensory experience of drinking beer, but it may be slightly less important than taste. Therefore, it was assigned a weight of 0.1.
Review Appearance: While appearance can influence the initial impression of a beer, its impact on overall enjoyment may be somewhat lower compared to taste. Hence, it was assigned a weight of 0.1, the same as aroma.
Review Palette: Palette, which likely refers to the mouthfeel or texture of the beer, was considered as important as taste and more important than aroma and appearance. Therefore, it was assigned a weight of 0.2.

5. Which Beer style seems to be the favorite based on reviews written by users? 

In [None]:
# Take the relevant columns from the cleaned frame `df` (nulls and duplicate
# reviews already removed). The raw `data` frame still contains missing values,
# and a missing review_text would make the regex preprocessing below raise a
# TypeError.
review_columns = ['beer_beerId','beer_name','beer_ABV','beer_style','review_overall','review_text']
reviewTextData = df[review_columns]

# taking higher ranked reviews only >/=4 (from the overall reviews column)
reviewTextData = reviewTextData.loc[reviewTextData['review_overall'] >= 4]

# Renumber the rows from 0 and detach the result from its parent frame, so that
# columns added later do not trigger SettingWithCopyWarning.
reviewTextData = reviewTextData.reset_index(drop=True).copy()

reviewTextData.head()

In [None]:
# raw text of the review stored under row label 0
reviewTextData.loc[0, 'review_text']

In [None]:
# text preprocessing
import re

# initial text processing replacing short forms
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Irregular contractions ("won't", "can't", "it's") are expanded first, then
    the generic suffixes ("n't", "'re", "'s", "'d", "'ll", "'t", "'ve", "'m").
    Matching is case-insensitive, and the irregular forms keep a leading
    capital ("Won't" -> "Will not"), so capitalised contractions no longer fall
    through to the generic rules and come out as "Wo not" / "Ca not".
    Typographic apostrophes (U+2019) are treated like the ASCII apostrophe.

    Parameters
    ----------
    phrase : str
        Text to expand.

    Returns
    -------
    str
        The text with the contractions expanded.
    """
    # Fold the typographic apostrophe into the ASCII one so that a single set
    # of patterns covers both spellings.
    phrase = phrase.replace("\u2019", "'")

    def keep_case(expansion):
        # Replacement callback: capitalise the expansion when the matched
        # contraction itself starts with an upper-case letter.
        def replace(match):
            return expansion.capitalize() if match.group(0)[0].isupper() else expansion
        return replace

    # specific: irregular forms that the generic suffix rules would mangle
    for pattern, expansion in (
        (r"\bwon't\b", "will not"),
        (r"\bcan't\b", "can not"),
        (r"\bit's\b", "it is"),
    ):
        phrase = re.sub(pattern, keep_case(expansion), phrase, flags=re.IGNORECASE)

    # general: suffix rules, applied in this order
    for pattern, expansion in (
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    ):
        phrase = re.sub(pattern, expansion, phrase, flags=re.IGNORECASE)

    return phrase

In [None]:
# Matches any whitespace-delimited token that contains a digit. Written as a raw
# string so that \S and \d reach the regex engine instead of being parsed as
# (invalid) string escape sequences.
token_with_digit = re.compile(r"\S*\d\S*")

# Expand contractions, drop tokens containing digits and trim the ends of every
# review; tqdm prints the status bar.
preprocessed_reviews = [
    token_with_digit.sub("", decontracted(review)).strip()
    for review in tqdm(reviewTextData['review_text'].values)
]

In [None]:
# first review after preprocessing, for a visual sanity check
preprocessed_reviews[0]

In [None]:
# Attach the preprocessed reviews as a new column. `.assign` returns a fresh
# frame, so nothing is written into a slice of another frame and no
# SettingWithCopyWarning is raised.
reviewTextData = reviewTextData.assign(preprocessed_review_text=preprocessed_reviews)

In [None]:
# VADER sentiment analyzer shared by every review below
sianalyzer = SentimentIntensityAnalyzer()

# Score each preprocessed review and keep the 'compound' entry of the result;
# progress_apply shows a tqdm progress bar while it runs.
cleaned_reviews = reviewTextData['preprocessed_review_text']
reviewTextData['polarity_score2'] = cleaned_reviews.progress_apply(
    lambda review: sianalyzer.polarity_scores(review)['compound']
)

In [None]:
# mean compound polarity of the reviews written for each beer style
reviews_by_style = reviewTextData.groupby('beer_style')
reviewTextDataGroupped = reviews_by_style['polarity_score2'].mean()

# five styles with the highest mean polarity
reviewTextDataGroupped.sort_values(ascending=False).iloc[:5]

In [None]:
# Inspect the reviews behind the two candidate styles. A cell only renders its
# last expression, so each lookup is displayed explicitly, together with the
# number of reviews that stands behind the style's mean polarity.
# `display` is injected into the namespace by the IPython/Jupyter kernel.
for style_name in ('Dortmunder / Export Lager', 'American Blonde Ale'):
    style_reviews = reviewTextData.loc[reviewTextData['beer_style'] == style_name]
    print(f"{style_name}: {len(style_reviews)} review(s)")
    display(style_reviews.head())

By observing the mean compound polarity score, the beer style "Dortmunder / Export Lager" scores highest, but that score comes from a single review. We can instead say that "American Blonde Ale" is the favorite style, based on its combination of high polarity and a larger number of reviews.

6. How does written review compare to overall review score for the beer styles?

By observing the mean compound polarity score calculated above, we can get an idea of how the written review text relates to the overall review score.

7. How to find similar beer drinkers by using written reviews only?   

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# columns currently present on the review-text frame
reviewTextData.columns

In [None]:
# Feature extraction: turn every preprocessed review into a TF-IDF vector
tfidf_vectorizer = TfidfVectorizer()
review_corpus = reviewTextData['preprocessed_review_text']
tfidf_matrix = tfidf_vectorizer.fit_transform(review_corpus)

# Similarity calculation: pairwise cosine similarity between all review vectors
# (a single argument compares the matrix with itself).
# NOTE(review): the result has one row and one column per review, so its size
# grows quadratically with the number of reviews.
cosine_similarities = cosine_similarity(tfidf_matrix)

In [None]:
from sklearn.cluster import KMeans
import numpy as np

In [None]:
# Group similar reviews together with k-means on their similarity profiles.
# random_state fixes the centroid initialisation so the cluster assignment is
# reproducible across runs; n_init is spelled out so the result does not depend
# on the library's version-specific default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(cosine_similarities)

# Collect the members of each cluster.
# NOTE: `user_id` is the row position of a review in reviewTextData (one entry
# per review), not an identifier of the person who wrote it.
user_clusters = {}
for user_id, cluster_id in enumerate(clusters):
    user_clusters.setdefault(cluster_id, []).append(user_id)

# Print the members of each cluster
for cluster_id, users in user_clusters.items():
    print(f"Cluster {cluster_id}: {users}")