In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Empty frame with the three review columns; rows are added by a later cell.
review_columns = ['username', 'rum', 'rating']
df = pd.DataFrame(columns=review_columns)

In [3]:
# Accumulator for spirit page URLs; starts empty.
review_urls = list()

# Search endpoint prefix; a page number is concatenated onto the end of this string.
base_url = "https://distiller.com/search?official_status=official&page="

In [4]:
from tqdm import tqdm

# Walk the paginated search listing and collect the detail-page URL of every
# <li class="spirit sugarcane-content"> entry into `review_urls`.
for page in tqdm(range(1, 300)):  # pages 1..299; the upper bound is hard-coded
    url = base_url + str(page)
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report and move on rather than aborting a long-running crawl.
        tqdm.write(f"Skipping search page {page}: {exc}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    listing = soup.find('ol', class_='spirits')
    if listing is None:
        # find() returns None when the list is missing; calling .find_all()
        # on that would raise AttributeError.
        tqdm.write(f"No spirits list found on search page {page}; skipping")
        continue

    rums = listing.find_all('li', class_='spirit sugarcane-content')
    for rum in rums:
        anchor = rum.find('a')
        link = anchor.get('href') if anchor is not None else None
        if not link:
            # Entry without a usable link: nothing to crawl for it.
            continue
        spirit_url = "https://distiller.com" + link
        # Skip URLs already collected so that re-running this cell (or a
        # spirit listed on two pages) does not produce duplicates.
        if spirit_url not in review_urls:
            review_urls.append(spirit_url)


100%|██████████| 299/299 [24:29<00:00,  4.92s/it] 


In [5]:
import warnings

warnings.filterwarnings("ignore")

In [7]:
# Crawl every "tastes" page of each collected spirit URL and gather one record
# per review (username, rum slug, rating text).
#
# Records are accumulated in a plain list and turned into a DataFrame once at
# the end: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
# and growing a frame row by row is quadratic. If the loop is interrupted,
# `review_records` still holds everything scraped so far.
review_records = []

for url in tqdm(review_urls):
    rum_name = url.split('/')[-1]  # last path segment of the spirit URL
    tastes_url = url + "/tastes?page="
    page_number = 1
    while True:
        review_url = tastes_url + str(page_number)
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(review_url, timeout=30)
        except requests.RequestException as exc:
            # Report the failure and move on to the next spirit.
            tqdm.write(f"Request failed for {review_url}: {exc}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        reviews = soup.find_all('div', class_='taste-content')

        # A page without any review blocks ends the pagination for this spirit.
        if not reviews:
            break

        for review in reviews:
            try:
                username = review.find('div', class_='name-details').find('a').find('h3').text.strip()
                rating = review.find('div', class_='rating-display__value').text.strip()
            except AttributeError:
                # find() returned None for one of the expected elements
                # (e.g. a review without a rating); skip just this review.
                continue

            review_records.append({'username': username, 'rum': rum_name, 'rating': rating})

        page_number += 1

# Build the frame in one step and add it to whatever `df` already contains.
new_reviews = pd.DataFrame(review_records, columns=['username', 'rum', 'rating'])
df = new_reviews if df.empty else pd.concat([df, new_reviews], ignore_index=True)


100%|██████████| 230/230 [42:15<00:00, 11.02s/it]  


In [None]:
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column. Re-fitting a single encoder for the second column
# overwrites its classes_, after which usernameID values can no longer be
# mapped back to usernames with inverse_transform.
username_encoder = LabelEncoder()
rum_encoder = LabelEncoder()

# Integer id per distinct username / rum slug.
df['usernameID'] = username_encoder.fit_transform(df['username'])
df['rumID'] = rum_encoder.fit_transform(df['rum'])

# Backward compatibility: `encoder` previously ended up fitted on the rum column.
encoder = rum_encoder

In [None]:
# Write the review frame to reviews.csv in the working directory, without the index column.
df.to_csv(path_or_buf='reviews.csv', index=False)

In [None]:
# Print a summary of the review frame: row count, columns, non-null counts, dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11893 entries, 0 to 11892
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   username    11893 non-null  object
 1   rum         11893 non-null  object
 2   rating      11893 non-null  object
 3   usernameID  11893 non-null  int64 
 4   rumID       11893 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 464.7+ KB


In [None]:
# Reload the saved reviews from disk.
df = pd.read_csv('reviews.csv')

# Number of non-null rumID entries (i.e. review rows) per usernameID.
rum_counts = df.groupby('usernameID')['rumID'].count()

# Users with fewer than 10 reviews are removed.
usernames_to_delete = rum_counts.loc[rum_counts.lt(10)].index
keep_mask = ~df['usernameID'].isin(usernames_to_delete)
df = df.loc[keep_mask].reset_index(drop=True)

# Re-number both id columns densely (1, 2, 3, ...) so the ids of the removed
# users leave no gaps.
for id_col in ('usernameID', 'rumID'):
    df[id_col] = df[id_col].rank(method='dense').astype(int)

# Order by user, then rum, and write the filtered data set.
df = df.sort_values(['usernameID', 'rumID'])
df.to_csv('reviews_final.csv', index=False)

In [8]:
# Display the review frame.
# NOTE(review): the saved output below has an "Unnamed: 0" column and no
# usernameID/rumID columns, so it appears to come from a different kernel
# state than the cells around it — confirm by re-running from a fresh kernel.
df

Unnamed: 0,username,rum,rating
0,worldwhiskies95,capovilla-liberation-pmg-rhum,5.0
1,EpicureMTL,capovilla-liberation-pmg-rhum,5.0
2,idetrinidad,capovilla-liberation-pmg-rhum,4.0
3,Gronqvist,capovilla-liberation-pmg-rhum,4.0
4,Eric-Mertens,capovilla-liberation-pmg-rhum,4.0
...,...,...,...
12242,eric24broncos,selvarey-cacao,4.0
12243,taylorchambers,selvarey-cacao,4.75
12244,JackieChen,selvarey-cacao,4.0
12245,whiskey_whisky_,selvarey-cacao,4.0


In [None]:
import pandas as pd

# Count the review rows (non-null rumID values) recorded for each usernameID.
reviews_per_user = df.groupby('usernameID')['rumID'].count()

# Same counts as a two-column frame: usernameID, num_reviews.
user_reaction_count = reviews_per_user.rename('num_reviews').reset_index()

# Most active reviewers first.
sorted_df = user_reaction_count.sort_values('num_reviews', ascending=False)


In [None]:
# Show the first 500 rows of the per-user review counts.
sorted_df.head(500)

Unnamed: 0,usernameID,num_reviews
979,979,50
3602,3602,41
657,657,38
1607,1607,37
4291,4291,36
...,...,...
4615,4615,4
1731,1731,4
1678,1678,4
714,714,4


In [None]:

# Collect, for every username, the list of rum slugs that appear in that
# user's review rows.
rums_per_user = df.groupby('username')['rum'].agg(list)

# Flatten to a two-column frame: username, liked_items.
liked_items_by_user = rums_per_user.reset_index(name='liked_items')


In [None]:
# Display the per-user lists of rums.
# NOTE(review): the rendered output contains site usernames; consider clearing
# it before sharing the notebook.
liked_items_by_user

Unnamed: 0,username,liked_items
0,---------1359,[ron-zacapa-centenario-sistema-solera-23-rum]
1,--------1946,"[chairman-s-reserve-the-forgotten-casks-rum, b..."
2,--------374,"[ron-diplomatico-reserva-exclusiva-rum, ron-za..."
3,--------476,[arcane-extraroma-amber-rum]
4,-------537,[ron-diplomatico-reserva-exclusiva-rum]
...,...,...
5661,zombijosue,[ron-zacapa-centenario-sistema-solera-23-rum]
5662,zvanwink,"[ron-diplomatico-reserva-exclusiva-rum, papa-s..."
5663,zwdorworth,[ron-diplomatico-reserva-exclusiva-rum]
5664,zweird,"[ron-diplomatico-reserva-exclusiva-rum, planta..."
