# Crawling the dataset

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Collect the detail-page URL of every whiskey / sugarcane spirit listed in
# the paginated "official" search results on distiller.com.
url = "https://distiller.com/search?official_status=official&page="
review_urls = []

# Class strings of the <li> entries to scrape. They never change between
# pages, so they are defined once instead of on every iteration.
item_classes = ['spirit whiskey-content', 'spirit sugarcane-content']

session = requests.Session()

for page in tqdm(range(1, 86)):  # search result pages 1..85
    page_url = url + str(page)

    # Without a timeout a single stalled connection blocks the crawl forever.
    resp = session.get(page_url, timeout=30)
    # Fail with a clear HTTP error instead of parsing an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # find() returns None when the element is absent; report and move on
    # rather than dying with an opaque AttributeError.
    spirit_list = soup.find('ol', class_='spirits')
    if spirit_list is None:
        tqdm.write("No spirit list found on " + page_url + "; skipping page")
        continue

    for entry in spirit_list.find_all('li', class_=item_classes):
        link = entry.find('a')
        # Entries without a usable link cannot be crawled later; skip them.
        if link is not None and link.has_attr('href'):
            review_urls.append("https://distiller.com" + link['href'])


100%|██████████| 85/85 [06:02<00:00,  4.27s/it]


In [3]:
# Walk every spirit's paginated "tastes" listing and record one
# (username, item, rating) triple per review. The three lists are kept
# strictly parallel: index i in each list belongs to the same review.
usernames, items, ratings = [], [], []

for spirit_url in tqdm(review_urls):
    # The last path segment of the spirit URL is used as the item identifier.
    item = spirit_url.split('/')[-1]
    page = 1
    while True:
        review_url = spirit_url + "/tastes?page=" + str(page)
        # Reuse the pooled session from the search crawl and bound the wait
        # so one stalled connection cannot hang the whole run.
        resp = session.get(review_url, timeout=30)
        if not resp.ok:
            # An error page contains no reviews and would look like the end
            # of the listing; make the truncation visible.
            tqdm.write("HTTP " + str(resp.status_code) + " for " + review_url
                       + "; stopping pagination for this item")
            break

        soup = BeautifulSoup(resp.text, 'html.parser')
        reviews = soup.find_all('div', class_='taste-content')

        # An empty page marks the end of this spirit's reviews.
        if not reviews:
            break

        for review in reviews:
            # Parse BOTH fields before appending anything. Appending the
            # username first and then failing on the rating would leave the
            # lists with different lengths.
            try:
                username = review.find('div', class_='name-details') \
                                 .find('a') \
                                 .find('h3') \
                                 .text \
                                 .strip()
                rating = review.find('div', class_='rating-display__value') \
                               .text \
                               .strip()
            except AttributeError:
                # find() returned None for a missing element: skip this
                # incomplete review. Only AttributeError is caught so that
                # KeyboardInterrupt and genuine bugs still surface.
                continue

            usernames.append(username)
            items.append(item)
            ratings.append(rating)

        page += 1


100%|██████████| 581/581 [8:02:41<00:00, 49.85s/it]     


In [7]:
# Assemble the three parallel scrape lists into one frame, one row per review.
df = pd.DataFrame(
    dict(username=usernames, item=items, rating=ratings)
)


In [8]:
# Print the row count, column dtypes and non-null counts of the scraped frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119224 entries, 0 to 119223
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   username  119224 non-null  object
 1   item      119224 non-null  object
 2   rating    119224 non-null  object
dtypes: object(3)
memory usage: 2.7+ MB


## Add columns with user and item ids

In [9]:
# Derive an integer id column from each text column. A fresh encoder is
# fitted per column; after the loop `encoder` holds the fit for 'item'.
for text_col, id_col in (('username', 'user_id'), ('item', 'item_id')):
    encoder = LabelEncoder()
    df[id_col] = encoder.fit_transform(df[text_col])

df.head()

Unnamed: 0,username,item,rating,user_id,item_id
0,whiskyBoyWheatRidge,hibiki-21-year,3.5,45397,245
1,jsowers,hibiki-21-year,5.0,35271,245
2,freakinbluechair,hibiki-21-year,3.75,31836,245
3,Richard-Davenport,hibiki-21-year,5.0,19375,245
4,winstona,hibiki-21-year,4.0,45564,245


In [10]:
# Write the complete (unfiltered) frame to reviews.csv without the row index.
df.to_csv('reviews.csv', index=False)

## Filter the dataset and export the final version

In [11]:
# Drop users with fewer than 10 reviews, renumber both id columns to dense
# ranks starting at 1, sort by (user_id, item_id) and export the result.
reviews_per_user = df.groupby('user_id')['item_id'].count()
sparse_user_ids = reviews_per_user.index[reviews_per_user < 10]
keep_mask = ~df['user_id'].isin(sparse_user_ids)

final_df = (
    df[keep_mask]
    .reset_index(drop=True)
    # Dense ranking closes the gaps left in the ids by the removed users.
    .assign(
        user_id=lambda frame: frame['user_id'].rank(method='dense').astype(int),
        item_id=lambda frame: frame['item_id'].rank(method='dense').astype(int),
    )
    .sort_values(['user_id', 'item_id'])
)

final_df.to_csv('reviews_final.csv', index=False)

In [12]:
# Print the row count, column dtypes and non-null counts of the filtered frame.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31783 entries, 27184 to 3714
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   username  31783 non-null  object
 1   item      31783 non-null  object
 2   rating    31783 non-null  object
 3   user_id   31783 non-null  int64 
 4   item_id   31783 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.5+ MB


In [13]:
# Display the filtered frame as the cell output.
final_df

Unnamed: 0,username,item,rating,user_id,item_id
27184,00PJ00,ardbeg-corryvreckan,5.0,1,19
27901,00PJ00,ardbeg-uigeadail,4.0,1,22
23516,00PJ00,benromach-peat-smoke-2005,4.0,1,55
30281,00PJ00,elijah-craig-12,4.25,1,155
23001,00PJ00,gooderham-worts-four-grain-canadian-whisky,4.75,1,217
...,...,...,...,...,...
10186,zx636jb,knob-creek-limited-edition-2001,5.0,1713,275
10190,zx636jb,knob-creek-limited-edition-2001,4.25,1713,275
24386,zx636jb,w-l-weller-12,5.0,1713,484
1136,zx636jb,william-larue-weller-bourbon-fall-2015,5.0,1713,502
