In [1]:
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

import requests

import re

import numpy as np

from scipy.stats import norm

from itertools import combinations

from scipy.stats import ttest_ind

from bs4 import BeautifulSoup

pd.set_option('display.max_rows', 50) # выведем больше строк

pd.set_option('display.max_columns', 50) # выведем больше колонок

sns.set_style('darkgrid')

In [2]:
df = pd.read_csv('main_task.csv')

In [3]:
def cat_col_describe(column):
    """Функция для описания числовых столбцов"""
    
    counts = df[column].value_counts(dropna=False)
    percent = df[column].value_counts(dropna=False, normalize=True)
    percent100 = df[column].value_counts(dropna=False, normalize=True).mul(100).round(1).astype(str) + '%'
    display(pd.DataFrame({'counts': counts, 'per': percent, 'per100': percent100}))
    
    print(df[column].describe())

    print('')
    print(f'Всего {len(df[column].value_counts())} значений в столбце', column,\
          f', {len(df[column][df[column].isna()])} пропусков')
    
def cont_col_describe(column):
    """Функция для описания текстовых столбцов"""
    
    counts = df[column].value_counts(dropna=False)
    percent = df[column].value_counts(dropna=False, normalize=True)
    percent100 = df[column].value_counts(dropna=False, normalize=True).mul(100).round(1).astype(str) + '%'
    display(pd.DataFrame({'counts': counts, 'per': percent, 'per100': percent100}))
    
    print(df[column].describe())

    sns.distplot(df[column], bins = len(df[column].value_counts()), fit=norm, kde=False)
    
    print('')
    print(f'Всего {len(df[column].value_counts())} значений в столбце', column,\
          f', {len(df[column][df[column].isna()])} пропусков')

def get_boxplot(column):
    """Функция для построения боксплотов"""
    fig, ax = plt.subplots(figsize = (14, 4))
    sns.boxplot(x=column, y='Rating', 
                data=df.loc[df.loc[:, column].isin(df.loc[:, column].value_counts().index[:20])],
               ax=ax)
    plt.xticks(rotation=45)    
    ax.set_title('Boxplot for ' + column)
    plt.show()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price Range        26114 non-null  object 
 6   Number of Reviews  37457 non-null  float64
 7   Reviews            40000 non-null  object 
 8   URL_TA             40000 non-null  object 
 9   ID_TA              40000 non-null  object 
dtypes: float64(3), object(7)
memory usage: 3.1+ MB


In [5]:
df.head()

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,id_3456,Berlin,,3458.0,5.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price Range        26114 non-null  object 
 6   Number of Reviews  37457 non-null  float64
 7   Reviews            40000 non-null  object 
 8   URL_TA             40000 non-null  object 
 9   ID_TA              40000 non-null  object 
dtypes: float64(3), object(7)
memory usage: 3.1+ MB


In [7]:
url = 'https://www.tripadvisor.ru/Restaurant_Review-g187147-d809100-Reviews-Au_rendez_vous_de_la_Marine-Paris_Ile_de_France.html'
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")

In [8]:
# Количество фотографий
photos = soup.find('span', {"class": "details"}).text
photos = photos.split()[2]
photos = re.sub(r"\(", "", photos)
photos = re.sub(r"\)", "", photos)
photos = int(photos)
photos

108

In [9]:
# Количество ресторанов
restaurants = soup.find('a', {"class": "restaurants-detail-top-info-TopInfo__infoCellLink--2ZRPG"}).text
restaurants = re.sub("\xa0", "", restaurants)
restaurants = restaurants.split()[2]
restaurants = int(restaurants)
restaurants

15239

In [10]:
# Количество слов в названии
words_in_name = soup.title.text.split(',')[0]
words_in_name = int(len(words_in_name.split()))
words_in_name

5

In [11]:
# убираем дубликаты
df = df.drop_duplicates(subset='ID_TA').reset_index()

In [12]:
df['Cuisine Style'].fillna('Other')

0                  ['European', 'French', 'International']
1                                                    Other
2        ['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...
3                                                    Other
4        ['German', 'Central European', 'Vegetarian Fri...
                               ...                        
39975    ['Italian', 'Vegetarian Friendly', 'Vegan Opti...
39976    ['French', 'American', 'Bar', 'European', 'Veg...
39977                                ['Japanese', 'Sushi']
39978    ['Polish', 'European', 'Eastern European', 'Ce...
39979                                          ['Spanish']
Name: Cuisine Style, Length: 39980, dtype: object

In [13]:

df['Cuisine Style'].fillna('Other')

0                  ['European', 'French', 'International']
1                                                    Other
2        ['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...
3                                                    Other
4        ['German', 'Central European', 'Vegetarian Fri...
                               ...                        
39975    ['Italian', 'Vegetarian Friendly', 'Vegan Opti...
39976    ['French', 'American', 'Bar', 'European', 'Veg...
39977                                ['Japanese', 'Sushi']
39978    ['Polish', 'European', 'Eastern European', 'Ce...
39979                                          ['Spanish']
Name: Cuisine Style, Length: 39980, dtype: object

In [14]:
# Превращаем Cuisine Style в список
cuisines = re.sub("\[", "", df['Cuisine Style'][0])
cuisines = re.sub("\]", "", cuisines)
cuisines = cuisines.split("delimiter")

print(cuisines, type(cuisines), sep='\n')

["'European', 'French', 'International'"]
<class 'list'>


In [15]:
# Превращаем Reviews в 2 списка
reviews_and_dates = df['Reviews'][0]
reviews_and_dates = reviews_and_dates.split("], [")
reviews = reviews_and_dates[0]
review_dates = reviews_and_dates[1]

reviews = re.sub("\[", "", reviews)
reviews = re.sub("\]", "", reviews)
reviews = re.sub("\'", "", reviews)
reviews = re.sub(",", "", reviews)
#reviews = reviews.split("delimiter")

review_dates = re.sub("\[", "", review_dates)
review_dates = re.sub("\]", "", review_dates)
review_dates = review_dates.split("delimiter")

print(reviews, type(reviews), sep='\n')
print(review_dates, type(review_dates), sep='\n')

Good food at your doorstep A good hotel restaurant
<class 'str'>
["'12/31/2017', '11/20/2017'"]
<class 'list'>


In [17]:
import itertools
restaurant_name_tokens = pd.Series(
    list(itertools.chain(reviews.split(" ")))
)

In [21]:
import nltk
nltk.download('stopwords')
from tqdm import tqdm
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ennoya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [23]:
top_tokens = restaurant_name_tokens.value_counts()
top_tokens = top_tokens.iloc[
    np.argwhere(top_tokens.index.map(lambda t: str(t).lower() not in stopwords).values).flatten()
]
top_tokens = top_tokens.drop(['-', '&'])

top_tokens.head(20).plot.bar(figsize=(14, 5), fontsize=20)
sns.despine()

KeyError: "['-' '&'] not found in axis"