In [1]:
import numpy as np
import pandas as pd
import re

# Visualization
from termcolor import colored
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
import plotly
import plotly.express as px

# Import geo services
import geocoder
import geopy
from geopy import distance
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

# Загружаем специальный удобный инструмент для разделения датасета:
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk("/Data"):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        

RANDOM_SEED = 42  # фиксируем RANDOM_SEED для воспроизводимости



In [None]:
!pip freeze > requirements.txt  # фиксируем версию пакетов

In [64]:
import chardet.universaldetector
detector = chardet.UniversalDetector()
with open(DATA_DIR+"/hotels_train.csv", "rb") as fh:
    for line in fh:
        detector.feed(line)
        if detector.done:
            break
detector.close()

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}

In [2]:
# Loading data

DATA_DIR = "Data/"
df_train = pd.read_csv(DATA_DIR+"/hotels_train.csv", encoding="ascii") # датасет для обучения
df_test = pd.read_csv(DATA_DIR+"hotels_test.csv", encoding="ascii") # датасет для предсказания
sample_submission = pd.read_csv(DATA_DIR+"/submission.csv", encoding="ascii") # самбмишн

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Data columns (total 17 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               386803 non-null  object 
 1   additional_number_of_scoring                386803 non-null  int64  
 2   review_date                                 386803 non-null  object 
 3   average_score                               386803 non-null  float64
 4   hotel_name                                  386803 non-null  object 
 5   reviewer_nationality                        386803 non-null  object 
 6   negative_review                             386803 non-null  object 
 7   review_total_negative_word_counts           386803 non-null  int64  
 8   total_number_of_reviews                     386803 non-null  int64  
 9   positive_review                             386803 non-null  object 
 

In [4]:
df_train.head(2)

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng
0,Stratton Street Mayfair Westminster Borough Lo...,581,2/19/2016,8.4,The May Fair Hotel,United Kingdom,Leaving,3,1994,Staff were amazing,4,7,10.0,"[' Leisure trip ', ' Couple ', ' Studio Suite ...",531 day,51.507894,-0.143671
1,130 134 Southampton Row Camden London WC1B 5AF...,299,1/12/2017,8.3,Mercure London Bloomsbury Hotel,United Kingdom,poor breakfast,3,1361,location,2,14,6.3,"[' Business trip ', ' Couple ', ' Standard Dou...",203 day,51.521009,-0.123097


In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128935 entries, 0 to 128934
Data columns (total 16 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               128935 non-null  object 
 1   additional_number_of_scoring                128935 non-null  int64  
 2   review_date                                 128935 non-null  object 
 3   average_score                               128935 non-null  float64
 4   hotel_name                                  128935 non-null  object 
 5   reviewer_nationality                        128935 non-null  object 
 6   negative_review                             128935 non-null  object 
 7   review_total_negative_word_counts           128935 non-null  int64  
 8   total_number_of_reviews                     128935 non-null  int64  
 9   positive_review                             128935 non-null  object 
 

In [6]:
df_test.head(2)

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,tags,days_since_review,lat,lng
0,Via Senigallia 6 20161 Milan Italy,904,7/21/2017,8.1,Hotel Da Vinci,United Kingdom,Would have appreciated a shop in the hotel th...,52,16670,Hotel was great clean friendly staff free bre...,62,1,"[' Leisure trip ', ' Couple ', ' Double Room '...",13 days,45.533137,9.171102
1,Arlandaweg 10 Westpoort 1043 EW Amsterdam Neth...,612,12/12/2016,8.6,Urban Lodge Hotel,Belgium,No tissue paper box was present at the room,10,5018,No Positive,0,7,"[' Leisure trip ', ' Group ', ' Triple Room ',...",234 day,52.385649,4.834443


In [7]:
sample_submission.head(2)

Unnamed: 0,reviewer_score,id
0,1,488440
1,10,274649


In [8]:
sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128935 entries, 0 to 128934
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   reviewer_score  128935 non-null  int64
 1   id              128935 non-null  int64
dtypes: int64(2)
memory usage: 2.0 MB


In [9]:
# Join train and test data for feature engineering

df_train["sample"] = 1 # train mark
df_test["sample"] = 0 # test mark
df_test["reviewer_score"] = 0 # we have to predict 'reviewer_score', so now it would be filled with 0 in test data

data = pd.concat([df_train, df_test], sort=False).reset_index(drop=True) # join

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515738 entries, 0 to 515737
Data columns (total 18 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               515738 non-null  object 
 1   additional_number_of_scoring                515738 non-null  int64  
 2   review_date                                 515738 non-null  object 
 3   average_score                               515738 non-null  float64
 4   hotel_name                                  515738 non-null  object 
 5   reviewer_nationality                        515738 non-null  object 
 6   negative_review                             515738 non-null  object 
 7   review_total_negative_word_counts           515738 non-null  int64  
 8   total_number_of_reviews                     515738 non-null  int64  
 9   positive_review                             515738 non-null  object 
 

In [11]:
data.describe()

Unnamed: 0,additional_number_of_scoring,average_score,review_total_negative_word_counts,total_number_of_reviews,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,lat,lng,sample
count,515738.0,515738.0,515738.0,515738.0,515738.0,515738.0,515738.0,512470.0,512470.0,515738.0
mean,498.081836,8.397487,18.53945,2743.743944,17.776458,7.166001,6.297672,49.442439,2.823803,0.749999
std,500.538467,0.548048,29.690831,2317.464868,21.804185,11.040228,3.902295,3.466325,4.579425,0.433014
min,1.0,5.2,0.0,43.0,0.0,1.0,0.0,41.328376,-0.369758,0.0
25%,169.0,8.1,2.0,1161.0,5.0,1.0,0.625,48.214662,-0.143372,0.25
50%,341.0,8.4,9.0,2134.0,11.0,3.0,7.9,51.499981,0.010607,1.0
75%,660.0,8.8,23.0,3613.0,22.0,8.0,9.6,51.516288,4.834443,1.0
max,2682.0,9.8,408.0,16670.0,395.0,355.0,10.0,52.400181,16.429233,1.0


In [12]:
data.describe(include="object")

Unnamed: 0,hotel_address,review_date,hotel_name,reviewer_nationality,negative_review,positive_review,tags,days_since_review
count,515738,515738,515738,515738,515738,515738,515738,515738
unique,1493,731,1492,227,330011,412601,55242,731
top,163 Marsh Wall Docklands Tower Hamlets London ...,8/2/2017,Britannia International Hotel Canary Wharf,United Kingdom,No Negative,No Positive,"[' Leisure trip ', ' Couple ', ' Double Room '...",1 days
freq,4789,2585,4789,245246,127890,35946,5101,2585


In [156]:
data[(data["lat"].isna()) | (data["lng"].isna())]

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng,sample
122,Savoyenstra e 2 16 Ottakring 1160 Vienna Austria,86,8/26/2016,8.3,Austria Trend Hotel Schloss Wilhelminenberg Wi...,United States of America,Not much,3,1558,Bugs in our room Bad Wifi,7,3,2.5,"[' Leisure trip ', ' Couple ', ' Classic Room ...",342 day,,,1
566,23 Rue Damr mont 18th arr 75018 Paris France,21,6/23/2016,8.3,Holiday Inn Paris Montmartre France,United Kingdom,N a,3,298,Great location friendly and very helpful staff,8,2,7.1,"[' Leisure trip ', ' Family with older childre...",406 day,,,1
724,Josefst dter Stra e 10 12 08 Josefstadt 1080 V...,333,8/6/2015,8.3,Flemings Selection Hotel Wien City Austria,United Arab Emirates,The bed so comfortable and room design,8,3672,Lovely location and friendly staff Close the ...,13,4,9.6,"[' Leisure trip ', ' Solo traveler ', ' Superi...",728 day,,,1
754,W hringer Stra e 33 35 09 Alsergrund 1090 Vien...,214,1/4/2016,7.8,Hotel Atlanta Austria,Romania,No Negative,0,2724,location near the city centre quiet clean,8,26,8.8,"[' Leisure trip ', ' Family with older childre...",577 day,,,1
1137,4 rue de la P pini re 8th arr 75008 Paris France,172,1/5/2016,8.8,Maison Albar Hotel Paris Le Diamond France,Luxembourg,The breakfast was to minimalist for this price,9,1524,No Positive,0,1,8.8,"[' Leisure trip ', ' Couple ', ' Executive Dou...",576 day,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515042,4 rue de la P pini re 8th arr 75008 Paris France,172,6/3/2016,8.8,Maison Albar Hotel Paris Le Diamond France,United Kingdom,Cost was considerably cheaper had I booked cl...,11,1524,Staff friendly Excellent Location Clean and s...,9,3,0.0,"[' Leisure trip ', ' Couple ', ' Superior Doub...",426 day,,,0
515133,Sieveringer Stra e 4 19 D bling 1190 Vienna Au...,115,8/8/2016,8.0,Derag Livinghotel Kaiser Franz Joseph Vienna A...,Austria,internet did not work in the room,8,1496,location public transport,4,1,0.0,"[' Leisure trip ', ' Couple ', ' Economy Doubl...",360 day,,,0
515273,Landstra er G rtel 5 03 Landstra e 1030 Vienna...,101,5/1/2017,8.5,Hotel Daniel Vienna Austria,Austria,No Negative,0,1405,Very alternative stylish and cozy place Excel...,37,14,0.0,"[' Leisure trip ', ' Group ', ' Standard Doubl...",94 days,,,0
515600,Savoyenstra e 2 16 Ottakring 1160 Vienna Austria,86,7/19/2016,8.3,Austria Trend Hotel Schloss Wilhelminenberg Wi...,France,There was no warm water The bathroom was very...,12,1558,No Positive,0,2,0.0,"[' Leisure trip ', ' Solo traveler ', ' Classi...",380 day,,,0


In [79]:
#pivot_n = data[['hotel_address', 'hotel_name']].groupby(
#    by='hotel_name'
#).nunique()
#pivot_n[pivot_n['hotel_address'] != 1]

Unnamed: 0_level_0,hotel_address
hotel_name,Unnamed: 1_level_1
Hotel Regina,3


In [None]:
#hotels['hotel_name'] = hotels.apply(
#    lambda row: (row['hotel_name'] + ' ' + row['hotel_address'].split()[-2])
#    if (row['hotel_name'] in name_rep)
#    else row['hotel_name'], axis=1)

In [13]:

print(colored('\nПроверим полученные изменения:', 'blue'))

[34m
Проверим полученные изменения:[0m


In [15]:
# Choosing country name from 'hotel_address'

data["hotel_name"] = data.apply(
    lambda row: (row["hotel_name"]+ ", " +
                 #(re.findall(r".*[(\d+)].(\w*)", row["hotel_address"])[0]) + " " +
                 (re.findall(r".*[(\d+)].\w*.(\w*.{,100})", row["hotel_address"])[0])), axis=1)
    #if (x["lat"]==0 | x["lng"].isna())
    #else x["hotel_name"],

In [128]:
ln = "jnelknw 7846 wef 8746  sts od am"
(re.findall(r".*[(\d+)].\w*.(\w*.{,100})", ln)[0])

'sts od am'

In [59]:
from geopy.geocoders import Bing  # Подключаем библиотеку
geolocator = Bing(api_key="ArAEkjZybNV7puDe4lgO9FsX8VssJ57er2SG0SvcuN3YxL0bZ5U9wZUtPlddLrVx")  # Указываем название приложения (так нужно, да)
adress = str('4 rue de la P pini re 8th arr 75008 Paris France')  # Получаем интересующий нас адрес
location = geolocator.geocode(adress)  # Создаем переменную, которая состоит из нужного нам адреса
#print(location) # Выводим результат: адрес в полном виде
print(location.latitude, location.longitude)  # И теперь выводим GPS-координаты нужного нам адреса

48.8741684 2.31111002


In [18]:
geolocator.geocode(adress).latitude

48.2203284

In [16]:
#data.fillna(0, inplace=True)

In [42]:
def latitude_set(address):
    location = geolocator.geocode(address)
    return location.latitude
def longitude_set(address):
    location = geolocator.geocode(address)
    return location.longitude

In [159]:
data.loc[(data["lat"].isna()), "lat"] = data.loc[(data["lat"].isna()), "hotel_name"].apply(latitude_set)

KeyboardInterrupt: 

In [70]:
hnames = data[data["lng"].isna()].groupby(["hotel_name"])[["hotel_name", "hotel_address"]].agg("unique_values")
#hnames = pd.DataFrame(hnames, columns=["hotel_name"])
#hnames["lat"] = hnames["hotel_name"].apply(latitude_set)
#hnames["lng"] = hnames["hotel_name"].apply(longitude_set)

AttributeError: 'unique_values' is not a valid function for 'DataFrameGroupBy' object

In [69]:
hnames


Unnamed: 0_level_0,hotel_name,hotel_address
hotel_name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Austria Trend Hotel Schloss Wilhelminenberg Wien, Austria",194,194
"City Hotel Deutschmeister, Austria",93,93
"Cordial Theaterhotel Wien, Austria",57,57
"Derag Livinghotel Kaiser Franz Joseph Vienna, Austria",147,147
"Fleming s Selection Hotel Wien City, Austria",658,658
"Holiday Inn Paris Montmartre, France",55,55
"Hotel Advance, Spain",28,28
"Hotel Atlanta, Austria",389,389
"Hotel City Central, Austria",563,563
"Hotel Daniel Vienna, Austria",245,245


In [131]:
def name_repair(name):
    if name.startswith("Fleming s Selection"):
        return name.replace("Fleming s", "Flemings")
    elif name.startswith("Maison Albar Hotel"):
        return name.replace("Op ra", "Le")
    else:
        return name
data["hotel_name"] = data["hotel_name"].apply(name_repair)

In [60]:
#import geocoder
g = geocoder.osm('Hotel Daniel Vienna')
#d = geocoder.osm((45.5331778, 9.1710849), method='reverse', language="en")
g.latlng
#d.address

[48.1828462, 16.378647618524546]

In [None]:
# Create a map
m_1 = folium.Map(tiles='openstreetmap', zoom_start=7) # location=[42.32,-71.0589]

for idx, row in data.head(30).iterrows():
    Marker([row['lat'], row['lng']]).add_to(m_1)

# Display the map
m_1

In [None]:
data['lat'] = data['lat'].fillna(0, inplace=True)
data['lng'] = data['lng'].fillna(0, inplace=True)

In [None]:
data.nunique(dropna=False)

In [None]:
plt.rcParams['figure.figsize'] = (15,10)
sns.heatmap(data.drop(['sample'], axis=1).corr(), annot=True)

In [None]:
# убираем признаки которые еще не успели обработать, 
# модель на признаках с dtypes "object" обучаться не будет, просто выберим их и удалим
object_columns = [s for s in data.columns if data[s].dtypes == 'object']
data.drop(object_columns, axis = 1, inplace=True)

In [None]:
data.info()

In [None]:
# Теперь выделим тестовую часть
train_data = data.query('sample == 1').drop(['sample'], axis=1)
test_data = data.query('sample == 0').drop(['sample'], axis=1)

y = train_data.reviewer_score.values            # наш таргет
X = train_data.drop(['reviewer_score'], axis=1)

In [None]:
# Воспользуемся специальной функцие train_test_split для разбивки тестовых данных
# выделим 20% данных на валидацию (параметр test_size)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [None]:
# проверяем
test_data.shape, train_data.shape, X.shape, X_train.shape, X_test.shape

In [None]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [None]:
# Создаём модель (НАСТРОЙКИ НЕ ТРОГАЕМ)
model = RandomForestRegressor(n_estimators=100, verbose=1, n_jobs=-1, random_state=RANDOM_SEED)

In [None]:
# Обучаем модель на тестовом наборе данных
model.fit(X_train, y_train)

# Используем обученную модель для предсказания рейтинга ресторанов в тестовой выборке.
# Предсказанные значения записываем в переменную y_pred
y_pred = model.predict(X_test)

In [None]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они в среднем отличаются
# Метрика называется Mean Absolute Error (MAE) и показывает среднее отклонение предсказанных значений от фактических.
print('MAPE:', metrics.mean_absolute_error(y_test, y_pred))

In [None]:
# в RandomForestRegressor есть возможность вывести самые важные признаки для модели
plt.rcParams['figure.figsize'] = (10,10)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(15).plot(kind='barh')

In [None]:
test_data.sample(10)

In [None]:
test_data = test_data.drop(['reviewer_score'], axis=1)

In [None]:
sample_submission

In [None]:
predict_submission = model.predict(test_data)

In [None]:
predict_submission

In [None]:
list(sample_submission)

In [None]:
sample_submission['reviewer_score'] = predict_submission
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head(10)