In [None]:
# Install necessary pakcages
%%capture
#! pip install 

In [1]:
%%capture
import numpy as np
import pandas as pd
import re
import chardet.universaldetector
from IPython.display import display
import category_encoders as ce
from textblob import TextBlob
import nltk
nltk.download([
    "names",
    "stopwords",
    "twitter_samples",
    "movie_reviews",
    "vader_lexicon",
    "wordnet",
    "omw-1.4"
])

# Visualization
from rich import print
from rich.progress import track
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
import plotly
import plotly.express as px
import plotly.graph_objects as go

# Import geo services
import os
import geopy
from geopy import distance
from geopy.geocoders import Bing  # library import
# The Bing Maps key is read from the BING_MAPS_API_KEY environment variable so
# that no credential is stored in the notebook. An empty key is enough to build
# the object; a real key is only needed when geocoding requests are sent.
geolocator = Bing(api_key=os.environ.get("BING_MAPS_API_KEY", ""))
import folium
from folium import Marker
from folium.plugins import MarkerCluster

# Load special module for data split
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk("/kaggle/input/"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
!pip freeze > requirements.txt  # fixing versions of packages

In [2]:
RANDOM_SEED = 42  # fix random seed for repeatability of experiment
DATA_DIR = "Data/"

In [3]:
# Check dataset encoding
detector = chardet.UniversalDetector()
with open(DATA_DIR + "hotels_train.csv", "rb") as fh:
    for line in fh:
        detector.feed(line)
        if detector.done:
            break
detector.close()

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}

In [None]:
# Load data
df_train = pd.read_csv(DATA_DIR + "hotels_train.csv") # dataset for training
df_test = pd.read_csv(DATA_DIR + "hotels_test.csv") # dataset for prediction
sample_submission = pd.read_csv(DATA_DIR + "submission.csv") # submission

In [None]:
df_train.info()

In [None]:
df_train.head(2)

In [None]:
df_test.info()

In [None]:
df_test.head(2)

In [None]:
sample_submission.head(2)

In [None]:
sample_submission.info()

In [None]:
# Join train and test data for feature engineering
df_train["sample"] = 1 # train mark
df_test["sample"] = 0 # test mark
df_test["reviewer_score"] = 0 # we have to predict 'reviewer_score', so now it would be filled with 0 in test data

data = pd.concat([df_train, df_test], sort=False).reset_index(drop=True) # join datasets

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.describe(include="object")

### We will divide the features into two parts:<br>
 * *data related to **hotels**("hotel_address", "average_score", "hotel_name", "lat", "lng")*
 * *data related to **reviewers**(the rest features)*

### "The Features Of The Hotels"
Let's look at missing data in **latitudes** and **longitudes** of hotels geo positions:

In [None]:
def get_nan_cols(df):
    """Display the share of missing values (in percent) per column.

    Only columns that contain at least one missing value are shown, ordered
    from the highest share to the lowest.
    """
    null_percent = df.isnull().mean().round(5) * 100
    has_nulls = null_percent > 0
    display(null_percent[has_nulls].sort_values(ascending=False))

In [None]:
print("[cyan]Percents of missing data[/cyan]")
get_nan_cols(data)
num = data[(data["lat"].isna()) | (data["lng"].isna())].shape[0]
print()
print(F"[cyan]Number of rows with missing data[/cyan]: [cyan bold]{num}[/cyan bold]")

In [None]:
# Look at unique hotels with missing data
hotels_null = data.loc[(data["lat"].isna()) | (data["lng"].isna()),
                       ["hotel_address", "hotel_name", "lat", "lng"]].drop_duplicates()
print("[cyan]Unique hotels with missing data[/cyan]")
display(hotels_null) 

# As I tried a lot of variants, this one is the most effective,
# but long enough for fixing data for correct geocoding
def address_repair(address):
    """Repair hotel addresses whose non-ASCII letters were replaced by spaces.

    The substitutions are applied one after another in the order listed, so a
    later rule also sees text produced by an earlier one.
    """
    substitutions = (
        ("tra e", "traße"),
        ("r nen", "rünen"),
        ("st dt", "städt"),
        ("mr mo", "mremo"),
        ("Sep lveda", "C/Sepulveda"),
        ("Eixample", " "),
        ("W hringer", "Währinger Gürtel"),
        ("G rtel", "Gürtel"),
        ("D bling", " "),
        ("P pini re", "Pépinière"),
        ("Ga t", "Gaîté"),
        ("Bail n", "Bailén"),
        ("Clar s", "Claris"),
    )
    for broken, repaired in substitutions:
        address = address.replace(broken, repaired)
    return address

# NOTE(review): the geocoding calls below are commented out, so "lat"/"lng" in
# hotels_null stay NaN and merging hotels_null back into `data` has no
# coordinates to fill in. Re-enable them (a valid Bing Maps key is required) to
# actually repair the missing coordinates.
hotels_null["hotel_address"] = hotels_null["hotel_address"].apply(address_repair) # repairing hotel addresses
#hotels_null["lat"] = hotels_null["hotel_address"].apply(                        # latitude fixing
#                                    lambda x: geolocator.geocode(x).latitude)
#hotels_null["lng"] = hotels_null["hotel_address"].apply(                        # longitude fixing
#                                    lambda x: geolocator.geocode(x).longitude)

In [None]:
data = data.merge(hotels_null, how="left", on="hotel_name") # add filled data to main dataframe
data["lat"] = data["lat_x"].fillna(data["lat_y"]) # fix latitude in main dataframe
data["lng"] = data["lng_x"].fillna(data["lng_y"]) # fix longitude in main dataframe
data.drop(columns=["lat_x", "lng_x", "lat_y", "lng_y",    # drop unnecessary columns
                   "hotel_address_y"], inplace=True)
data.rename(columns = {"hotel_address_x":"hotel_address"}, inplace = True) # some makeup))

Now let's look at unique hotels throughout dataframe

In [None]:
hotels_unique = data[["hotel_address", "hotel_name", "lat", "lng"]].drop_duplicates()

# Let's look at the map
m_1 = folium.Map(tiles="openstreetmap", zoom_start=5, location=[48.779124, 9.180090]) # create the map

# Add points to the map
hotels = MarkerCluster()
for idx, row in hotels_unique.iterrows():
    hotels.add_child(Marker([row["lat"], row["lng"]]))
m_1.add_child(hotels)

# Display the map
m_1

As we can see, there are 1494 unique hotels with unique addresses in 6 cities. We will create 2 new features:<br>
city of hotel and distance to centre of city from hotel

In [None]:
# Set city of hotel
def get_city_hotel(address):
    """Derive the hotel city from its address string.

    Addresses containing "United Kingdom" map to "London"; for every other
    address the second-to-last whitespace-separated token is returned.
    """
    return "London" if "United Kingdom" in address else address.split()[-2]
hotels_unique["hotel_city"] = hotels_unique["hotel_address"].apply(get_city_hotel) # create hotel_city feature

# Import data of centres of the cities(source: https://simplemaps.com/data/world-cities)
centres_data = pd.read_csv(DATA_DIR + "ExtraData-for-booking-reviews-competitions/centres.csv")
print("[cyan]Dataset with centres coordinates[/cyan]")
display(centres_data)

hotels_unique = hotels_unique.merge(centres_data, how="left", on="hotel_city") # add centres of the cities
hotels_unique["cent_dist"] = hotels_unique.apply(lambda x: distance.distance(              # create distance to centre feature
                                                (x["lat"], x["lng"]),
                                                (x["cent_lat"], x["cent_lng"])).km, axis=1)

# Add data to main dataframe
data = data.merge(hotels_unique, how="left", on=["hotel_name", "lat", "lng"])
data.drop(["hotel_address_x", "hotel_address_y", "hotel_country",# drop unnecessary columns
           "lat", "lng", "cent_lat", "cent_lng"], axis=1, inplace=True)

data["hotel_name"] = data.apply(lambda x: (x["hotel_name"] + ", " + x["hotel_city"]), axis=1) # to distinguish hotels with similar names

Let's encode hotel names

In [None]:
# Choose unique hotels
mask = data["sample"] == 1 # as we have reviewers_score = 0 on sample = 0
hotels_unique = data[mask].groupby(["hotel_name", "total_number_of_reviews"])["reviewer_score"].agg(
                                                ["mean"]).reset_index().rename(columns={"mean":"mean_rev_score"})
mask_hotel = (hotels_unique["mean_rev_score"] >= 9) & (hotels_unique["total_number_of_reviews"] > 350)
filtered_hotels = hotels_unique[mask_hotel]
print(F"[cyan]Number of filtered hotels:[/cyan] [cyan bold]{len(filtered_hotels)}[/cyan bold]")

# Create filtered hotels names feature
hotels_unique["h_name_fix"] = hotels_unique["hotel_name"].apply(lambda name: name
                                                               if name in filtered_hotels["hotel_name"].values
                                                               else "other" )
uh = hotels_unique["h_name_fix"].nunique()
print(F"[cyan]Number of result hotels categories:[/cyan] [cyan bold]{uh}[/cyan bold]")

# Hotels names encoding
hotels_unique["h_name_fix"] = hotels_unique["h_name_fix"].astype("category") # for further encoding
ord_encoder = ce.OrdinalEncoder()
hotels_temp = ord_encoder.fit_transform(hotels_unique[["h_name_fix"]])
hotels_temp.rename(columns = {"h_name_fix":"hotel_enc"}, inplace = True) # rename to distinguish features
hotels_unique = pd.concat([hotels_unique, hotels_temp], axis= 1)

# Gather data into main dataframe
hotels_unique.drop(["h_name_fix", "mean_rev_score"], axis=1, inplace=True) # drop unnecessary column
data = data.merge(hotels_unique, how="left", on=["hotel_name", "total_number_of_reviews"])

Let's encode hotel cities

In [None]:
mask = data["sample"] == 1 # as we have reviewers_score = 0 on sample = 0

# Plot distribution of reviewer scores by city: one horizontal box per city,
# added in the same order as the legend should list them
fig = go.Figure()
for city_name in ("Amsterdam", "Barcelona", "London", "Milan", "Paris", "Vienna"):
    in_city = mask & (data["hotel_city"] == city_name)
    fig.add_trace(go.Box(x=data.loc[in_city, "reviewer_score"], name=f"{city_name} "))

title_settings = dict(text="Distribution of reviewer scores by city",
                      x=0.08, xanchor="left",
                      y=0.9, yanchor="top")
legend_settings = dict(title="cities:",
                       orientation="h",
                       y=1, yanchor="bottom",
                       x=1, xanchor="right")
fig.update_layout(
    font_family="Helvetica",
    title=title_settings,
    xaxis_tickfont_size=14,
    yaxis=dict(tickfont_size=14),
    legend=legend_settings,
    width=1300,
    height=500,
)
fig.show("notebook")


We will divide cities into 3 groups by reviewers scores:
1. Barcelona & Vienna - 3
2. Amsterdam & Milan - 2
3. London & Paris - 1

In [None]:
# Encode cities
def city_enc(city):
    """Encode a hotel city as an ordinal score group.

    Groups: Barcelona and Vienna -> 3, Amsterdam and Milan -> 2,
    London and Paris -> 1.

    Args:
        city: City name as stored in the "hotel_city" column.

    Returns:
        int: Group code 1-3, or 0 for any value that is not one of the six
        known cities (previously such a value raised UnboundLocalError).
    """
    city_groups = {
        "Barcelona": 3, "Vienna": 3,
        "Amsterdam": 2, "Milan": 2,
        "London": 1, "Paris": 1,
    }
    return city_groups.get(city, 0)
data["hotel_city_enc"] = data["hotel_city"].apply(city_enc) 

Since we have both average_score and the reviewers' scores, we can calculate their ratio<br>
If the ratio is less than 1, then the average score of the hotel is growing, and we mark it with 1<br>
In other cases (ratio equal to or greater than 1) it is marked with 0 

In [None]:
# Filter necessary data 
mask = data["sample"] == 1 # as we have reviewers_score = 0 on sample = 0
average_pivot = data[mask].groupby(["hotel_name", "average_score"])["reviewer_score"].agg(                    
                                            ["mean"]).reset_index().rename(columns={"mean":"mean_rev_score"})

# Calculate ratio of scores
average_pivot["growth_index"] = round(average_pivot["average_score"]/average_pivot["mean_rev_score"], 3)  

# Mark growth
average_pivot["hotel_growth_enc"] = average_pivot["growth_index"].apply(lambda x: 1 if x < 1 else 0) 
print(F"[cyan]Result pivot_table[/cyan]")
display(average_pivot)

average_pivot.drop(["mean_rev_score", "growth_index"], axis=1, inplace=True) # drop unnecessary columns
data = data.merge(average_pivot, how="left", on=["hotel_name", "average_score"]) # add new feature

### "Fantastic Reviewers and Where to Find Them"

We have features: **additional_number_of_scoring**(scores without reviews) and **total_number_of_reviews**(scores with reviews).<br>
Well, scores without reviews are rather **suspicious**, so let's calculate the ratio of number of scores without reviews and number of all scores. 

In [None]:
# Calculate ratio of scores
all_scores = data["additional_number_of_scoring"] + data["total_number_of_reviews"] 
data["score_ratio"] = round(data["additional_number_of_scoring"] / all_scores, 2) * 100  # calculating in percents
print(F"[cyan]Unique ratio values[/cyan]")
pprint(data["score_ratio"].value_counts(), width=79, compact=True)

""" 
    As we can see, there are different values from 2 percents to 28 percents.
    I think it is very suspicious, when ratio was higher than 25 percents.
    So let's devide ration in 3 groups: ratio <= 20%, 20 < ratio < 25, ratio >= 25% 
"""

# Encode level of suspicion
def ratio_encoding(ratio):
    """Encode the share of scores without a review as a level of suspicion.

    Args:
        ratio: Share of scores without reviews, in percent.

    Returns:
        int: 0 for ratio <= 20, 1 for 20 < ratio < 25, 2 for ratio >= 25.
        A NaN ratio fails every comparison and is encoded as 0 (previously it
        raised UnboundLocalError).
    """
    if ratio >= 25:
        return 2
    if ratio > 20:
        return 1
    return 0

data["suspicion_enc"] = data["score_ratio"].apply(ratio_encoding)
data.drop(["score_ratio"], axis=1, inplace=True)
print()
print(F"[cyan]Amount of hotels by level of suspicion[/cyan]")
data["suspicion_enc"].value_counts()  # show amount of hotels by level of suspicion

Let's make **date** become date and **days** become number

In [None]:
# review_date fix
data["review_date"] = pd.to_datetime(data["review_date"]) 

# days_since_review fix
data["days_since_review"] = data["days_since_review"].apply(lambda x: int(x.split()[0]))

Now we will look at **reviewer nationality**.<br>
Reviewers' nationalities (countries) are mapped to their continents and then encoded.<br>
The data of countries and their continents was downloaded from: https://worldpopulationreview.com/country-rankings/list-of-countries-by-continent<br>
and adapted for our purposes. 

In [None]:
# Load data of countries by continents
continents = pd.read_csv(DATA_DIR + "ExtraData-for-booking-reviews-competitions/continents.csv") 
print("[cyan]Data of countries by continents[/cyan]")
display(continents)

# Add data to main dataframe
data["reviewer_nationality"] = data["reviewer_nationality"].apply(lambda x: x.strip())
data = data.merge(continents, how="left", left_on="reviewer_nationality", right_on="country")

# Now lets check our data
print("[cyan]Columns with nulls[/cyan]")
get_nan_cols(data)
print()
print("[cyan]Null data[/cyan]")
display(data.loc[data["continent"].isna(), "reviewer_nationality"].value_counts())

# Oooops! We have NaN data for unrecognized countries
# But the amount of missing values is rather small, so just let's fill them with mode
# The result is assigned back: an in-place fillna on a selected column is not
# guaranteed to change `data` itself (pandas copy-on-write).
data["continent"] = data["continent"].fillna(data["continent"].mode()[0])

# Now let's look at the distribution of reviewers score by continents
mask = data["sample"] == 1 # as we have reviewers_score = 0 on sample = 0
fig = px.box(
    data[mask],
    x = "reviewer_score",
    y = "continent",
    color = "continent", 
    width= 1300,
    height= 500,
    labels = {"reviewer_score":"Reviewers' Scores"}
)
fig.update_layout(
    font_family="Helvetica",
    title={"text": "Distribution of reviewers' scores by reviewers' continents",
            "x":0.1, "xanchor": "left",
            "y":0.98, "yanchor": "top",},
    xaxis=dict(tickfont_size=14),
    yaxis=dict(tickfont_size=14, title=""),
    legend=dict(title = "continets:", 
                orientation="h", 
                y=1, yanchor="bottom", 
                x=1, xanchor="right")
)
fig.show("notebook")

As we can see above, there is very clear differentiation of reviewers scores by continents.<br>
Now let's encode continents by scoring

In [None]:
# Encode reviewers continents
def continents_encode(continent):
    """Encode a reviewer's continent as an ordinal code.

    Args:
        continent: Continent name as stored in the "continent" column.

    Returns:
        int: North America -> 6, Oceania -> 5, South America -> 4,
        Africa -> 3, Europe -> 2, Asia -> 1, and 0 for any other value
        (previously such a value raised UnboundLocalError).
    """
    continent_codes = {
        "North America": 6,
        "Oceania": 5,
        "South America": 4,
        "Africa": 3,
        "Europe": 2,
        "Asia": 1,
    }
    return continent_codes.get(continent, 0)

data["rev_continent_enc"] = data["continent"].apply(continents_encode) # encode reviewers' continents
data.drop(["reviewer_nationality", "country", "continent"], axis=1, inplace=True) # drop unnecessary columns

**"TAGS"**

In [None]:
# Tags overview
def raw_tag_grab(tag):
    """Split the raw "tags" string into a list of clean, lower-cased tags.

    The surrounding brackets, the quote marks and the padding spaces of the
    stringified list are removed before the text is split on commas.
    """
    trimmed = tag.lower().lstrip("[' ").rstrip(" ']")
    unquoted = trimmed.replace("' ", "").replace(" '", "")
    return [piece.strip() for piece in unquoted.split(",")]

data["tag_list"] = data["tags"].apply(raw_tag_grab) # make list of tags
data["num_tag"] = data["tag_list"].apply(lambda tag_list: len(tag_list)) # calculate length of tags

# Look at the structure of the tags
mn = data["num_tag"].min() 
print(F"[cyan]The minimum number of tags:[/cyan] [cyan bold]{mn}[/cyan bold]")
pprint(data.loc[data["num_tag"]==1, "tag_list"], width=79, compact=True)
print()
mx = data["num_tag"].max() 
print(F"[cyan]The maximum number of tags:[/cyan] [cyan bold]{mx}[/cyan bold]")
pprint(data.loc[data["num_tag"]==6, "tag_list"], width=79, compact=True)

As we can see, **the maximum** number of tags is **6**. They have the structure:
 - with pet * 
 - type of trip *
 - type of reviewer
 - room description *
 - amount of stayed nights *
 - review was submitted from a mobile device *<br>

\* - *maybe nothing if less than the maximum*

**The minimum** number of tags is **1**, there is only type of reviewer.<br>
Now we can encode these tags

In [None]:
# Encode with a pet tag
def pet_tag_set(tag_list):
    """Return 1 if the "with a pet" tag is present, else 0.

    A found tag is removed from tag_list in place.
    """
    has_pet = "with a pet" in tag_list
    if has_pet:
        tag_list.remove("with a pet")
    return int(has_pet)

# Encode type of trip tag
def trip_tag_set(tag_list):
    """Encode the trip type: 2 for a leisure trip, 1 for a business trip, 0 if neither.

    The leisure tag takes precedence; the matched tag is removed from tag_list
    in place.
    """
    for code, label in ((2, "leisure trip"), (1, "business trip")):
        if label in tag_list:
            tag_list.remove(label)
            return code
    return 0

# Encode number of stayed nights tag
def nights_tag_set(tag_list):
    """Extract the number of nights from a "stayed N night(s)" tag.

    Args:
        tag_list: List of tag strings; the matching tag is removed in place.

    Returns:
        int: Number of nights from the first matching tag, or 0 when the list
        has no such tag (including an empty list).
    """
    for tag in tag_list:
        found = re.match(r"stayed\s(\d+)\snights?", tag)
        if found:
            # Return straight after removing, so the list is never iterated
            # after being modified and later tags cannot reset the result.
            tag_list.remove(tag)
            return int(found.group(1))
    return 0

# Encode submitted from a mobile device tag
def mob_dev_tag_set(tag_list):
    """Return 1 if the review was submitted from a mobile device, else 0.

    A found tag is removed from tag_list in place.
    """
    mobile_tag = "submitted from a mobile device"
    from_mobile = mobile_tag in tag_list
    if from_mobile:
        tag_list.remove(mobile_tag)
    return int(from_mobile)

data["pet_tag"] = data["tag_list"].apply(pet_tag_set) 
sum_pet = data["pet_tag"].sum()
print(F"[cyan]Number of reviewers with pet:[/cyan] [cyan bold]{sum_pet}[/cyan bold]")

data["trip_tag"] = data["tag_list"].apply(trip_tag_set)
bus_trip = data.loc[data["trip_tag"] == 1, "trip_tag"].count()
print(F"[cyan]Number of reviewers on a business trip:[/cyan] [cyan bold]{bus_trip}[/cyan bold]")

data["nights_tag"] = data["tag_list"].apply(nights_tag_set)
nght = data.loc[data["nights_tag"] == 1, "nights_tag"].count()
print(F"[cyan]Number of reviewers stayed for 1 night:[/cyan] [cyan bold]{nght}[/cyan bold]")

data["mob_dev_tag"] = data["tag_list"].apply(mob_dev_tag_set)
mob_dev = data["mob_dev_tag"].sum()
print(F"[cyan]Number of reviewers submitted from a mobile device:[/cyan] [cyan bold]{mob_dev}[/cyan bold]")

In [None]:
# Get sets of unique reviewers' types and types of rooms
room_set = set()
traveller_set = set()


def room_type_grab(tag_list):
    """Add the second tag of tag_list (if there is one) to room_set."""
    if len(tag_list) <= 1:
        return
    room_set.add(tag_list[1])


def trav_type_grab(tag_list):
    """Add the first tag of tag_list to traveller_set."""
    traveller_set.add(tag_list[0])

data["tag_list"].apply(room_type_grab)
print(F"[cyan]Number of unique types of rooms:[/cyan] [cyan bold]{len(room_set)}[/cyan bold]")

data["tag_list"].apply(trav_type_grab)
# "\n" (a real line break, not the literal text "/n") puts the set on its own line
print(F"[cyan]Unique reviewers' types:[/cyan] \n[cyan bold]{traveller_set}[/cyan bold]")

# Create reviewer's type feature
data["rev_type"] = data["tag_list"].apply(lambda x: x[0] 
                                          if (x[0] in traveller_set) 
                                          else "other")  

# Create features from type_of_room tag:
def type_of_room_set(tag_list):
    """Classify the room description (the second tag) into a coarse room type.

    Precedence: "premier" first, then rooms without a window, then the
    remaining keywords in the order listed below. Returns "other" when
    nothing matches or when tag_list has no second tag.
    """
    if len(tag_list) <= 1:
        return "other"
    room = tag_list[1]

    if "premier" in room:
        return "premier"
    if ("no window" in room) or ("without window" in room):
        return "no window"

    remaining_types = ("superior", "deluxe", "classic", "guestroom", "standard",
                       "king", "queen", "club", "luxury", "suite", "executive",
                       "junior", "family", "basic")
    for keyword in remaining_types:
        if keyword in room:
            return keyword
    return "other"


data["room_type"] = data["tag_list"].apply(type_of_room_set)  # create type of room feature
data["roomX2"] = data["tag_list"].apply(lambda x: 1                                # feature for double rooms
                                        if ((len(x)>1) and (("double" in x[1]) or  
                                        ("twin" in x[1]) or ("duplex" in x[1]))) 
                                        else 0)
data["roomX3"] = data["tag_list"].apply(lambda x: 1                            # feature for triple rooms
                                        if ((len(x)>1) and ("triple" in x[1])) 
                                        else 0)  
data["roomX4"] = data["tag_list"].apply(lambda x: 1                               # feature for quadruple rooms
                                        if ((len(x)>1) and ("quadruple" in x[1])) 
                                        else 0)  
data["with_view"] = data["tag_list"].apply(lambda x: 1                          # feature for rooms with nice view 
                                           if ((len(x)>1) and ("view" in x[1])) 
                                           else 0)          


In [None]:
# Encode reviewers' types and types of rooms
# Reviewers' types
rev_encoder = ce.OneHotEncoder(cols=["rev_type"], use_cat_names=True)
rev_data= rev_encoder.fit_transform(data["rev_type"])
data = pd.concat([data, rev_data], axis=1)

# Types of rooms
room_encoder = ce.OneHotEncoder(cols=["room_type"], use_cat_names=True)
room_data = room_encoder.fit_transform(data["room_type"])
data = pd.concat([data, room_data], axis=1)

# Make heavy light
data[
    ["hotel_enc", "hotel_city_enc", "hotel_growth_enc", "suspicion_enc", 
      "rev_continent_enc", "num_tag", "pet_tag", "trip_tag", "nights_tag", 
      "mob_dev_tag", "roomX2", "roomX3", "roomX4", "with_view", "rev_type_couple", 
      "rev_type_solo traveler", "rev_type_family with young children",
      "rev_type_group", "rev_type_family with older children", 
      "rev_type_travelers with friends", "room_type_suite", "room_type_standard", 
      "room_type_other", "room_type_superior", "room_type_king", "room_type_luxury", 
      "room_type_executive", "room_type_family", "room_type_deluxe", 
      "room_type_no window", "room_type_premier", "room_type_basic", 
      "room_type_classic", "room_type_queen", "room_type_guestroom", "room_type_club", 
      "room_type_junior"]
    ] = data[
    ["hotel_enc", "hotel_city_enc", "hotel_growth_enc", "suspicion_enc", 
      "rev_continent_enc", "num_tag", "pet_tag", "trip_tag", "nights_tag", 
      "mob_dev_tag", "roomX2", "roomX3", "roomX4", "with_view", "rev_type_couple", 
      "rev_type_solo traveler", "rev_type_family with young children",
      "rev_type_group", "rev_type_family with older children", 
      "rev_type_travelers with friends", "room_type_suite", "room_type_standard", 
      "room_type_other", "room_type_superior", "room_type_king", "room_type_luxury", 
      "room_type_executive", "room_type_family", "room_type_deluxe", 
      "room_type_no window", "room_type_premier", "room_type_basic", 
      "room_type_classic", "room_type_queen", "room_type_guestroom", "room_type_club", 
      "room_type_junior"]
     ].astype("int8")

In [None]:
# Let's look at the distribution of reviewers' scores by lengths of tags
mask = data["sample"] == 1 # as we have reviewers_score = 0 on sample = 0

# One vertical box per number of tags (1 to 6)
fig = go.Figure()
for tag_count in range(1, 7):
    trace_name = "1 tag " if tag_count == 1 else f"{tag_count} tags "
    has_count = mask & (data["num_tag"] == tag_count)
    fig.add_trace(go.Box(y=data.loc[has_count, "reviewer_score"], name=trace_name))

title_settings = dict(text="Distribution of reviewers' scores by number of tags",
                      x=0.07, xanchor="left",
                      y=0.9, yanchor="top")
legend_settings = dict(title="number of tags:",
                       orientation="h",
                       y=1, yanchor="bottom",
                       x=1, xanchor="right")
fig.update_layout(
    font_family="Helvetica",
    title=title_settings,
    xaxis=dict(tickfont_size=14),
    yaxis=dict(tickfont_size=14, title="Score"),
    legend=legend_settings,
    width=1200,
    height=600,
)
fig.show("notebook")

In [None]:
# As we can see, there is direct correlation between reviewers' scores and number of tags
# So, let's encode them
def num_tag_enc(length):
    """Encode the number of tags of a review as an ordinal group.

    Args:
        length: Number of tags in the review.

    Returns:
        int: 1 for one tag, 2 for two tags, 3 for three to five tags,
        4 for six tags. Values outside 1..6 are clamped to the nearest group
        (below 1 -> 1, above 6 -> 4); previously they raised
        UnboundLocalError.
    """
    if length <= 1:
        return 1
    if length == 2:
        return 2
    if length <= 5:
        return 3
    return 4

data["num_tag_enc"] = data["num_tag"].apply(num_tag_enc)

In [None]:
# Let's look at the distribution of reviewers' scores by reviewers' types
mask = data["sample"] == 1 # as we have reviewers_score = 0 on sample = 0
rev_types_pivot = pd.pivot_table(data[mask],
                                 index="rev_type", 
                                 values="reviewer_score", 
                                 aggfunc="mean").reset_index()
fig = px.bar(
    rev_types_pivot,
    x = "rev_type",
    y = round(rev_types_pivot["reviewer_score"], 3),
    color = "rev_type",
    labels = {"rev_type":"Reviewers' types"},
    width= 1200,
    height= 600,
    text_auto = True
)
fig.update_layout(
    font_family="Helvetica",
    title={"text": "Distribution of reviewers' scores by reviewers' types",
            "x":0.07, "xanchor": "left",
            "y":0.97, "yanchor": "top",},
    xaxis=dict(tickfont_size=14),
    yaxis=dict(tickfont_size=14, title = "Scores"),
    legend=dict(title = "reviewers' types:"))
fig.update_traces(textfont_size=14, textposition="outside")
fig.show("notebook")

As we can see, mean reviewers' scores are mostly similar for each type of traveller. But the **lowest values** are for **solo travellers**<br>
and for **families with young children**. I think we get this result because people of these types have **less emotional involvement**<br>
and so they are **more critical**.

In [None]:
# Let's look at the distribution of reviewers' scores by types of rooms
mask = data["sample"] == 1 # as we have reviewers_score = 0 on sample = 0
room_types_pivot = pd.pivot_table(data[mask],
                                 index="room_type", 
                                 values="reviewer_score", 
                                 aggfunc="mean").reset_index()
fig = px.bar(
    room_types_pivot,
    x = "room_type",
    y = round(room_types_pivot["reviewer_score"], 2),
    color = "room_type",
    labels = {"room_type":"Types of Rooms"},
    width= 1200,
    height= 600,
    text_auto = True
)
fig.update_layout(
    font_family="Helvetica",
    title={"text": "Distribution of reviewers' scores by types of rooms",
            "x":0.07, "xanchor": "left",
            "y":0.97, "yanchor": "top",},
    xaxis=dict(tickfont_size=14),
    yaxis=dict(tickfont_size=14, title = "Scores"),
    legend=dict(title = "types of rooms:"))
fig.update_traces(textfont_size=14, textposition="outside")
fig.show("notebook")

On the plot above we can note the following: **the lowest scores** are for tags which characterize the rooms as **basic, guestroom or room without window**.<br> On the other hand **the highest scores** are for tags which tell us about the **premium segment** of rooms.<br>
I think, it’s pretty obvious.

In [None]:
# Drop unnecessary columns
data.drop(["tags" ,"tag_list", "room_type", "rev_type", "num_tag"], axis=1, inplace=True)

### "Reviews"

In [None]:
data.head(5)

In [None]:
# Words to ignore in reviews: English stop words plus lower-cased first names.
# Kept as a set because it is only used for membership tests, which are run
# for every token of every review.
garbage = set(nltk.corpus.stopwords.words("english"))
garbage.update(w.lower() for w in nltk.corpus.names.words())

In [None]:
def get_pos_rev(rev):
    """Turn a raw positive review into a space-separated string of lemmas.

    The text is lower-cased and stripped first. Placeholder texts that mean
    "nothing positive" are replaced by the single marker word "negative".
    The text is then tokenised, and every purely alphabetic token longer than
    one character that is not in `garbage` is lemmatised.

    Uses the notebook-level names `garbage`, `lemmatizer` and
    `get_wordnet_pos`.

    Args:
        rev: Raw review text.

    Returns:
        str: The lemmas joined by single spaces; "" when no token survives
        the filter.
    """
    text = rev.lower().strip()
    if text in ("no positive", "nothing", "na", "n a", "nohing"):
        text = "negative"
    lemmas = [
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in nltk.word_tokenize(text)
        if token.isalpha() and len(token) > 1 and token not in garbage
    ]
    return " ".join(lemmas)
# NOTE(review): `lemmatizer` and `get_wordnet_pos` are defined in a later cell
# ("Lemmatize with POS Tag"); that cell has to be run before this line.
# word_tokenize / pos_tag presumably also need the "punkt" and
# "averaged_perceptron_tagger" NLTK resources, which are not in the download
# list of the import cell - confirm they are installed.
data["positive_review_fixed"] = data["positive_review"].apply(get_pos_rev)

In [None]:
def get_neg_rev(rev):
    """Turn a raw negative review into a space-separated string of lemmas.

    The text is lower-cased and stripped first. Placeholder texts that mean
    "nothing negative" are replaced by the single marker word "positive".
    The text is then tokenised, and every purely alphabetic token longer than
    one character that is not in `garbage` is lemmatised.

    Uses the notebook-level names `garbage`, `lemmatizer` and
    `get_wordnet_pos`.

    Args:
        rev: Raw review text.

    Returns:
        str: The lemmas joined by single spaces; "" when no token survives
        the filter.
    """
    text = rev.lower().strip()
    if text in ("no negative", "nothing", "na", "n a", "nohing"):
        text = "positive"
    lemmas = [
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in nltk.word_tokenize(text)
        if token.isalpha() and len(token) > 1 and token not in garbage
    ]
    return " ".join(lemmas)
# NOTE(review): `lemmatizer` and `get_wordnet_pos` are defined in a later cell
# ("Lemmatize with POS Tag"); that cell has to be run before this line.
# The negative reviews get their own column; this cell previously re-applied
# get_pos_rev to "positive_review".
data["negative_review_fixed"] = data["negative_review"].apply(get_neg_rev)

In [None]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to pass to lemmatize() for one word.

    The word is tagged on its own with nltk.pos_tag and the first letter of
    the tag selects adjective, verb or adverb; every other tag falls back to
    noun.
    """
    tag_initial = nltk.pos_tag([word])[0][1][0].upper()
    if tag_initial == "J":
        return wordnet.ADJ
    if tag_initial == "V":
        return wordnet.VERB
    if tag_initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()
# 2. Lemmatize Single Word with the appropriate POS tag
word = 'feet'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))
# 3. Lemmatize a Sentence with the appropriate POS tag
sentence = "The striped bats are hanging on their feet for best"
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])
#> ['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']

In [None]:
pd.set_option('display.max_rows', None)
data["negative_review"] = data["negative_review"].apply(lambda x: x.lower().strip()) 
def grab_text(text):
    """Replace "nothing to complain about" placeholders by the word "positive".

    Args:
        text: Negative-review text; the caller lower-cases it beforehand.

    Returns:
        str: "positive" when the stripped text is one of the known
        placeholders or is shorter than two characters, otherwise the
        stripped text.
    """
    # Every entry is a separate literal: a missing comma between two string
    # literals silently merges them into one entry.
    no_complaint_markers = (
        "no negative", "nothing", "na", "n a", "nohing", "", " ", "nil", "no", "non", "all",
        "nada", "nithing", "noting", "nope", "nothings", "nothin", "zero", "exclent", "none", "null",
        "none really", "nothing really", "all good", "no complaints", "nothing at all", "absolutely nothing",
        "nothing to dislike", "everything was perfect", "nothing all good", "everything was good",
    )
    text = text.strip()
    if len(text) < 2 or text in no_complaint_markers:
        return "positive"
    return text

data["negative_review"] = data["negative_review"].apply(grab_text) 
data.loc[data["review_total_negative_word_counts"] < 5, "negative_review"].value_counts()


In [None]:
data.loc[data["review_total_negative_word_counts"] == 2, "negative_review"].value_counts()

In [None]:
get_nan_cols(data)

In [None]:
plt.rcParams['figure.figsize'] = (15,10)
# Correlate numeric columns only: `data` still holds text and datetime columns
# here, and corr() raises on those in pandas >= 2.0.
numeric_features = data.drop(['sample'], axis=1).select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_features.corr(), annot=True)

In [None]:
# Remove the features that have not been processed yet.
# The model cannot be trained on non-numeric columns, so select them and drop them.
# This covers "object" columns and also the datetime column "review_date".
object_columns = list(data.select_dtypes(exclude=["number", "bool"]).columns)
data.drop(object_columns, axis = 1, inplace=True)

In [None]:
data.info()

In [None]:
# Now split the combined frame back into its training and test parts
train_data = data.query('sample == 1').drop(['sample'], axis=1)
test_data = data.query('sample == 0').drop(['sample'], axis=1)

y = train_data.reviewer_score.values            # our target
X = train_data.drop(['reviewer_score'], axis=1)

In [None]:
# Use train_test_split to split the training data;
# 20% of the rows are held out for validation (test_size parameter)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [None]:
# проверяем
test_data.shape, train_data.shape, X.shape, X_train.shape, X_test.shape

In [None]:
# Import the required libraries:
from sklearn.ensemble import RandomForestRegressor # tool for creating and training the model
from sklearn import metrics # tools for evaluating model accuracy

In [None]:
# Create the model (DO NOT CHANGE THE SETTINGS)
model = RandomForestRegressor(n_estimators=100, verbose=1, n_jobs=-1, random_state=RANDOM_SEED)

In [None]:
# Train the model on the training split
model.fit(X_train, y_train)

# Use the trained model to predict reviewer scores for the validation split.
# The predicted values are stored in y_pred
y_pred = model.predict(X_test)

In [None]:
# Compare the predicted values (y_pred) with the real ones (y_test) and see how much they differ on average.
# The metric is Mean Absolute Error (MAE): the average absolute deviation of the predictions from the actual values.
# The printed label names the metric that is actually computed (it used to say "MAPE").
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

In [None]:
# RandomForestRegressor can report which features matter most to the model
plt.rcParams['figure.figsize'] = (10,10)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(15).plot(kind='barh')

In [None]:
test_data.sample(10)

In [None]:
test_data = test_data.drop(['reviewer_score'], axis=1)

In [None]:
sample_submission

In [None]:
predict_submission = model.predict(test_data)

In [None]:
predict_submission

In [None]:
list(sample_submission)

In [None]:
sample_submission['reviewer_score'] = predict_submission
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head(10)