Usaré los datos de los comentarios (reviews) y los combinaré con algunas características de los datos detallados de los listados de Berlín, obtenidos del sitio web de Inside Airbnb. Ambos conjuntos de datos se extrajeron el 07 de noviembre de 2018.

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as matplt

import warnings
warnings.filterwarnings("ignore")

import time
import datetime

In [4]:
dataset_reviews = pd.read_csv('reviews_summary.csv')
dataset_reviews.head(5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2015,69544350,2016-04-11,7178145,Rahel,Mein Freund und ich hatten gute gemütliche vie...
1,2015,69990732,2016-04-15,41944715,Hannah,Jan was very friendly and welcoming host! The ...
2,2015,71605267,2016-04-26,30048708,Victor,Un appartement tres bien situé dans un quartie...
3,2015,73819566,2016-05-10,63697857,Judy,"It is really nice area, food, park, transport ..."
4,2015,74293504,2016-05-14,10414887,Romina,"Buena ubicación, el departamento no está orden..."


In [5]:
dataset_listings = pd.read_csv('listings.csv')
dataset_listings.head(6)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2015,Berlin-Mitte Value! Quiet courtyard/very central,2217,Ian,Mitte,Brunnenstr. Süd,52.534537,13.402557,Entire home/apt,60,4,118,2018-10-28,3.76,4,141
1,2695,Prenzlauer Berg close to Mauerpark,2986,Michael,Pankow,Prenzlauer Berg Nordwest,52.548513,13.404553,Private room,17,2,6,2018-10-01,1.42,1,0
2,3176,Fabulous Flat in great Location,3718,Britta,Pankow,Prenzlauer Berg Südwest,52.534996,13.417579,Entire home/apt,90,62,143,2017-03-20,1.25,1,220
3,3309,BerlinSpot Schöneberg near KaDeWe,4108,Jana,Tempelhof - Schöneberg,Schöneberg-Nord,52.498855,13.349065,Private room,26,5,25,2018-08-16,0.39,1,297
4,7071,BrightRoom with sunny greenview!,17391,Bright,Pankow,Helmholtzplatz,52.543157,13.415091,Private room,42,2,197,2018-11-04,1.75,1,26
5,9991,Geourgeous flat - outstanding views,33852,Philipp,Pankow,Prenzlauer Berg Südwest,52.533031,13.416047,Entire home/apt,180,6,6,2018-07-23,0.15,1,137


In [6]:
data_merged = pd.merge(dataset_reviews, dataset_listings[['host_id', 'id','neighbourhood_group','latitude', 'longitude', 'number_of_reviews', 'price']],
                                    left_on='listing_id', right_on='id', how='left')
data_merged.drop(['id_y'], axis=1, inplace=True)
data_merged.head(5)

Unnamed: 0,listing_id,id_x,date,reviewer_id,reviewer_name,comments,host_id,neighbourhood_group,latitude,longitude,number_of_reviews,price
0,2015,69544350,2016-04-11,7178145,Rahel,Mein Freund und ich hatten gute gemütliche vie...,2217,Mitte,52.534537,13.402557,118,60
1,2015,69990732,2016-04-15,41944715,Hannah,Jan was very friendly and welcoming host! The ...,2217,Mitte,52.534537,13.402557,118,60
2,2015,71605267,2016-04-26,30048708,Victor,Un appartement tres bien situé dans un quartie...,2217,Mitte,52.534537,13.402557,118,60
3,2015,73819566,2016-05-10,63697857,Judy,"It is really nice area, food, park, transport ...",2217,Mitte,52.534537,13.402557,118,60
4,2015,74293504,2016-05-14,10414887,Romina,"Buena ubicación, el departamento no está orden...",2217,Mitte,52.534537,13.402557,118,60


In [7]:
# Agrupamos por host para determinar un top de los más comentados
count_per_host = pd.DataFrame(data_merged.groupby('host_id')['listing_id', 'price'].nunique())

# sort unique values descending and show the Top20
count_per_host.sort_values(by=['listing_id'], ascending=False, inplace=True)
count_per_host.head(20)

Unnamed: 0_level_0,listing_id,price
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1625771,44,8
8250486,38,16
2293972,28,16
59442407,19,11
87442687,16,10
54942921,16,15
8912086,15,6
34705166,15,14
79430806,15,10
52747435,15,10


In [8]:
price_max = data_merged[data_merged['price'] == data_merged.groupby('neighbourhood_group')['price'].transform('max')]
count_per_neighbourhood = pd.DataFrame(data_merged.groupby('neighbourhood_group')['listing_id', 'price'].nunique())
count_per_neighbourhood.sort_values(by=['listing_id'], ascending=False, inplace=True)
count_per_neighbourhood.head()

Unnamed: 0_level_0,listing_id,price
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1
Friedrichshain-Kreuzberg,4601,183
Mitte,3831,216
Pankow,2956,171
Neukölln,2915,124
Tempelhof - Schöneberg,1286,132


In [26]:
price_max = price_max[['neighbourhood_group', 'price']].drop_duplicates()
price_max.sort_values(by=['price'], ascending=False, inplace=True)
price_max.head(5)

Unnamed: 0,neighbourhood_group,price
207787,Charlottenburg-Wilm.,9000
400693,Friedrichshain-Kreuzberg,8600
392006,Tempelhof - Schöneberg,6000
262658,Lichtenberg,5000
174187,Pankow,4240


In [10]:
data_merged.isna().sum()

listing_id               0
id_x                     0
date                     0
reviewer_id              0
reviewer_name            0
comments               496
host_id                  0
neighbourhood_group      0
latitude                 0
longitude                0
number_of_reviews        0
price                    0
dtype: int64

In [14]:
data_merged.dropna(inplace=True)

In [24]:
from langdetect import detect

ModuleNotFoundError: No module named 'langdetect'

In [19]:
data_merged['text'] = data_merged['comments'].apply(detect)


NameError: name 'detect' is not defined