In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.set_option("mode.chained_assignment", None)

# Apply the matplotlib style sheets one after another; where two sheets set the
# same parameter, the one listed later wins.
plt.style.use(['classic', 'seaborn-ticks', 'seaborn-darkgrid', 'dark_background'])

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw listings table from the read-only input directory.
listings = pd.read_csv("../input/seattle/listings.csv")

In [None]:
# Display the loaded frame for a first look at its columns.
listings

In [None]:
# Keep only the identifier, the free-text fields and the neighbourhood label.
relevant_columns = [
    "id", "name", "summary", "space", "description",
    "neighbourhood", "neighborhood_overview",
]
listings = listings[relevant_columns]

In [None]:
listings.rename({'neighborhood_overview': 'neighbourhood_overview','id':'listing_id'}, axis=1, inplace=True)

In [None]:
listings

<br>

# Cleaning

In [None]:
listings["neighbourhood"].nunique()

In [None]:
listings.isna().sum()

In [None]:
listings.isna().sum()/len(listings)

In [None]:
listings.isna().sum().sum()

In [None]:
listings.dropna()["neighbourhood"].nunique()

Since these are free-text columns that cannot be sensibly imputed, I will drop the rows containing NaN values; as a consequence, one neighbourhood is lost from the analysis.

In [None]:
listings = listings.dropna()

In [None]:
listings

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(listings["neighbourhood"].value_counts())

In [None]:
calendar = pd.read_csv("../input/seattle/calendar.csv")

In [None]:
calendar.isna().sum()

In [None]:
len(calendar)

In [None]:
pd.isna(calendar["price"].iloc[2])

In [None]:
calendar.dropna()["listing_id"].nunique()

In [None]:
# Strip the currency symbol and thousands separator so the column can be parsed
# as a number. regex=False forces a literal match: as a regular expression "$"
# is the end-of-string anchor, and the default of `regex` differs between
# pandas versions, so the literal intent is stated explicitly.
calendar['price'] = (
    calendar['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [None]:
calendar["price"] = pd.to_numeric(calendar["price"])

In [None]:
mean_price_by_listing = calendar.groupby("listing_id")["price"].mean().to_frame().reset_index()

In [None]:
mean_price_by_listing

In [None]:
mean_price_by_listing["price"] = mean_price_by_listing["price"].round(2)

In [None]:
mean_price_by_listing

In [None]:
listings = pd.merge(listings,mean_price_by_listing)

In [None]:
listings.isna().sum()

I will impute the remaining NaN prices with the mean price of their corresponding neighbourhood.

In [None]:
listings.loc[pd.isnull(listings["price"])==True]

In [None]:
listings["price"] = listings.groupby("neighbourhood")["price"].transform(lambda x: x.fillna(x.mean()))

In [None]:
listings["price"] = listings["price"].round(2)

In [None]:
listings.isna().sum()

In [None]:
listings.iloc[108]

*Note: from here on I refer to each listing's mean price over time simply as **"price"**, since it summarises the actual listing price well.*

<br>

# Exploratory Data Analysis (EDA)

In [None]:
# Distribution plot of the per-listing mean price, titled through the returned Axes.
price_ax = sns.distplot(listings["price"], color="white")
price_ax.set_title("Distribution of prices across listings")

In [None]:
# Pass the values by keyword: newer seaborn releases read the first positional
# argument as `data`, which changes how the Series is laid out. `x=` yields a
# horizontal box of the prices on every seaborn version.
sns.boxplot(x=listings["price"])

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder().fit(listings["neighbourhood"])
listings["nbh_cat"] = encoder.transform(listings["neighbourhood"])

In [None]:
listings

In [None]:
plt.figure(figsize=(10,20))
plt.title("Density of neighbourhoods across listings data")
# value_counts() is sorted by descending frequency, so its index is already the
# desired bar order. Reading the index directly avoids depending on the name of
# the column produced by reset_index(), which differs between pandas versions.
neighbourhood_order = listings["neighbourhood"].value_counts().index
sns.countplot(y=listings["neighbourhood"], order=neighbourhood_order)

In [None]:
top_expensive = listings.groupby("neighbourhood")["price"].mean().to_frame().reset_index().sort_values(["price"], ascending=False).iloc[:10]
top_expensive

In [None]:
top_cheap = listings.groupby("neighbourhood")["price"].mean().to_frame().reset_index().sort_values(["price"], ascending=False).iloc[-10:].iloc[::-1]
top_cheap

In [None]:
plt.title("Top 10 most expensive neighbourhoods")
sns.barplot(x=top_expensive["price"],y=top_expensive["neighbourhood"], palette="coolwarm_r")
plt.xlabel("Mean listing price")

In [None]:
plt.title("Top 10 cheapest neighbourhoods")
sns.barplot(x=top_cheap["price"],y=top_cheap["neighbourhood"], palette="coolwarm")
plt.xlabel("Mean listing price")

In [None]:
def AdjectivesCollector(text_to_process):
    """Return every adjective found in a piece of text, in order of appearance.

    The text is split into sentences; each sentence is word-tokenised and
    part-of-speech tagged with NLTK, and the tokens tagged ``JJ``, ``JJR`` or
    ``JJS`` (adjective, comparative, superlative) are kept.

    Parameters
    ----------
    text_to_process : str
        Raw text to analyse.

    Returns
    -------
    list of str
        The adjective tokens, duplicates included.
    """
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return [
        token
        for sentence in nltk.sent_tokenize(text_to_process)
        for token, tag in nltk.pos_tag(nltk.word_tokenize(str(sentence)))
        if tag in adjective_tags
    ]

In [None]:
full_adj_list = []
for i in range(len(listings)):
    a = AdjectivesCollector(listings["description"][i])
    full_adj_list.append(a)

In [None]:
listings["adjectives"] = full_adj_list

Create a new dataframe in which each neighbourhood holds a flattened corpus (a single list) built from all of its listings; collocations, frequency distributions, etc. can then be computed per neighbourhood.

In [None]:
neighbourhoods = listings["neighbourhood"].unique()

In [None]:
adjectives_per_neighbourhood = listings.groupby(["neighbourhood"])["adjectives"].agg(np.sum).to_frame().reset_index()

In [None]:
# Count listings per neighbourhood. The index and the value column are named
# explicitly so the result has the columns ["neighbourhood", "n_listings"] on
# every pandas version; the default names produced by
# value_counts().reset_index() changed in pandas 2.0.
n_listings = (
    listings["neighbourhood"]
    .value_counts()
    .rename_axis("neighbourhood")
    .reset_index(name="n_listings")
)

In [None]:
adjectives_per_neighbourhood = adjectives_per_neighbourhood.merge(n_listings)

In [None]:
adjectives_per_neighbourhood["n_words"] = adjectives_per_neighbourhood["adjectives"].apply(np.size)

In [None]:
# Wrap every description in a one-element list so that summing per neighbourhood
# concatenates them into a list of descriptions.
listings["description_array"] = listings["description"].apply(lambda x: [str(x)])
corpus_by_neighbourhood = listings.groupby("neighbourhood")["description_array"].agg(np.sum)
# Attach the corpus by neighbourhood name rather than by row position, so the
# result stays correct even if the two frames are ordered or indexed differently.
adjectives_per_neighbourhood["corpus"] = adjectives_per_neighbourhood["neighbourhood"].map(corpus_by_neighbourhood)

In [None]:
adjectives_per_neighbourhood["size_corpus"] = adjectives_per_neighbourhood["corpus"].apply(np.size)

In [None]:
len(adjectives_per_neighbourhood.iloc[0])

In [None]:
adjectives_per_neighbourhood["corpus"][0]

In [None]:
 from nltk.probability import FreqDist
FreqDist(adjectives_per_neighbourhood["adjectives"][2])

**Inspiration:** explore the correlation between how often the word "beautiful" appears and the listing price, the neighbourhood, etc. Be creative.