In [1]:
import matplotlib.pyplot as plt
import pandas as pd #this is how I usually import pandas
import sys #only needed to determine Python version number
import matplotlib #only needed to determine Matplotlib version number
import folium
import seaborn as sb
from IPython.display import HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics.pairwise import cosine_similarity
import math


# Enable inline plotting
%matplotlib inline

#### Create a dataframe from data in train.csv

In [2]:
train_path = r'../../data/train.csv'
train_original = pd.read_csv(train_path)
train = train_original.fillna(method='ffill')


# Task 1: Data Exploration

### Query 1.1
#### Most popular room type

In [None]:
q1 = train.groupby(['room_type']).count().sort_values(by='id', ascending=False)
q1 = q1.rename(columns={q1.columns[0]: 'Count'})
q1 = q1.reset_index()
pal = sb.husl_palette(8, h=.8, l=.4)
sb.barplot(x='room_type', y=q1.columns[1], data = q1, palette = pal)
q1.filter(items = ['room_type', 'Count'])

### Query 1.2
#### Price fluctuation over the course of 3 months

In [None]:
q2 = train.groupby(['month']).mean().filter(items=['price'])
q2 = q2.rename(columns={'price':'mean price'})
q2 = q2.reset_index()
sb.lineplot(x='month', y=q2.columns[1], data = q2)
q2

### Query 1.3
#### Top 5 neighbourhoods with the most reviews

In [None]:
q3 = train.groupby(['neighbourhood']).sum()
q3 = q3.sort_values('number_of_reviews', ascending = False)
q3 = q3.head(5).filter(items=['number_of_reviews'])
q3 = q3.reset_index()
pal = sb.husl_palette(8, h=.9, l=.4)
sb.barplot(x='neighbourhood', y=q3.columns[1], data=q3, palette=pal)
q3

### Query 1.4
#### Neighbourhood with the most listings

In [None]:
q4 = train.groupby(['neighbourhood']).count()
q4 = q4.sort_values(['id'], ascending = False).head(5)
q4 = q4.reset_index()
pal = sb.husl_palette(8, l=.4)
sb.barplot(x='neighbourhood', y=q4.columns[1], data = q4, palette=pal)

### Query 1.5
#### Listings per neighbourhood and per month

##### Per neighbourhood

In [None]:
q5_neigh = train.groupby(['neighbourhood']).count()
q5_neigh = q5_neigh.rename(columns={'id': 'Count'})
q5_neigh = q5_neigh.filter(items=['neighbourhood','Count'])
q5_neigh = q5_neigh.reset_index()
fig = plt.subplots(figsize=(15, 17))
chart = sb.barplot(x='neighbourhood', y='Count', data=q5_neigh)
chart.set_xticklabels(chart.get_xticklabels(), rotation=60)
q5_neigh


##### Per month

In [None]:
q5_month = train.groupby(['month']).count()
q5_month = q5_month.rename(columns={'id': 'count'})
q5_month = q5_month.filter(items=['count'])
q5_month = q5_month.reset_index()
pal = sb.husl_palette(8, h=.5)
sb.barplot(x='month', y=q5_month.columns[1], data = q5_month, palette = pal)
q5_month

### Query 1.6
#### A histogram of the variable 'neighbourhood

In [None]:
q6 = train.drop_duplicates(subset='id').groupby(['neighbourhood']).count()
q6 = q6.rename(columns = {'id':'count'})
q6.hist(column = 'count')

### Query 1.7
#### Most popular room type in each neighbourhood

In [None]:
q7 = train.groupby(['neighbourhood','room_type']).count().sort_values('neighbourhood')#.groupby(level=0).head(1)
q7 = q7.rename(columns={q7.columns[0]: 'Count'})
q7 = q7.reset_index(level = ('neighbourhood', 'room_type'))
fig = plt.subplots(figsize=(15, 17))
pal = sb.husl_palette(8, h=.7, l=.6)
chart = sb.barplot(x='neighbourhood', y='Count', hue='room_type', data=q7, palette=pal)
chart.set_xticklabels(chart.get_xticklabels(), rotation=60)
q7.filter(items=['neighbourhood','room_type','Count'])

### Query 1.8
#### Most expensive room type (using mean prices)

In [None]:
q8 = train.groupby(['room_type']).mean()
q8 = q8.sort_values(['price'], ascending=False)
q8 = q8.filter(items=['price'])
q8 = q8.reset_index()
sb.barplot(x='room_type', y=q8.columns[1], data = q8)
q8.head(1)

### Query 1.9
#### A map of some listings in april

In [None]:
map = folium.Map(location = [37.983810, 23.727539], zoom_start=13)
#map

In [None]:
q9 = train[train.month == '04_April'].filter(items=['name', 'description', 'neighbourhood', 'latitude','longitude', 'room_type', 'price'])
q9 = q9.sample(1000, random_state=30)

for index, row in q9.iterrows():
    pop = row['room_type'] + ' ' + '$' +  str(row['price'])
    folium.Marker([row['latitude'], row['longitude']], popup=pop, tooltip = row['name']).add_to(map)

map

In [None]:
#map.save('april_listings.html')

In [None]:
#HTML(filename='april_listings.html')

### Query 1.10
#### Wordclouds

#### From column 'description'

In [None]:
text = ', '.join(train.description).lower()
# text = list(train.description.values)
# print(text)
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'gray',
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (20, 15),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

#### From 'neighbourhood'

In [None]:
text = ', '.join(train.neighbourhood)
# text

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'gray',
    collocations=False,
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

#### For transit

In [None]:
text = ' '.join(train.transit).lower()

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'gray',
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

#### From variable 'last review'

In [None]:
text = ' '.join(train.comments).lower()

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'gray',
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

# Task 2: Recommendation


In [3]:
recommend = train_original.drop_duplicates(subset='id').fillna(value='NULL').filter(items=['id', 'name', 'description'])
# recommend

In [4]:
recommend['name_desc'] = recommend['name'] + recommend['description']
recommend = recommend.drop_duplicates(subset='name_desc')
recommend

Unnamed: 0,id,name,description,name_desc
0,10595,"96m2, 3BR, 2BA, Metro, WI-FI etc...",Athens Furnished Apartment No6 is 3-bedroom ap...,"96m2, 3BR, 2BA, Metro, WI-FI etc...Athens Furn..."
1,10988,"75m2, 2-br, metro, wi-fi, cable TV",Athens Furnished Apartment No4 is 2-bedroom ap...,"75m2, 2-br, metro, wi-fi, cable TVAthens Furni..."
2,10990,"50m2, Metro, WI-FI, cableTV, more",Athens Furnished Apartment No3 is 1-bedroom ap...,"50m2, Metro, WI-FI, cableTV, moreAthens Furnis..."
3,10993,"Studio, metro, cable tv, wi-fi, etc",The Studio is an -excellent located -close t...,"Studio, metro, cable tv, wi-fi, etcThe Studio ..."
4,10995,"47m2, close to metro,cable TV,wi-fi",AQA No2 is 1-bedroom apartment (47m2) -excell...,"47m2, close to metro,cable TV,wi-fiAQA No2 is ..."
...,...,...,...,...
21441,33587252,"sunshiny room, in the center of Athens","sunshiny room with privet bathroom, and privet...","sunshiny room, in the center of Athenssunshiny..."
21442,33595046,Modern Elegant Apartment at Kolonaki,Often hotels and apartments end up neglecting ...,Modern Elegant Apartment at KolonakiOften hote...
21443,33608367,Acropolis Mini Loft,"Located at the Acropolis area in "" Koukaki "" n...",Acropolis Mini LoftLocated at the Acropolis ar...
21444,33628045,"Your Beloved Flat in Plaka, close to everything!","This brand-new, chic decorated studio is the p...","Your Beloved Flat in Plaka, close to everythin..."


In [5]:
text = recommend.name_desc.values
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words= 'english')
X = vectorizer.fit_transform(text)
# print(vectorizer.get_feature_names())

In [6]:
cs = cosine_similarity(X,X)
cσ


(7775, 7775)

In [16]:
# To check the id.
recommend[recommend.id == 7017342] 

Unnamed: 0,id,name,description,name_desc
770,7017342,Room 3,"Квартира находится на 2 ом этаже. Большая, про...",Room 3Квартира находится на 2 ом этаже. Больша...


In [17]:
# To check the id.
recommend[recommend.id == 7423458]

Unnamed: 0,id,name,description,name_desc
801,7423458,Room 1,"Квартира находится на 2 ом этаже. Большая, про...",Room 1Квартира находится на 2 ом этаже. Больша...


In [11]:
scores = []
for i in range(cs.shape[0]):
    for j in range(i+1,cs.shape[1]):
        scores.append((cs[i,j], i, j))

scores = list(filter(lambda x:(math.floor(x[0]) != 1), scores))
scores = sorted(scores, key = lambda x:x[0], reverse = True)
scores = scores[:100]
# scores
similar_ids = {}
for i in range(0,100):
    id = recommend.values[(scores[i])[1]][0]
    similar_ids[id] = recommend.values[(scores[i])[2]][0]

similar_ids

{1223199: 29057234,
 25941499: 25941675,
 30424563: 26497308,
 31509913: 31561260,
 31560654: 31561260,
 32649434: 32650553,
 32650034: 32650553,
 32650458: 32650553,
 18030214: 31450371,
 24425858: 31461934,
 4440310: 32921918,
 23225556: 23225775,
 22669275: 22714490,
 11366771: 11402677,
 31560982: 31561260,
 16473889: 16551779,
 11559962: 11562618,
 11542772: 11559962,
 23664459: 26273246,
 17216563: 17216641,
 8365419: 18652795,
 8326092: 13486998,
 25515948: 26118850,
 26118125: 26118850,
 21847866: 29810117,
 25941657: 25941675,
 17966450: 17966565,
 7017342: 7423458,
 25767004: 27377919,
 27942508: 30338408,
 25649489: 29248657,
 26094355: 26117905,
 15793210: 27774209,
 26093644: 28639109,
 31876674: 31942992,
 25650084: 25936053,
 21388422: 21442536,
 21193882: 21194366,
 23902991: 23942086,
 31509474: 31560802,
 31509785: 31560802,
 30457277: 29506716,
 28712552: 28712157,
 22074163: 22074541,
 22074774: 22075076,
 30439676: 30514220,
 31450371: 31560654,
 28996770: 28999145