# packages and db

In [148]:
import pymongo
import pandas as pd
myclient = pymongo.MongoClient('mongodb://localhost:27017')
scrapedb = myclient['scrapedb']
adcollection = scrapedb['ads']

pd.set_option("display.max_rows", None, "display.max_columns", None)

In [149]:
q0 = list(adcollection.aggregate([
	{'$group':{'_id':'global',
			'scraped_date_max':{'$max':'$scraped timestamp'}
		}
  
  }
	
]))[0]['scraped_date_max']
	
print(f"Last item scraped: {q0}")

Last item scraped: 2020-06-14 19:39:07.672000


# COUNTS

## number of all documents

In [150]:
q1 = adcollection.count_documents({})
print(f"Number of all documents: {q1}")

Number of all documents: 6203


## number of scraped documents per week

In [151]:
q2 = list(adcollection.aggregate([
    {'$project':{'yearweek':{'$concat':[{'$toString':{'$year':'$scraped timestamp'}},
                                       {'$toString':{'$week':'$scraped timestamp'}}]
                            }
                }
    },
    {'$group':{'_id':'$yearweek', 'scraped_documents_count':{'$sum':1}}}
    
    
]))

df2 = pd.json_normalize(q2)\
    .rename(columns = {'_id':'yearweek'})\
    .sort_values('yearweek')
df2

Unnamed: 0,yearweek,scraped_documents_count
3,202019,312
2,202020,305
0,202021,800
4,202022,1594
1,202023,1597
5,202024,1595


## number of distinct ads

In [152]:
q3 = len(adcollection.distinct('properties.link'))
print(f"Number of distinct ads: {q3}")

Number of distinct ads: 3418


## number of documents with tags

In [153]:
q4 = adcollection.count_documents({'tags.1':{'$exists':True}})
print(f"Number of documents with atleast 1 tag: {q4}")

Number of documents with atleast 1 tag: 2753


## counts of tags

In [154]:
q4_1 = list(adcollection.aggregate([

	{'$unwind':'$tags'},
	{'$group':{'_id':'$tags', 'tag_count':{'$sum':1}
		}
  
  }
	
]))
	
df4_1 = pd.json_normalize(q4_1)\
    .rename(columns = {'_id':'tag'})\
    .sort_values('tag_count', ascending = False)
df4_1.head(20)

Unnamed: 0,tag,tag_count
73,Výťah,2119
49,Plastové okná,1552
80,Nákupné centrum v blízkosti,1540
65,Blízkosť zastávky MHD,1540
6,Balkón,1200
58,Tichá lokalita,1179
70,Škola v blízkosti,1167
17,Zateplený bytový dom,1059
42,Nádherný výhľad,949
56,loggia,934


## number of documents with map coordinates

In [155]:
q5 = adcollection.count_documents({'mapcoord.lon':{'$exists':True}})
print(f"Number of documents with map coordinates: {q5}")

Number of documents with map coordinates: 5364


## counts of keys in properties

In [156]:
q5_1 = list(adcollection.aggregate([
	{'$project':{
		'properties':{'$objectToArray':'$properties'}
		}
  
  },
	{'$unwind':'$properties'},
	{'$group':{'_id':'$properties.k',
			'documents_count':{'$sum':1}
		}
  
  }
	
]))
	
df5_1 = pd.json_normalize(q5_1)\
    .rename(columns = {'_id':'property'})\
    .sort_values('documents_count', ascending = False)
df5_1

Unnamed: 0,property,documents_count
11,Balkón / loggia,6203
13,Lokalita,6203
20,Kategória,6203
18,link,6203
15,Pivnica,6203
14,Cena dohodou,6203
21,Aktualizácia,6203
10,Úžitková plocha,6175
4,Provízia v cene,6007
0,Cena,6007


## counts of distinct values in properties

In [157]:
q5_2 = list(adcollection.aggregate([
	{'$project':{
		'properties':{'$objectToArray':'$properties'}
		}
  
  },
	{'$unwind':'$properties'},
	{'$group':{'_id':'$properties.k',
			'unique_values':{'$addToSet': "$properties.v"}
		}
    },
     {'$project':{'_id':1,
                 'unique_values_count':{'$size':'$unique_values'}}
     
     }
  

	
]))
	
df5_2 = pd.json_normalize(q5_2)\
    .rename(columns = {'_id':'property'})\
    .sort_values('unique_values_count', ascending = False)
df5_2

Unnamed: 0,property,unique_values_count
18,link,3418
8,Identifikačné číslo:,2519
21,Aktualizácia,1803
16,Cena za meter,1518
0,Cena,988
7,Ulica,712
10,Úžitková plocha,210
13,Lokalita,65
1,Zastavaná plocha,49
5,Pozemok m2,43


## number of documents per flat category

In [158]:
q6 = list(adcollection.aggregate([
{'$group':{'_id':'$properties.Kategória', 'documents_count':{'$sum':1}}}]))

df6 = pd.json_normalize(q6)\
    .rename(columns = {'_id':'flat_category'})\
    .sort_values('documents_count', ascending = False)
df6

Unnamed: 0,flat_category,documents_count
0,2 izbový byt,2119
1,3 izbový byt,2009
5,4 izbový byt,871
6,1 izbový byt,629
3,Garsónka,266
8,5 izbový byt a viac,166
7,Dvojgarsónka,87
2,Mezonet,26
4,Apartmán,17
9,Iný byt,13


## number of documents per state and category

In [159]:
q8 = list(adcollection.aggregate([
{'$group':{'_id':{'kategoria':'$properties.Kategória', 'stav':'$properties.Stav'}, 'documents_count':{'$sum':1}}}]))

df8 = pd.json_normalize(q8)\
    .rename(columns = {'_id.kategoria':'category', '_id.stav':'state'})\
    .sort_values(['state', 'documents_count'], ascending = False)
df8 = df8[['state', 'category', 'documents_count']]
df8 = df8.pivot(index='state', columns='category', values='documents_count')
df8['state_count'] = df8.sum(axis = 1)
df8 = df8.sort_values('state_count', ascending = False)
df8

category,1 izbový byt,2 izbový byt,3 izbový byt,4 izbový byt,5 izbový byt a viac,Apartmán,Dvojgarsónka,Garsónka,Iný byt,Mezonet,state_count
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kompletná rekonštrukcia,178.0,869.0,882.0,296.0,23.0,2.0,39.0,102.0,4.0,9.0,2404.0
,244.0,732.0,471.0,308.0,97.0,10.0,6.0,34.0,3.0,11.0,1916.0
čiastočná rekonštrukcia,108.0,274.0,400.0,176.0,21.0,,34.0,68.0,,,1081.0
pôvodný,66.0,105.0,172.0,54.0,15.0,4.0,8.0,61.0,,,485.0
vo výstavbe,33.0,139.0,84.0,37.0,10.0,1.0,,1.0,6.0,6.0,317.0


## number of documents per location and category

### counts

In [160]:
q10 = list(adcollection.aggregate([
{'$group':{'_id':{'kategoria':'$properties.Kategória', 'lokalita':'$properties.Lokalita'}, 'documents_count':{'$sum':1}}}]))

df10 = pd.json_normalize(q10)\
    .rename(columns = {'_id.kategoria':'category', '_id.lokalita':'location'})
df10 = df10.pivot(index='location', columns='category', values='documents_count')
df10['location_count'] = df10.sum(axis = 1)
df10 = df10.sort_values('location_count', ascending = False)
df10.head(30)

category,1 izbový byt,2 izbový byt,3 izbový byt,4 izbový byt,5 izbový byt a viac,Apartmán,Dvojgarsónka,Garsónka,Iný byt,Mezonet,location_count
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Bratislava I, Staré Mesto",88.0,421.0,417.0,166.0,53.0,7.0,,35.0,3.0,10.0,1200.0
"Bratislava II, Ružinov",143.0,504.0,313.0,106.0,1.0,1.0,8.0,66.0,1.0,5.0,1148.0
"Bratislava V, Petržalka",102.0,265.0,474.0,114.0,15.0,3.0,49.0,72.0,3.0,5.0,1102.0
"Bratislava III, Nové Mesto",44.0,253.0,172.0,90.0,27.0,5.0,,22.0,6.0,2.0,621.0
"Bratislava IV, Dúbravka",28.0,92.0,94.0,79.0,8.0,,2.0,25.0,,2.0,330.0
"Bratislava IV, Karlova Ves",53.0,71.0,127.0,56.0,21.0,,,,,,328.0
"Bratislava II, Vrakuňa",10.0,71.0,58.0,96.0,6.0,,22.0,4.0,,,267.0
"Bratislava III, Rača",41.0,122.0,38.0,30.0,6.0,,,5.0,,,242.0
"Bratislava II, Podunajské Biskupice",21.0,62.0,89.0,21.0,3.0,,,1.0,,,197.0
"Bratislava IV, Devínska Nová Ves",25.0,51.0,49.0,11.0,,,,8.0,,,144.0


### percent

In [161]:
df11 = df10.iloc[:,[i for i in range(0, 10)]].apply(lambda x: x/df10['location_count'])
df11.head(30)

category,1 izbový byt,2 izbový byt,3 izbový byt,4 izbový byt,5 izbový byt a viac,Apartmán,Dvojgarsónka,Garsónka,Iný byt,Mezonet
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"Bratislava I, Staré Mesto",0.073333,0.350833,0.3475,0.138333,0.044167,0.005833,,0.029167,0.0025,0.008333
"Bratislava II, Ružinov",0.124564,0.439024,0.272648,0.092334,0.000871,0.000871,0.006969,0.057491,0.000871,0.004355
"Bratislava V, Petržalka",0.092559,0.240472,0.430127,0.103448,0.013612,0.002722,0.044465,0.065336,0.002722,0.004537
"Bratislava III, Nové Mesto",0.070853,0.407407,0.276973,0.144928,0.043478,0.008052,,0.035427,0.009662,0.003221
"Bratislava IV, Dúbravka",0.084848,0.278788,0.284848,0.239394,0.024242,,0.006061,0.075758,,0.006061
"Bratislava IV, Karlova Ves",0.161585,0.216463,0.387195,0.170732,0.064024,,,,,
"Bratislava II, Vrakuňa",0.037453,0.265918,0.217228,0.359551,0.022472,,0.082397,0.014981,,
"Bratislava III, Rača",0.169421,0.504132,0.157025,0.123967,0.024793,,,0.020661,,
"Bratislava II, Podunajské Biskupice",0.106599,0.314721,0.451777,0.106599,0.015228,,,0.005076,,
"Bratislava IV, Devínska Nová Ves",0.173611,0.354167,0.340278,0.076389,,,,0.055556,,


### mean percent of category in top few locations

In [162]:
locs = ['Bratislava I, Staré Mesto',
'Bratislava II, Ružinov',
'Bratislava V, Petržalka',
'Bratislava III, Nové Mesto',
'Bratislava IV, Dúbravka',
'Bratislava IV, Karlova Ves',
'Bratislava II, Vrakuňa',
'Bratislava III, Rača',
'Bratislava II, Podunajské Biskupice',
'Bratislava IV, Devínska Nová Ves',
'Bratislava V, Jarovce',
'Bratislava IV, Lamač']

df11.loc[locs,:].mean()


category
1 izbový byt           0.117012
2 izbový byt           0.370737
3 izbový byt           0.283599
4 izbový byt           0.172217
5 izbový byt a viac    0.033353
Apartmán               0.004370
Dvojgarsónka           0.034973
Garsónka               0.041714
Iný byt                0.003939
Mezonet                0.007106
dtype: float64

# AVG PRICE

In [163]:
q12 = list(adcollection.aggregate([
	{'$group':{'_id':'global',
			'price_avg':{'$avg':'$properties.Cena'}
		}
  
  }
	
]))[0]['price_avg']
	
print(f"Average price of whole set: {q12}")


Average price of whole set: 205411.40902280674


## avg price per category

In [164]:
q13 = list(adcollection.aggregate([
	{'$group':{'_id':{'category':'$properties.Kategória'},
			'price_avg':{'$avg':'$properties.Cena'}}
		
  
  }
	
]))
	
df13 = pd.json_normalize(q13)\
    .rename(columns = {'_id.category':'category'})\
    .sort_values('price_avg', ascending = False)
df13 = df13[['category', 'price_avg']]
df13

Unnamed: 0,category,price_avg
8,5 izbový byt a viac,485004.466216
2,Mezonet,389037.26087
5,4 izbový byt,294894.968009
1,3 izbový byt,225116.520308
0,2 izbový byt,172282.578717
4,Apartmán,163682.647059
9,Iný byt,160472.153846
6,1 izbový byt,122310.87725
7,Dvojgarsónka,101431.785714
3,Garsónka,89986.079545


## avg price per location

In [165]:
q14 = list(adcollection.aggregate([
	{'$group':{'_id':{'location':'$properties.Lokalita'},
			'price_avg':{'$avg':'$properties.Cena'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df14 = pd.json_normalize(q14)\
    .rename(columns = {'_id.location':'location'})\
    .sort_values('price_avg', ascending = False)
df14 = df14[['location', 'price_avg', 'document_count']]
df14.head(30)

Unnamed: 0,location,price_avg,document_count
61,"Bratislava IV, Karlova Ves, časť Rovnice",507666.666667,3
54,"Bratislava III, Nové Mesto, časť Koliba",375525.806452,31
38,"Bratislava I, Staré Mesto",295909.654867,1200
50,"Bratislava IV, Karlova Ves, časť Dlhé diely",290714.923077,27
35,"Bratislava III, Nové Mesto, časť Vinohrady",288356.3,10
58,"Bratislava IV, Devín",280000.0,2
22,"Bratislava V, Jarovce",279209.677419,62
43,"Bratislava III, Nové Mesto",236777.189097,621
60,"Bratislava III, Rača, časť Rača",225437.727273,11
8,"Bratislava IV, Záhorská Bystrica",217536.933333,30


### in top few locations

In [166]:
df15 = df14.loc[df14['location'].isin(locs),:]
df15

Unnamed: 0,location,price_avg,document_count
38,"Bratislava I, Staré Mesto",295909.654867,1200
22,"Bratislava V, Jarovce",279209.677419,62
43,"Bratislava III, Nové Mesto",236777.189097,621
34,"Bratislava IV, Karlova Ves",206689.221184,328
55,"Bratislava IV, Dúbravka",197053.557927,330
47,"Bratislava II, Ružinov",183471.490179,1148
0,"Bratislava V, Petržalka",167333.277008,1102
52,"Bratislava III, Rača",160581.970213,242
5,"Bratislava II, Podunajské Biskupice",152294.645503,197
28,"Bratislava IV, Lamač",148439.0,52


## avg price per location and category

In [167]:
q16 = list(adcollection.aggregate([
	{'$group':{'_id':{'location':'$properties.Lokalita', 'category':'$properties.Kategória'},
			'price_avg':{'$avg':'$properties.Cena'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df16 = pd.json_normalize(q16)\
    .rename(columns = {'_id.location':'location', '_id.category':'category'})\
    .sort_values('price_avg', ascending = False)
df16 = df16[['location', 'category', 'price_avg', 'document_count']]
df16.head(30)

Unnamed: 0,location,category,price_avg,document_count
153,"Bratislava IV, Karlova Ves, časť Dlhé diely",5 izbový byt a viac,636000.0,6
133,"Bratislava III, Nové Mesto",5 izbový byt a viac,630303.478261,27
69,"Bratislava III, Nové Mesto, časť Koliba",4 izbový byt,629130.0,10
137,"Bratislava IV, Dúbravka",5 izbový byt a viac,606027.0,8
96,"Bratislava I, Staré Mesto",Mezonet,559268.375,10
107,"Bratislava V, Petržalka",5 izbový byt a viac,521985.571429,15
183,"Bratislava IV, Karlova Ves",5 izbový byt a viac,519900.0,21
31,"Bratislava IV, Karlova Ves, časť Rovnice",4 izbový byt,507666.666667,3
203,"Bratislava I, Staré Mesto",5 izbový byt a viac,489550.536585,53
177,"Bratislava I, Staré Mesto",4 izbový byt,416415.299363,166


In [168]:
df17 = df16.loc[df16['location'].isin(locs),:].pivot(index='location', columns='category', values='price_avg')
df17

category,1 izbový byt,2 izbový byt,3 izbový byt,4 izbový byt,5 izbový byt a viac,Apartmán,Dvojgarsónka,Garsónka,Iný byt,Mezonet
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"Bratislava I, Staré Mesto",163157.068182,234694.744949,330524.416667,416415.299363,489550.536585,212040.0,,118002.911765,258000.0,559268.375
"Bratislava II, Podunajské Biskupice",91054.210526,126361.967213,178046.880952,168288.095238,258533.333333,,,79990.0,,
"Bratislava II, Ružinov",122821.142857,168031.861224,218539.53268,294058.695238,270000.0,117000.0,81340.0,89691.469697,118900.0,275200.0
"Bratislava II, Vrakuňa",92798.0,99754.521127,165970.827586,194372.166667,226666.333333,,51280.0,63870.0,,
"Bratislava III, Nové Mesto",123542.690476,174086.232932,270060.04321,373096.184211,630303.478261,110800.0,,78201.818182,144248.0,355000.0
"Bratislava III, Rača",111581.4,150161.271186,209716.111111,203363.333333,249832.833333,,,80960.0,,
"Bratislava IV, Devínska Nová Ves",102974.583333,134792.0,150399.555556,239590.909091,,,,71275.0,,
"Bratislava IV, Dúbravka",107867.035714,143553.423913,199979.860215,281253.679487,606027.0,,116900.0,90017.8,,269000.0
"Bratislava IV, Karlova Ves",118501.730769,170153.768116,188814.919355,261312.321429,519900.0,,,,,
"Bratislava IV, Lamač",114266.4,151223.333333,188644.444444,,,,,113888.0,,


## avg price per state

In [169]:
q18 = list(adcollection.aggregate([
	{'$group':{'_id':{'state':'$properties.Stav'},
			'price_avg':{'$avg':'$properties.Cena'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df18 = pd.json_normalize(q18)\
    .rename(columns = {'_id.state':'state'})\
    .sort_values('price_avg', ascending = False)
df18 = df18[['state', 'price_avg', 'document_count']]
df18

Unnamed: 0,state,price_avg,document_count
0,,243411.327332,1916
3,vo výstavbe,221839.599349,317
4,kompletná rekonštrukcia,200969.203098,2404
2,pôvodný,169243.840671,485
1,čiastočná rekonštrukcia,161207.198874,1081


## avg price per state and category

In [170]:
q19 = list(adcollection.aggregate([
	{'$group':{'_id':{'state':'$properties.Stav', 'category':'$properties.Kategória'},
			'price_avg':{'$avg':'$properties.Cena'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df19 = pd.json_normalize(q19)\
    .rename(columns = {'_id.state':'state', '_id.category':'category'})\
    .sort_values(['state', 'price_avg'], ascending = False)
df19 = df19[['state', 'category', 'price_avg', 'document_count']]
df19 = df19.pivot(index = 'state', columns = 'category', values = 'price_avg')
df19

category,1 izbový byt,2 izbový byt,3 izbový byt,4 izbový byt,5 izbový byt a viac,Apartmán,Dvojgarsónka,Garsónka,Iný byt,Mezonet
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,123168.679654,181989.755274,277830.10989,367272.501718,511396.172414,188538.0,132833.333333,88640.882353,258000.0,559268.375
kompletná rekonštrukcia,131011.442529,175565.905549,225842.752086,274886.979452,478312.5,114950.0,124928.205128,97816.647059,111662.5,291555.555556
pôvodný,111840.0,148977.254902,195240.758824,264104.867925,377357.0,106000.0,49306.25,81037.616667,,
vo výstavbe,127517.878788,182764.175573,231410.134146,330171.351351,563321.6,243325.0,,170850.0,144248.0,308285.0
čiastočná rekonštrukcia,111163.17757,139777.985019,174913.0401,207802.099415,415237.142857,,83974.117647,85554.179104,,


## avg price per state and location

In [171]:
q20 = list(adcollection.aggregate([
	{'$group':{'_id':{'location':'$properties.Lokalita', 'state':'$properties.Stav'},
			'price_avg':{'$avg':'$properties.Cena'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df20 = pd.json_normalize(q20)\
    .rename(columns = {'_id.location':'location', '_id.state':'state'})\
    .sort_values('price_avg', ascending = False)
df20 = df20[['location', 'state', 'price_avg', 'document_count']]
df20 = df20.loc[df20['location'].isin(locs),:].pivot(index = 'location', columns = 'state', values = 'price_avg')
df20

state,NaN,kompletná rekonštrukcia,pôvodný,vo výstavbe,čiastočná rekonštrukcia
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Bratislava I, Staré Mesto",349779.927798,287589.050162,243326.050505,305419.692308,252078.6
"Bratislava II, Podunajské Biskupice",163745.285714,153636.594595,105470.0,159000.0,134020.967742
"Bratislava II, Ružinov",204870.817204,179133.601351,132358.980769,189686.196721,151664.792308
"Bratislava II, Vrakuňa",193305.341176,139251.238095,102318.457143,227050.0,107902.662162
"Bratislava III, Nové Mesto",300491.239234,202411.621053,206935.084746,177886.396825,216838.181818
"Bratislava III, Rača",185904.195122,142423.82716,155568.157895,,148373.5
"Bratislava IV, Devínska Nová Ves",139347.058824,158441.034483,128080.0,155075.384615,125848.93617
"Bratislava IV, Dúbravka",237524.097087,149110.04717,132501.153846,422054.615385,148424.925373
"Bratislava IV, Karlova Ves",296542.692308,186042.26087,199250.0,234883.333333,159497.44898
"Bratislava IV, Lamač",164308.0,137506.4,109000.0,,125333.333333


## avg price per state, location and category

In [172]:
q21 = list(adcollection.aggregate([
	{'$group':{'_id':{'location':'$properties.Lokalita', 'state':'$properties.Stav', 'category':'$properties.Kategória'},
			'price_avg':{'$avg':'$properties.Cena'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df21 = pd.json_normalize(q21)\
    .rename(columns = {'_id.location':'location', '_id.state':'state', '_id.category':'category'})\
    .sort_values('price_avg', ascending = False)
df21 = df21.loc[df21['location'].isin(locs),:].pivot_table(index = ['location', 'category'], columns = 'state', values = ['price_avg', 'document_count'])
df21.columns = df21.columns.swaplevel(0, 1)
df21.sort_index(axis=1, level=0, inplace=True)
df21

Unnamed: 0_level_0,state,kompletná rekonštrukcia,kompletná rekonštrukcia,pôvodný,pôvodný,vo výstavbe,vo výstavbe,čiastočná rekonštrukcia,čiastočná rekonštrukcia
Unnamed: 0_level_1,Unnamed: 1_level_1,document_count,price_avg,document_count,price_avg,document_count,price_avg,document_count,price_avg
location,category,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
"Bratislava I, Staré Mesto",1 izbový byt,45.0,179111.022222,13.0,140080.0,,,21.0,136651.761905
"Bratislava I, Staré Mesto",2 izbový byt,236.0,236022.640351,23.0,201394.782609,18.0,251137.5,20.0,180694.058824
"Bratislava I, Staré Mesto",3 izbový byt,239.0,322860.207965,53.0,270367.150943,3.0,290000.0,46.0,256723.111111
"Bratislava I, Staré Mesto",4 izbový byt,77.0,420835.118421,6.0,343540.0,7.0,436101.714286,25.0,386210.0
"Bratislava I, Staré Mesto",5 izbový byt a viac,19.0,551861.538462,5.0,374750.0,,,5.0,347600.0
"Bratislava I, Staré Mesto",Apartmán,1.0,112900.0,,,,,,
"Bratislava I, Staré Mesto",Garsónka,29.0,124824.103448,2.0,90000.0,,,,
"Bratislava II, Podunajské Biskupice",1 izbový byt,3.0,96000.0,2.0,80000.0,,,1.0,84990.0
"Bratislava II, Podunajské Biskupice",2 izbový byt,35.0,118309.411765,4.0,103490.0,,,7.0,105114.285714
"Bratislava II, Podunajské Biskupice",3 izbový byt,29.0,183830.965517,3.0,134900.0,3.0,150000.0,14.0,138421.428571


# AVG M2 PRICE

In [173]:
q12 = list(adcollection.aggregate([
	{'$group':{'_id':'global',
			'price_m2_avg':{'$avg':'$properties.Cena za meter'}
		}
  
  }
	
]))[0]['price_m2_avg']
	
print(f"Average price of whole set: {q12}")


Average price of whole set: 3644.52144772118


## avg m2 price per category

In [174]:
q13 = list(adcollection.aggregate([
	{'$group':{'_id':{'category':'$properties.Kategória'},
			'price_m2_avg':{'$avg':'$properties.Cena za meter'}}
		
  
  }
	
]))
	
df13 = pd.json_normalize(q13)\
    .rename(columns = {'_id.category':'category'})\
    .sort_values('price_m2_avg', ascending = False)
df13 = df13[['category', 'price_m2_avg']]
df13

Unnamed: 0,category,price_m2_avg
4,1 izbový byt,8604.530378
0,Garsónka,3568.498069
2,2 izbový byt,3268.77566
8,Mezonet,3252.521739
7,3 izbový byt,3044.910355
3,Apartmán,3032.941176
1,Iný byt,2929.538462
9,5 izbový byt a viac,2699.593103
5,4 izbový byt,2678.190132
6,Dvojgarsónka,2458.916667


## avg m2 price per location

In [175]:
q14 = list(adcollection.aggregate([
	{'$group':{'_id':{'location':'$properties.Lokalita'},
			'price_m2_avg':{'$avg':'$properties.Cena za meter'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df14 = pd.json_normalize(q14)\
    .rename(columns = {'_id.location':'location'})\
    .sort_values('price_m2_avg', ascending = False)
df14 = df14[['location', 'price_m2_avg', 'document_count']]
df14.head(30)

Unnamed: 0,location,price_m2_avg,document_count
33,"Bratislava II, Ružinov",6026.669065,1148
40,"Bratislava I, Staré Mesto",4488.97413,1200
62,"Bratislava III, Nové Mesto, časť Pasienky / Ku...",3798.941176,17
46,"Bratislava III, Nové Mesto, časť Vinohrady",3416.4,10
34,"Bratislava III, Nové Mesto",3096.758092,621
30,"Bratislava II, Ružinov, časť Nivy",3084.115385,52
22,"Bratislava II, Ružinov, časť Štrkovec",3079.833333,19
15,"Bratislava V, Rusovce",3055.9,10
61,"Bratislava II, Ružinov, časť Trnávka",2952.8125,16
18,"Bratislava II, Ružinov, časť Ostredky",2935.857143,10


### in top few locations

In [176]:
df15 = df14.loc[df14['location'].isin(locs),:]
df15

Unnamed: 0,location,price_m2_avg,document_count
33,"Bratislava II, Ružinov",6026.669065,1148
40,"Bratislava I, Staré Mesto",4488.97413,1200
34,"Bratislava III, Nové Mesto",3096.758092,621
51,"Bratislava IV, Dúbravka",2889.341615,330
54,"Bratislava III, Rača",2807.228448,242
20,"Bratislava IV, Lamač",2761.480769,52
45,"Bratislava IV, Karlova Ves",2753.413249,328
59,"Bratislava V, Petržalka",2692.093866,1102
16,"Bratislava IV, Devínska Nová Ves",2339.094891,144
26,"Bratislava V, Jarovce",2320.983871,62


## avg m2 price per location and category

In [177]:
q16 = list(adcollection.aggregate([
	{'$group':{'_id':{'location':'$properties.Lokalita', 'category':'$properties.Kategória'},
			'price_m2_avg':{'$avg':'$properties.Cena za meter'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df16 = pd.json_normalize(q16)\
    .rename(columns = {'_id.location':'location', '_id.category':'category'})\
    .sort_values('price_m2_avg', ascending = False)
df16 = df16[['location', 'category', 'price_m2_avg', 'document_count']]
df16.head(30)

Unnamed: 0,location,category,price_m2_avg,document_count
1,"Bratislava II, Ružinov",1 izbový byt,27079.514493,143
193,"Bratislava III, Nové Mesto, časť Pasienky / Ku...",Garsónka,5550.0,5
158,"Bratislava I, Staré Mesto",2 izbový byt,4826.539241,421
43,"Bratislava I, Staré Mesto",3 izbový byt,4806.64899,417
205,"Bratislava I, Staré Mesto",Garsónka,4641.09375,35
197,"Bratislava II, Ružinov, časť Nivy",Garsónka,4625.0,2
128,"Bratislava III, Nové Mesto",Garsónka,4391.954545,22
179,"Bratislava IV, Dúbravka",5 izbový byt a viac,4269.5,8
30,"Bratislava II, Ružinov",Apartmán,4179.0,1
83,"Bratislava I, Staré Mesto",1 izbový byt,4070.284091,88


In [178]:
df17 = df16.loc[df16['location'].isin(locs),:].pivot(index='location', columns='category', values='price_m2_avg')
df17

category,1 izbový byt,2 izbový byt,3 izbový byt,4 izbový byt,5 izbový byt a viac,Apartmán,Dvojgarsónka,Garsónka,Iný byt,Mezonet
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"Bratislava I, Staré Mesto",4070.284091,4826.539241,4806.64899,3494.536424,3081.146341,3447.428571,,4641.09375,2345.0,3794.375
"Bratislava II, Podunajské Biskupice",2591.421053,2422.770492,2115.380952,1818.0,1351.0,,,3200.0,,
"Bratislava II, Ružinov",27079.514493,3068.99177,2911.728758,2945.980583,2288.0,4179.0,2486.6,3683.560606,2642.0,2814.2
"Bratislava II, Vrakuňa",2482.3,2031.84507,2049.431034,2081.252632,1915.666667,,1358.727273,1701.5,,
"Bratislava III, Nové Mesto",3195.595238,3098.927711,3060.660494,2916.881579,2628.826087,2027.6,,4391.954545,3741.5,2383.0
"Bratislava III, Rača",3065.425,2958.90678,2541.611111,2199.8,2397.0,,,2965.2,,
"Bratislava IV, Devínska Nová Ves",2514.166667,2515.64,2088.288889,2149.909091,,,,2387.428571,,
"Bratislava IV, Dúbravka",2885.785714,2938.770115,2698.387097,2831.346154,4269.5,,3340.0,3189.25,,2360.0
"Bratislava IV, Karlova Ves",3085.596154,3121.231884,2563.629032,2406.711538,2698.85,,,,,
"Bratislava IV, Lamač",2696.2,2898.633333,2371.333333,,,,,2778.0,,


## avg m2 price per state

In [179]:
q18 = list(adcollection.aggregate([
	{'$group':{'_id':{'state':'$properties.Stav'},
			'price_m2_avg':{'$avg':'$properties.Cena za meter'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df18 = pd.json_normalize(q18)\
    .rename(columns = {'_id.state':'state'})\
    .sort_values('price_m2_avg', ascending = False)
df18 = df18[['state', 'price_m2_avg', 'document_count']]
df18

Unnamed: 0,state,price_m2_avg,document_count
2,čiastočná rekonštrukcia,5621.985849,1081
0,,3485.674546,1916
4,vo výstavbe,3390.039867,317
3,kompletná rekonštrukcia,3083.844723,2404
1,pôvodný,2732.155462,485


## avg m2 price per state and category

In [180]:
q19 = list(adcollection.aggregate([
	{'$group':{'_id':{'state':'$properties.Stav', 'category':'$properties.Kategória'},
			'price_m2_avg':{'$avg':'$properties.Cena za meter'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df19 = pd.json_normalize(q19)\
    .rename(columns = {'_id.state':'state', '_id.category':'category'})\
    .sort_values(['state', 'price_m2_avg'], ascending = False)
df19 = df19[['state', 'category', 'price_m2_avg', 'document_count']]
df19 = df19.pivot(index = 'state', columns = 'category', values = 'price_m2_avg')
df19

category,1 izbový byt,2 izbový byt,3 izbový byt,4 izbový byt,5 izbový byt a viac,Apartmán,Dvojgarsónka,Garsónka,Iný byt,Mezonet
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,3123.53913,3543.067893,4103.573626,2922.194444,2637.440476,3315.4,3362.333333,3425.322581,2345.0,3794.375
kompletná rekonštrukcia,3543.907514,3261.699881,2868.325359,2708.190311,2959.0,4180.0,3021.487179,3770.831683,2150.0,2617.444444
pôvodný,3075.333333,2630.316832,2529.605882,2439.849057,2148.5,1828.0,1267.875,3749.883333,,
vo výstavbe,3661.30303,3536.534351,3095.353659,3042.645161,3902.2,2734.0,,3051.0,3741.5,3482.666667
čiastočná rekonštrukcia,33503.308411,2668.200758,2415.258794,2221.664706,2545.285714,,2014.147059,3169.060606,,


## avg m2 price per state and location

In [181]:
q20 = list(adcollection.aggregate([
	{'$group':{'_id':{'location':'$properties.Lokalita', 'state':'$properties.Stav'},
			'price_m2_avg':{'$avg':'$properties.Cena za meter'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df20 = pd.json_normalize(q20)\
    .rename(columns = {'_id.location':'location', '_id.state':'state'})\
    .sort_values('price_m2_avg', ascending = False)
df20 = df20[['location', 'state', 'price_m2_avg', 'document_count']]
df20 = df20.loc[df20['location'].isin(locs),:].pivot(index = 'location', columns = 'state', values = 'price_m2_avg')
df20

state,NaN,kompletná rekonštrukcia,pôvodný,vo výstavbe,čiastočná rekonštrukcia
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Bratislava I, Staré Mesto",6919.876364,3888.585761,3089.454545,3983.3,3123.908257
"Bratislava II, Podunajské Biskupice",2397.142857,2286.405405,2060.625,1880.5,1786.677419
"Bratislava II, Ružinov",3211.9375,2993.0839,3149.666667,3480.401639,27803.623077
"Bratislava II, Vrakuňa",2288.270588,2109.080645,1585.885714,2227.8,1767.72973
"Bratislava III, Nové Mesto",3011.799043,3092.921053,3309.898305,3438.269841,2860.318182
"Bratislava III, Rača",2896.227848,2979.222222,2260.315789,,2801.941176
"Bratislava IV, Devínska Nová Ves",2591.30303,2270.0,2186.6,2551.923077,2194.446809
"Bratislava IV, Dúbravka",3173.15534,2703.362745,2297.115385,4177.269231,2453.169231
"Bratislava IV, Karlova Ves",2996.173333,2757.417391,2738.541667,3526.166667,2516.845361
"Bratislava IV, Lamač",2879.64,2670.05,2535.0,,2611.666667


## avg m2 price per state, location and category

In [182]:
q21 = list(adcollection.aggregate([
	{'$group':{'_id':{'location':'$properties.Lokalita', 'state':'$properties.Stav', 'category':'$properties.Kategória'},
			'price_m2_avg':{'$avg':'$properties.Cena za meter'},
              'document_count':{'$sum':1}}
		
  
  }
	
]))
	
df21 = pd.json_normalize(q21)\
    .rename(columns = {'_id.location':'location', '_id.state':'state', '_id.category':'category'})\
    .sort_values('price_m2_avg', ascending = False)
df21 = df21.loc[df21['location'].isin(locs),:].pivot_table(index = ['location', 'category'], columns = 'state', values = ['price_m2_avg', 'document_count'])
df21.columns = df21.columns.swaplevel(0, 1)
df21.sort_index(axis=1, level=0, inplace=True)
df21

Unnamed: 0_level_0,state,kompletná rekonštrukcia,kompletná rekonštrukcia,pôvodný,pôvodný,vo výstavbe,vo výstavbe,čiastočná rekonštrukcia,čiastočná rekonštrukcia
Unnamed: 0_level_1,Unnamed: 1_level_1,document_count,price_m2_avg,document_count,price_m2_avg,document_count,price_m2_avg,document_count,price_m2_avg
location,category,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
"Bratislava I, Staré Mesto",1 izbový byt,45.0,4563.222222,13.0,3319.538462,,,21.0,3888.428571
"Bratislava I, Staré Mesto",2 izbový byt,236.0,4174.846491,23.0,3145.826087,18.0,4320.9375,20.0,3013.5625
"Bratislava I, Staré Mesto",3 izbový byt,239.0,3575.039823,53.0,3081.754717,3.0,2339.0,46.0,3105.044444
"Bratislava I, Staré Mesto",4 izbový byt,77.0,3399.736842,6.0,3013.2,7.0,3514.0,25.0,2653.636364
"Bratislava I, Staré Mesto",5 izbový byt a viac,19.0,3258.076923,5.0,2121.75,,,5.0,2505.0
"Bratislava I, Staré Mesto",Apartmán,1.0,4181.0,,,,,,
"Bratislava I, Staré Mesto",Garsónka,29.0,4588.310345,2.0,3462.0,,,,
"Bratislava II, Podunajské Biskupice",1 izbový byt,3.0,2743.0,2.0,2222.0,,,1.0,2361.0
"Bratislava II, Podunajské Biskupice",2 izbový byt,35.0,2389.647059,4.0,2156.25,,,7.0,2048.714286
"Bratislava II, Podunajské Biskupice",3 izbový byt,29.0,2274.172414,3.0,1708.0,3.0,1852.0,14.0,1805.214286


# TODO

- avg price section ongoing filter out duplicate ads + date filter
- every section add timeseries per week, data and chart
- another section DISTRIBUTION with charts, percentiles overall and as timeseries