In [1]:
%matplotlib inline
import dataset
import pandas as pd

In [68]:
# Get data
db_url = "sqlite:///data/tripadvisor.db"
db = dataset.connect(db_url)
attractions = db["attractions"]

df = pd.DataFrame([row for row in attractions.all()])

In [69]:
# Add county where missing
gotland_places = [u'Dalhem', u'Tofta', u'Visby', u'Fårö', u'Gotland', u'Fårösund', u'Klintehamn', u'Burgsvik', u'Rydal', u'Roma',
       u'Tingst\xe4de', u'Västergarn']
kalmar_places = [u'Öland', u'Byxelkrok', u'Borgholm', u'Färjestaden', u'Löttorp', u'Mörbylånga']
vasternorrland_places = [u'Örnsköldsvik',u'Ulvöhamn']

df.loc[df.city.isin(gotland_places), 'county'] = 'Gotland'
df.loc[df.city.isin(kalmar_places), 'county'] = u'Kalmar län'
df.loc[df.city.isin(vasternorrland_places), 'county'] = u'Västernorrlands län'


In [70]:
# Cleaning

# Remove arrow from county name
df['county'] = df.loc[:, 'county'].str.replace(u"›","")

In [58]:
def get_score(x):
    score = 0
    for points, perc in enumerate(x):
        score += (points + 1) * perc
    return score

In [71]:
grade_cols = [u'Dåligt', u'Hemskt', u'Medelmåttigt', u'Mycket bra', u'Utmärkt']
grade_cols_perc = ["%s (%%)" % x for x in grade_cols]

# Count percentages
_df = df[grade_cols].apply(lambda x: x / x.sum(), axis=1)
_df.columns = grade_cols_perc
df = df.join(_df)

# Count score
df[u"Poäng"] = df[grade_cols_perc].apply(get_score, axis=1)

# Add column with total number of reviews
df["Antal recensioner"] = df[grade_cols].sum(axis=1)

# Remove attractions with few reviews
LIMIT = 10
df_filt = df[df["Antal recensioner"] > LIMIT]

print "Removed attractions with fewer than %s reviews. From %s to %s rows." % (LIMIT, len(df), len(df_filt))

Removed attractions with fewer than 10 reviews. From 2033 to 470 rows.


### Vilket län har hetast attraktioner?

När vi räknar ut ett medelpoäng för alla attraktioner som fått minst tio recensioner ser topplistan ut så här.

Unnamed: 0_level_0,Dåligt (%),Hemskt (%),Medelmåttigt (%),Mycket bra (%),Utmärkt (%),Antal attraktioner,Poäng
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bohuslän,0.0,0.0,0.0,0.2,0.8,1,4.8
Västerbottens län,0.002119,0.0,0.094807,0.386952,0.516122,8,4.414959
Jönköpings län,0.032341,0.0,0.069434,0.33815,0.560075,13,4.393617
Värmlands län,0.002439,0.016333,0.120607,0.335422,0.525199,10,4.364608
Västmanlands län,0.014457,0.010973,0.103806,0.373899,0.496865,11,4.32774
Östergötlands län,0.014263,0.006187,0.100621,0.397787,0.481142,26,4.325358
Gotland,0.011771,0.012784,0.10397,0.38526,0.486215,12,4.321364
Gävleborgs län,0.02044,0.022222,0.082525,0.3679,0.506912,12,4.318623
Norrbottens län,0.023713,0.021157,0.097905,0.35528,0.501945,17,4.290588
Blekinge län,0.013051,0.007407,0.077916,0.50443,0.397195,9,4.26531


Vi kan strunta i Bohuslän som bara har en attraktion med här. Då är **Västerbotten**, **Jönköping** och **Värmland** de län vars besöksmål får högst betyg på Tripadvisor.

Vilka attraktioner är då det?

In [78]:
columns_to_include = ["city","name", u"Poäng", "Antal recensioner"]
df_filt[df_filt.county == u"Västerbottens län"][columns_to_include].sort_values(u"Poäng", ascending=False).head()

Unnamed: 0,city,name,Poäng,Antal recensioner
1734,Vindeln,Aurora Borealis Adventures,5.0,13
958,Umeå,Guitars the Museum,4.576271,59
960,Umeå,Bildmuseet,4.516129,31
31,Markaryd,Smålandet Markaryds Älgsafari,4.5,16
961,Umeå,Umedalen Skulptur,4.428571,21


In [79]:
df_filt[df_filt.county == u"Jönköpings län"][columns_to_include].sort_values(u"Poäng", ascending=False).head()

Unnamed: 0,city,name,Poäng,Antal recensioner
877,Jönköping,Tur & Natur - Nystedt Husky,4.954545,22
2025,Ramkvilla,Ramoa Adventure Village,4.92,50
1064,Gränna,Polkapojkarna,4.911765,34
879,Jönköping,Habo Kyrka,4.826087,23
1470,Eksjö,Naturreservatet Skurugata & Skuruhatt,4.583333,12


In [80]:
df_filt[df_filt.county == u"Värmlands län"][columns_to_include].sort_values(u"Poäng", ascending=False).head()

Unnamed: 0,city,name,Poäng,Antal recensioner
850,Karlsborg,Tivedens National park,4.846154,13
1530,Ekshärad,Värmlands Moose Park,4.833333,18
780,Karlstad,Sandgrund Lars Lerin,4.54878,82
851,Karlsborg,Karlsborgs Fästning,4.47619,21
779,Karlstad,Mariebergsskogen,4.454545,66


## Vilken stad har hetast attraktioner?

Vi räknar medelbetyget på attrakioner i varje stad och inkluderar bara städer som har minst fem attraktioner

**Visby**, **Lund** och **Borås** i topp!

In [84]:
# Get mean reviews
avg_reviews = df_filt.groupby("city").mean()[grade_cols_perc]

# Get scores
avg_reviews[u"Poäng"] = avg_reviews[grade_cols_perc].apply(get_score, axis=1)

# Get number of attraktions
avg_reviews["Antal attraktioner"] = df_filt.groupby("city").count()["id"]

# Filter cities with few attractions
avg_reviews[avg_reviews["Antal attraktioner"] > 5].sort_values(u"Poäng", ascending=False)

Unnamed: 0_level_0,Dåligt (%),Hemskt (%),Medelmåttigt (%),Mycket bra (%),Utmärkt (%),Poäng,Antal attraktioner
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Visby,0.006293,0.0,0.100162,0.379658,0.513887,4.394846,8
Lund,0.03038,0.0,0.071328,0.414394,0.483899,4.321431,6
Borås,0.044901,0.0,0.082538,0.339397,0.533164,4.315923,7
Jönköping,0.051282,0.0,0.079362,0.346107,0.523249,4.290041,6
Norrköping,0.017292,0.0,0.122884,0.403515,0.456309,4.28155,9
Sundsvall,0.033699,0.006944,0.089116,0.418141,0.4521,4.247997,6
Västerås,0.01683,0.015088,0.135055,0.394418,0.438609,4.222888,8
Växjö,0.015152,0.013889,0.162227,0.35323,0.455503,4.220044,6
Gävle,0.03066,0.033333,0.097154,0.414421,0.424431,4.168631,8
Uppsala,0.023148,0.007407,0.154042,0.437016,0.378386,4.140085,10


## Vilken typ av attraktioner får högst betyg?

In [125]:
# Aggregate scores by tag
tags = {}
        
for row in df[["tags", u"Poäng"]].iterrows():
    score = row[1][1]
    for tag in row[1][0].split("|"):
        if tag not in tags:
            tags[tag] = []
        # Append score
        tags[tag].append(score)


In [137]:
# Create a dataframe with mean score by tag
by_tag = pd.DataFrame({
    "Medelbetyg": [pd.Series(x).mean() for x in tags.values()],
    "Antal aktiviteter": [len(x) for x in tags.values()],
}, index=tags.keys())


Vi räknar ut ett medelbetyg för alla aktivitetstyper som förekommer minst fem gånger. **Aktivitet och action** får höga poäng. **Forntida ruiner** är ingen succé. 

In [139]:
by_tag[by_tag["Antal aktiviteter"] > 5].sort_values("Medelbetyg", ascending=False)

Unnamed: 0,Antal aktiviteter,Medelbetyg
Adrenalin- och extremturer,9,4.990909
Cykelturer,12,4.982078
Lektioner och seminarier,6,4.966667
Privata rundturer,8,4.884354
Ekoturer,7,4.874459
Kurser och seminarier,16,4.836797
Terrängfordonsturer,9,4.833974
Zipline och höghöjdsparker,12,4.829630
Skogar,16,4.812500
Vandrings- och campingturer,6,4.791667
