In [11]:
import pandas as pd

import requests
from urllib.request import urlopen 
from urllib.parse import quote  

import time
import json

import datetime as dt

from concurrent.futures import ThreadPoolExecutor

import nltk

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe

import ast

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
plotly.offline.init_notebook_mode(connected=True)

import sddk

In [None]:
### to use plotly within browser notebook

#def configure_plotly_browser_state():
    import IPython
    display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))

In [3]:
# spatial packages
import googlemaps
import shapely # geometries, e.g. "POINT (23931.23 45492.17)"
import geopandas # pandas DataFrame + geometry
import contextily as ctx # basemaps

In [4]:
conf = sddk.configure()

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [5]:
key = sddk.read_file("Google_API_key.txt", "str", conf)
gmaps = googlemaps.Client(key=key)
### authorize google sheets

In [12]:
testing_vanlife_df = sddk.read_file("instagram_webscraping/posts_raw_1.json", "df", conf)
testing_vanlife_df.head(2)

Unnamed: 0,end_cursor,url,text,hashtags,caption,likes,timestamp,location_slug,country_code,coordinates,g_loc_type,i_loc_type
0,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwOAgILJK/,Lugares de descanso y pernocta que marcan la d...,"[diasentremontañas, anayruben, subetealpaisaje...","[mountain, sky, outdoor, nature.]",39,2020-08-25 10:17:58,,,,,
1,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwMNxJWG6/,Pippa's first holibob...and our first camping ...,"[lakedistrict, t5, camping, campervan, hoilday...","[mountain, sky, outdoor, nature.]",31,2020-08-25 10:17:43,lake-district,GB,"{'lat': 35.225172, 'lng': -89.7312158}","[establishment, point_of_interest, shopping_mall]",


In [13]:
len(testing_vanlife_df)

355

In [15]:
# to use 3-letter version of country codes
# we have to upload some normalized country codes data

country_codes_variants = pd.read_csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv")
country_codes_variants.set_index("alpha-2", inplace=True)
country_codes_variants.head(2)

Unnamed: 0_level_0,name,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
alpha-2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AF,Afghanistan,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
AX,Åland Islands,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,


In [16]:
def column_returner(abbr, column):
    try: return country_codes_variants.at[abbr, column]
    except: return abbr

testing_vanlife_df["country_code_new"] = testing_vanlife_df.apply(lambda row: column_returner(row["country_code"], "alpha-3"), axis=1)
testing_vanlife_df["country"] = testing_vanlife_df.apply(lambda row: column_returner(row["country_code"], "name"), axis=1)
testing_vanlife_df["region"] = testing_vanlife_df.apply(lambda row: column_returner(row["country_code"], "region"), axis=1)
testing_vanlife_df["subregion"] = testing_vanlife_df.apply(lambda row: column_returner(row["country_code"], "sub-region"), axis=1)

testing_vanlife_df.head(2)

Unnamed: 0,end_cursor,url,text,hashtags,caption,likes,timestamp,location_slug,country_code,coordinates,g_loc_type,i_loc_type,country_code_new,country,region,subregion
0,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwOAgILJK/,Lugares de descanso y pernocta que marcan la d...,"[diasentremontañas, anayruben, subetealpaisaje...","[mountain, sky, outdoor, nature.]",39,2020-08-25 10:17:58,,,,,,,,,
1,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwMNxJWG6/,Pippa's first holibob...and our first camping ...,"[lakedistrict, t5, camping, campervan, hoilday...","[mountain, sky, outdoor, nature.]",31,2020-08-25 10:17:43,lake-district,GB,"{'lat': 35.225172, 'lng': -89.7312158}","[establishment, point_of_interest, shopping_mall]",,GBR,United Kingdom of Great Britain and Northern I...,Europe,Northern Europe


In [17]:
def get_most_popular_hashtags(input_dataframe, column, value, number):
    actual_df = testing_vanlife_df[input_dataframe[column]==value]
    hashtags_list = []
    hashtags_list_of_lists = actual_df["hashtags"].tolist()
    for element in hashtags_list_of_lists:
        hashtags_list.extend(ast.literal_eval(element))
    most_frequent = nltk.FreqDist(hashtags_list).most_common(number + 1)
    most_frequent = ["#" + element[0] + " (N=" + str(element[1]) + ")" for element in most_frequent]
    return most_frequent[1:]

In [18]:
testing_vanlife_df["region"].unique()

array([None, 'Europe', 'Americas', '', 'Oceania', 'Asia', 'Africa'],
      dtype=object)

In [19]:
testing_vanlife_df["subregion"].unique()

array([None, 'Northern Europe', 'Northern America', '', 'Western Europe',
       'Australia and New Zealand', 'Southern Europe', 'Eastern Europe',
       'Eastern Asia', 'Southern Asia', 'Western Asia', 'Central Asia',
       'Sub-Saharan Africa', 'South-eastern Asia'], dtype=object)

In [21]:
hashtags_subregion = []
for element in testing_vanlife_df["subregion"].unique():
    hashtags_subregion.append([element] + get_most_popular_hashtags(testing_vanlife_df, "subregion", element, 5))

hashtags_subregion_df = pd.DataFrame(hashtags_subregion)
hashtags_subregion_df.set_index(0, inplace=True)
hashtags_subregion_df

ValueError: malformed node or string: ['lakedistrict', 't5', 'camping', 'campervan', 'hoilday', 'vanlife', 'travel', 'travelgram', 'cockerpoosofinstagram', 'cockapooclub', 'puppyoftheday', 'cockapoo', 'cockerpoolife', 'cockapoocorner', 'instadogs', 'instapuppy']

In [0]:
hashtags_region = []
for element in testing_vanlife_df["region"].unique():
  hashtags_region.append([element] + get_most_popular_hashtags(testing_vanlife_df, "region", element, 5))

hashtags_region_df = pd.DataFrame(hashtags_region)
hashtags_region_df.set_index(0, inplace=True)
hashtags_region_df

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Europe,#campervan (N=4145),#travel (N=3783),#roadtrip (N=3267),#homeiswhereyouparkit (N=2953),#vanlifediaries (N=2581)
Americas,#travel (N=2008),#roadtrip (N=1647),#vanlifediaries (N=1379),#homeiswhereyouparkit (N=1349),#campervan (N=1199)
Asia,# (N=706),#travel (N=370),#camping (N=307),#nature (N=225),#adventure (N=177)
Africa,#travel (N=755),#roadtrip (N=700),#campervan (N=590),#homeiswhereyouparkit (N=485),#nature (N=465)
Oceania,#travel (N=1217),#roadtrip (N=1078),#australia (N=1000),#newzealand (N=807),#adventure (N=494)
XK,#mountains (N=2),#travel (N=2),#travelphotography (N=2),#photography (N=2),#goodbyedeutschland (N=1)


In [0]:
testing_vanlife_df.count()

end_cursor         41208
post_url           41208
text               40965
hashtags           41208
picture_content    39237
likes              41208
timestamp          41208
country_code       38191
location           41208
lat                41208
lon                41208
dtype: int64

In [0]:
# to use 3-letter version of country codes
# we have to upload some normalized country codes data

country_codes_variants = pd.read_csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv")
country_codes_variants.set_index("alpha-2", inplace=True)
country_codes_variants.head(2)

In [0]:
hashtags_list = []
hashtags_list_of_lists = testing_vanlife_df["hashtags"].tolist()
for element in hashtags_list_of_lists:
  hashtags_list.extend(ast.literal_eval(element))
print(hashtags_list[:100])

['ryobitools', 'vanproject', 'vanlifecommunity', 'camperconversion', 'vanlife', 'vanlifediaries', 'vanlifedreams', 'vanlifeexplorers', 'vanlifer', 'vanlifesociety', 'vanlifemagazine', 'vanlifecaptain', 'vanlifestyle', 'vanlifeuk', 'vanlifecommunity', 'vanlifewithkids', 'camperconversion', 'camperconversions', 'campervan', 'campervanconversion', 'campervanlife', 'campervanbuild', 'fordcustom', 'campervaninterior', 'campervaninteriors', 'campervaningwithkids', 'france', 'frankrijk', 'visitlafrance', 'visitfrance', 'france', 'europa', 'europe', 'roadtrip', 'camperlife', 'vanlife', 'nature', 'hiking', 'wandelen', 'camping', 'kamperen', 'travel', 'reizen', 'trail', 'vwt5', 'vwcaliforniat5', 'vwcalifornia', 'exploringeurope', 'streetphotography', 'architecture', 'rijssel', 'lille#france', 'frankrijk', 'visitlafrance', 'visitfrance', 'france', 'europa', 'europe', 'roadtrip', 'camperlife', 'vanlife', 'nature', 'hiking', 'wandelen', 'camping', 'kamperen', 'travel', 'reizen', 'trail', 'vwt5', 'v

In [0]:
nltk.FreqDist(hashtags_list).most_common(30)

[('vanlife', 31475),
 ('travel', 8135),
 ('roadtrip', 6859),
 ('campervan', 6440),
 ('homeiswhereyouparkit', 5211),
 ('vanlifediaries', 4917),
 ('adventure', 4136),
 ('camping', 3979),
 ('nature', 3662),
 ('wanderlust', 3630),
 ('van', 3531),
 ('camper', 3289),
 ('vanlifers', 3039),
 ('explore', 2718),
 ('homeonwheels', 2636),
 ('vanlifeexplorers', 2542),
 ('', 2502),
 ('camperlife', 2449),
 ('travelphotography', 2245),
 ('vw', 2032),
 ('vanlifemovement', 1982),
 ('photography', 1917),
 ('vanlifestyle', 1857),
 ('vanconversion', 1699),
 ('projectvanlife', 1694),
 ('travelgram', 1687),
 ('ontheroad', 1621),
 ('vanlifeeurope', 1574),
 ('love', 1567),
 ('vwbus', 1437)]

In [0]:
testing_vanlife_short = testing_vanlife_df[:10000]

In [0]:
### very slow and does not enable to visualize and analyse all the data at once...

#plotly.offline.init_notebook_mode(connected=True)

configure_plotly_browser_state() # to do it inline

data = [go.Scattergeo(
    lat = testing_vanlife_short["lat"],
    lon = testing_vanlife_short["lon"],
    text = testing_vanlife_short["text"],
    showlegend = False,
    marker = dict(
        opacity = 0.7,
        size = 2,                       
    )
)]

layout = dict(
    geo = dict(
        showland = True,
        landcolor = "rgb(212, 212, 212)",
        #subunitcolor = "rgb(255, 255, 255)",
        #countrycolor = "rgb(255, 255, 255)",
        showframe = False,
        showlakes = True,
        lakecolor = "rgb(255, 255, 255)",
        #showsubunits = True,
        showcountries = True,
        resolution = 110,
        projection = dict(
            type = 'equirectangular'
          )
   
    ),
    showlegend = False,
    title = 'Looking at the instagram hashtag "#vanlife"'
)

fig = go.Figure(data=data, layout=layout )
iplot(fig, filename='hashtags.html') # to make it inline
#plot(fig, filename='hashtags.html')

In [0]:
help(plotly.offline.plot)

Help on function plot in module plotly.offline.offline:

plot(figure_or_data, show_link=False, link_text='Export to plot.ly', validate=True, output_type='file', include_plotlyjs=True, filename='temp-plot.html', auto_open=True, image=None, image_filename='plot_image', image_width=800, image_height=600, config=None, include_mathjax=False)
    Create a plotly graph locally as an HTML document or string.
    
    Example:
    ```
    from plotly.offline import plot
    import plotly.graph_objs as go
    
    plot([go.Scatter(x=[1, 2, 3], y=[3, 2, 6])], filename='my-graph.html')
    # We can also download an image of the plot by setting the image parameter
    # to the image format we want
    plot([go.Scatter(x=[1, 2, 3], y=[3, 2, 6])], filename='my-graph.html'
         image='jpeg')
    ```
    More examples below.
    
    figure_or_data -- a plotly.graph_objs.Figure or plotly.graph_objs.Data or
                      dict or list that describes a Plotly graph.
                      See h

# Tag Frequencies & Population & GDP

In [0]:
countries_tag_df = pd.DataFrame(testing_vanlife_df.groupby("country_code").size())
countries_tag_df.reset_index(inplace=True)
countries_tag_df.columns = ["country_code", "tag_frequency"]
countries_tag_df.head(2)

Unnamed: 0,country_code,tag_frequency
0,AE,1
1,AL,3


In [0]:
# to use 3-letter version of country codes
# we have to upload some normalized country codes data

country_codes_variants = pd.read_csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv")
country_codes_variants.set_index("alpha-2", inplace=True)
country_codes_variants.head(2)

Unnamed: 0_level_0,name,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
alpha-2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AF,Afghanistan,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
AX,Åland Islands,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,


In [0]:
world_population_df = pd.read_csv("https://raw.githubusercontent.com/datasets/population/master/data/population.csv")
world_population_df.head(2)
world_population_2016_df = world_population_df[world_population_df["Year"]==2016]
world_population_2016_df.set_index("Country Code", inplace=True)
world_population_2016_df.head(2)

Unnamed: 0_level_0,Country Name,Year,Value
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ARB,Arab World,2016,406452690.0
CSS,Caribbean small states,2016,7245472.0


In [0]:
world_population_2016_df.at["ARB", "Value"]

406452690.0

In [0]:
### add the 3-letters version and country names into our dataset

def three_letters_abbr_returner(abbr):
  try: return country_codes_variants.at[abbr, "alpha-3"]
  except: return abbr

def country_name_returner(abbr):
  try: return country_codes_variants.at[abbr, "name"]
  except: return ""
  
def population_value_returner(abbr):
  try: return world_population_2016_df.at[abbr, "Value"]
  except: return ""
  
def tag_frequency_normalized(row_data):
  try: return float(float(row_data["tag_frequency"]) / float(row_data["population"]))
  except: return 0
  
countries_tag_df["country_code_new"] = countries_tag_df.apply(lambda row: three_letters_abbr_returner(row["country_code"]), axis=1)
countries_tag_df["country"] = countries_tag_df.apply(lambda row: country_name_returner(row["country_code"]), axis=1)
countries_tag_df["population"] = countries_tag_df.apply(lambda row: population_value_returner(row["country_code_new"]), axis=1)
countries_tag_df["tag_frequency_normalized"] = countries_tag_df.apply(lambda row: tag_frequency_normalized(row), axis=1)

countries_tag_df.head(5)

Unnamed: 0,country_code,tag_frequency,country_code_new,country,population,tag_frequency_normalized
0,AE,1,ARE,United Arab Emirates,9269610.0,1.078794e-07
1,AL,3,ALB,Albania,2876100.0,1.043079e-06
2,AM,1,ARM,Armenia,2924820.0,3.419018e-07
3,AR,8,ARG,Argentina,43847400.0,1.824508e-07
4,AT,20,AUT,Austria,8747360.0,2.286405e-06


In [0]:
df.head(2)

Unnamed: 0,Hrapx,Hrapy,Lat,Lon,Globvalue
0,272.3333,670.25,48.4113,-112.8352,0.0875
1,1546.5,195.1667,18.0057,-65.804,0.0892


In [0]:
configure_plotly_browser_state()

# based on this: https://plot.ly/python/choropleth-maps/

data = [go.Choropleth(
    locations = countries_tag_df['country_code_new'],
    z = countries_tag_df["tag_frequency"],
    text = countries_tag_df['country'],
    colorscale = [
        [0, "rgb(5, 10, 172)"],
        [0.35, "rgb(40, 60, 190)"],
        [0.5, "rgb(70, 100, 245)"],
        [0.6, "rgb(90, 120, 245)"],
        [0.7, "rgb(106, 137, 247)"],
        [1, "rgb(220, 220, 220)"]
    ],
    autocolorscale = True,
    reversescale = False,
    marker = go.choropleth.Marker(
        line = go.choropleth.marker.Line(
            color = 'rgb(180,180,180)',
            width = 0.5
        )),
    colorbar = go.choropleth.ColorBar(
        tickprefix = 'N=',
        title = 'posts'),
)]

layout = go.Layout(
    title = go.layout.Title(
        text = '#vanlife on May 6'
    ),
    geo = go.layout.Geo(
        showland = True,
        landcolor = "rgb(212, 212, 212)",
        showframe = False,
        showcoastlines = True,
        projection = go.layout.geo.Projection(
            type = 'equirectangular'
        )
    ),
    annotations = [go.layout.Annotation(
        x = 0.55,
        y = 0.1,
        xref = 'paper',
        yref = 'paper',
        text = 'Data mined from Instagram',
        showarrow = False
    )]
)

fig = go.Figure(data = data, layout = layout)
iplot(fig, filename = 'd3-world-map')

In [0]:
configure_plotly_browser_state()

# based on this: https://plot.ly/python/choropleth-maps/

data = [go.Choropleth(
    locations = countries_tag_df['country_code_new'],
    z = countries_tag_df["tag_frequency_normalized"],
    text = countries_tag_df['country'],
    colorscale = [
        [0, "rgb(5, 10, 172)"],
        [0.35, "rgb(40, 60, 190)"],
        [0.5, "rgb(70, 100, 245)"],
        [0.6, "rgb(90, 120, 245)"],
        [0.7, "rgb(106, 137, 247)"],
        [1, "rgb(220, 220, 220)"]
    ],
    autocolorscale = True,
    reversescale = False,
    marker = go.choropleth.Marker(
        line = go.choropleth.marker.Line(
            color = 'rgb(180,180,180)',
            width = 0.5
        )),
    colorbar = go.choropleth.ColorBar(
        tickprefix = 'N=',
        title = 'posts'),
)]

layout = go.Layout(
    title = go.layout.Title(
        text = '#vanlife on May 6'
    ),
    geo = go.layout.Geo(
        showframe = False,
        showcoastlines = False,
        projection = go.layout.geo.Projection(
            type = 'equirectangular'
        )
    ),
    annotations = [go.layout.Annotation(
        x = 0.55,
        y = 0.1,
        xref = 'paper',
        yref = 'paper',
        text = 'Data mined from Instagram',
        showarrow = False
    )]
)

fig = go.Figure(data = data, layout = layout)
iplot(fig, filename = 'd3-world-map')

# Population & GDP

In [0]:
world_population_df = pd.read_csv("https://raw.githubusercontent.com/datasets/population/master/data/population.csv")
world_population_df.head(2)
world_population_2016_df = world_population_df[world_population_df["Year"]==2016]
world_population_2016_df.set_index("Country Code", inplace=True)
world_population_2016_df.head(2)

Unnamed: 0_level_0,Country Name,Year,Value
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ARB,Arab World,2016,406452690.0
CSS,Caribbean small states,2016,7245472.0


In [0]:
world_gdp_df = pd.read_csv("https://raw.githubusercontent.com/datasets/gdp/master/data/gdp.csv")
world_gdp_df.head(2)

Unnamed: 0,Country Name,Country Code,Year,Value
0,Arab World,ARB,1968,25760680000.0
1,Arab World,ARB,1969,28434200000.0


In [0]:
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')
df.head(2)

Unnamed: 0,COUNTRY,GDP (BILLIONS),CODE
0,Afghanistan,21.71,AFG
1,Albania,13.4,ALB


In [0]:
configure_plotly_browser_state()

data = [go.Choropleth(
    locations = df['CODE'],
    z = df['GDP (BILLIONS)'],
    text = df['COUNTRY'],
    colorscale = [
        [0, "rgb(5, 10, 172)"],
        [0.35, "rgb(40, 60, 190)"],
        [0.5, "rgb(70, 100, 245)"],
        [0.6, "rgb(90, 120, 245)"],
        [0.7, "rgb(106, 137, 247)"],
        [1, "rgb(220, 220, 220)"]
    ],
    autocolorscale = False,
    reversescale = True,
    marker = go.choropleth.Marker(
        line = go.choropleth.marker.Line(
            color = 'rgb(180,180,180)',
            width = 0.5
        )),
    colorbar = go.choropleth.ColorBar(
        tickprefix = '$',
        title = 'GDP<br>Billions US$'),
)]

layout = go.Layout(
    title = go.layout.Title(
        text = '2014 Global GDP'
    ),
    geo = go.layout.Geo(
        showframe = False,
        showcoastlines = False,
        projection = go.layout.geo.Projection(
            type = 'equirectangular'
        )
    ),
    annotations = [go.layout.Annotation(
        x = 0.55,
        y = 0.1,
        xref = 'paper',
        yref = 'paper',
        text = 'Source: <a href="https://www.cia.gov/library/publications/the-world-factbook/fields/2195.html">\
            CIA World Factbook</a>',
        showarrow = False
    )]
)

fig = go.Figure(data = data, layout = layout)
iplot(fig, filename = 'd3-world-map')