This notebook is dedicated to scrapping Instagram data on the basis of a hashtag. It is freely based upon a tutorial [here](https://medium.com/@kseniatikhomirova/scrap-instagram-locations-with-python-d48ba6e56ebc). 

I have developed it with the hashtag "vanlife", since me and my wife have been interested in geography of this phenomenon.

In [2]:
import pandas as pd
import sys

### we do a lot of requests during the scrapping. Some of them with requests package, some of them with urllib
import requests
from urllib.request import urlopen 
from urllib.parse import quote  
from bs4 import BeautifulSoup

# to avoid errors, we sometime use time.sleep(N) before retrying a request
import time
# the input data have typically a json structure
import json
import ast

import datetime as dt
import googlemaps


from concurrent.futures import ThreadPoolExecutor

import sddk
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe

In [3]:
conf = sddk.configure()

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [130]:
# google maps - read the key from sciencedata.
key = sddk.read_file("Google_API_key.txt", "str", conf)
gmaps = googlemaps.Client(key=key)

In [None]:
# further, to access gsheet, you need Google Service Account key json file
# I have mine located in my personal space on sciencedata.dk, so I read it from there:

# (1) read the file and parse its content
file_data = conf[0].get(conf[1] + "ServiceAccountsKey.json").json()
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)
# (5) establish connection with spreadsheets specified by their url
#PIA_data = gc.open_by_url()

# simple test

In [4]:
# count hashtag instances
hashtag = "vanlife"
url = "https://www.instagram.com/explore/tags/{0}/?__a=1".format(hashtag)
r = requests.get(url)
data = json.loads(r.text)
hashtag_count = data['graphql']['hashtag']['edge_hashtag_to_media']['count']
hastag_data_pages = hashtag_count / 75 # there is 75 instances per page
hashtag_count

7978506

In [5]:
data['graphql']['hashtag']['edge_hashtag_to_media']["edges"][0]

{'node': {'comments_disabled': False,
  '__typename': 'GraphImage',
  'id': '2383571837279292993',
  'edge_media_to_caption': {'edges': [{'node': {'text': 'A veces hay que tomar decisiones aunque duelan, dada la situación me veo obligada a deshacerme de uno de mis sueños, pero el desapego nos hace libres!!! Ya se va mañana, se vendió!!!! #viveelmomento -#desapego #enotromundodiferente #yosoyasiyasiseguirenuncacambiare #vanlife'}}]},
  'shortcode': 'CEUJfURqE5B',
  'edge_media_to_comment': {'count': 0},
  'taken_at_timestamp': 1598363927,
  'dimensions': {'height': 1080, 'width': 1080},
  'display_url': 'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/118227097_330033871742552_5789658312245101297_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=108&_nc_ohc=BbwoVG_mlu0AX_k9Lti&oh=b15e574a2a73ce5d54e533020e061274&oe=5F6FE932',
  'edge_liked_by': {'count': 0},
  'edge_media_preview_like': {'count': 0},
  'owner': {'id': '145839008'},
  'thumbnail_src': 'https://scontent-frt

In [None]:
### maximal pages value
int(hastag_data_pages - 1)

67446

# Collecting end_cursors

In [6]:
def request_for_next_page(url): 
    r = requests.get(url)
    try:
        data = json.loads(r.text)
        end_cursor = data['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']
    except:
        problem = "problem"
        print(problem)
        n = 0
        while (n <= 3 and problem == "problem"):  
            time.sleep(1)
            try: 
                r = requests.get(url)
                data = json.loads(r.text)
                end_cursor = data['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']
                problem = "problem solved"
            except:
                n = n+1
    try:
        return end_cursor
    except:
        return "not-found"

In [9]:
# read the latest end_cursors file

try:
    # if we already have some endcursors data, we will start with them
    all_end_cursors = sorted([fn for fn in sddk.list_filenames("instagram_webscraping", "", conf) if "end_cursors" in fn])
    latest_end_cursors = all_end_cursors[-1]
    end_cursors = sddk.read_file("instagram_webscraping/" + latest_end_cursors, "list", conf)
    print(len(end_cursors))
except:
    # otherwise we will start with an empty list
    end_cursors = []
    print("starting with empty list")

637


In [11]:
%%time
n_of_pages = 500
hashtag = "vanlife"
raw_url =  "https://www.instagram.com/explore/tags/" + hashtag + "/?__a=1"

end_cursor = ""

for n in range(n_of_pages):
    if len(end_cursors) > 0:
        url = raw_url + "&max_id=" + end_cursors[-1] # use the last end cursor
    else:
        url = raw_url
    actual_end_cursor = request_for_next_page(url)
    if actual_end_cursor !="not-found":
        end_cursors.append(actual_end_cursor)# value for the next page
    else:
        break

CPU times: user 15.6 s, sys: 802 ms, total: 16.4 s
Wall time: 18min 32s


In [13]:
# always change name to by increasing the number
#sddk.write_file("instagram_webscraping/end_cursors_3.json", end_cursors, conf)

Your <class 'list'> object has been succefully written as "https://sciencedata.dk/files/instagram_webscraping/end_cursors_2.json"


# Define crucial functions

In [129]:
# simple test of gmaps api
gplace = gmaps.geocode("chotikov")[0]
coordinates = gplace["geometry"]["location"]
g_loc_type = gplace["types"]
coordinates

{'lat': 49.792952, 'lng': 13.3176738}

In [124]:
def mine_the_post(actual_url):
    post_data = json.loads(urlopen(actual_url).read().decode("utf-8"))
    post = {}
    try:
        post["location_slug"] = post_data['graphql']['shortcode_media']['location']['slug']
        address_json = post_data['graphql']['shortcode_media']['location']['address_json'].replace("false", "False").replace("true", "True") 
        address_json = ast.literal_eval(address_json.replace("false", "False").replace("true", "True"))
        for loc_type in ["city", "region", "country"]:
            if address_json['exact_' + loc_type + "_match"] == True:
                post["i_loc_type"] = loc_type
                break

        gplace = gmaps.geocode(post["location_slug"])[0]
        post["coordinates"] = gplace["geometry"]["location"]
        post["g_loc_type"] = gplace["types"]
    except:
        pass
        #post["location_slug" = ""
        #coordinates = None
        #i_loc_type = None
        #g_loc_type = None
    try:
        timestamp = post_data['graphql']['shortcode_media']['taken_at_timestamp']
        post["timestamp"] = dt.datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
    except: pass 
    return post
    #return [timestamp, location_slug, address_json, i_loc_type, , coordinates, g_loc_type] #, coordinates] #, lat, lon]

def deEmojify(inputString): # from here: https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
  return inputString.encode('ascii', 'ignore').decode('ascii')

def get_post_info(item):
    post = {}
    item_node_shortcode = item["node"]['shortcode']
    post_url = "https://www.instagram.com/p/" + item_node_shortcode + "/?__a=1"
    post["end_cursor"] = end_cursor
    post["url"] = post_url.partition("?__a=1")[0]
    try: 
        text = item['node']['edge_media_to_caption']['edges'][0]['node']['text'].replace("\n", " ")
        post["text"] = deEmojify(text)
    except: text = ""
    hashtags = []
    for word in text.split():
        if word.startswith("#"):
            hashtags.append(word.partition("#")[2])
    post["hashtags"] = hashtags
    try: # produce a list of potential object on the picture
        caption = item['node']['accessibility_caption'].partition("contain: ")[2].split(", ")
        post["caption"] = caption[:-1] + caption[-1].split(" and ")
    except: pass
    post["likes"] = item['node']['edge_liked_by']["count"]
    #basic_data = [end_cursor, post_url.partition("?__a=1")[0], text, hashtags, caption, likes]
    try:
        post.update(mine_the_post(post_url))
    except:
        time.sleep(2)
        try: 
            post.update(mine_the_post(post_url))
        except:
            pass
    return post

def get_edges(url_address):
  try: 
    r = requests.get(url_address)
    data = json.loads(r.text)
    edges = data['graphql']['hashtag']['edge_hashtag_to_media']['edges']
    return edges
  except: 
    try:
      time.sleep(3)
      r = requests.get(url_address)
      data = json.loads(r.text)
      edges = data['graphql']['hashtag']['edge_hashtag_to_media']['edges'] # list with posts
      return edges
    except:
      return "no edges"


# Start with preprocessed end_cursor data

In [127]:
actual_data = []
n = 1
tag = 'vanlife' # your tag
for end_cursor in end_cursors[:3]:
    url = "https://www.instagram.com/explore/tags/{0}/?__a=1&max_id={1}".format(tag, end_cursor)
    edges = get_edges(url)
    if edges != "no edges":
      with ThreadPoolExecutor(max_workers=75) as pool:
        current_parsed_edges = list(pool.map(get_post_info,edges))
      actual_data.extend(current_parsed_edges)
      print(end_cursor)
      if len(actual_data) >= 5000:
        actual_data_df = pd.DataFrame(actual_data)
        sddk.write_file("instagram_webscraping/posts_raw_" + str(n) + ".json", actual_data_df, conf)
        n = n+1
        actual_data = [] # clear actual data
        
### export our last data as well 
actual_data_df = pd.DataFrame(actual_data)
sddk.write_file("instagram_webscraping/posts_raw_" + str(n) + ".json", actual_data_df, conf)

QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNlJJU2p0cG8zUnNqa2pyemsyMU01M1h0dEdhNEUtN09hdG5JcFpBRTc3Y0FGVGEwMUR4b2Z4ag==
QVFCeWdGSzdzSmRMU25KenJJWXM1X3dUdmM3WXJOQ3hybG1SdE8zVVA4VnlQRXVFYThBQUZQVFVuUUF6dm5vWE0tcVY4UmNrdGRzeTNVeGFEUFRTT1VXcw==
QVFEbmRaeUdsNTZvR1BJX1BWYWdTM3JGRy1PMGNzSnk3dElmWmR6cy1HVG1TRGc3VFZXcU1VOE44ZGxUYk5SdFBuSzlUbWRBQTBwTGFrQ2E1U0lNQy1jVA==


In [128]:
actual_data_df

Unnamed: 0,end_cursor,url,text,hashtags,caption,likes,timestamp,location_slug,coordinates,g_loc_type,i_loc_type
0,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwOAgILJK/,Lugares de descanso y pernocta que marcan la d...,"[diasentremontañas, anayruben, subetealpaisaje...","[mountain, sky, outdoor, nature.]",39,2020-08-25 10:17:58,,,,
1,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwMNxJWG6/,Pippa's first holibob...and our first camping ...,"[lakedistrict, t5, camping, campervan, hoilday...","[mountain, sky, outdoor, nature.]",30,2020-08-25 10:17:43,lake-district,"{'lat': 35.2240721, 'lng': -89.73193669999999}","[establishment, point_of_interest, shopping_mall]",
2,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwJWGpuv9/,#vanlife #camper #vwt #camping #homeiswhereyou...,"[vanlife, camper, vwt, camping, homeiswhereyou...",[indoor.],64,2020-08-25 10:17:19,los-angeles-california,"{'lat': 34.0522342, 'lng': -118.2436849}","[locality, political]",city
3,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwJONA_9T/,Back to work today and missing the sea views b...,"[vanlife, vanlifediaries, vanlifeuk, t4, vwt4,...","[one or more people, ocean, sky, cloud, outdoo...",52,2020-08-25 10:17:18,wales,"{'lat': 52.1306607, 'lng': -3.7837117}","[administrative_area_level_1, political]",region
4,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwIInpCsg/,Being back in Norway meant shitty weather and ...,"[norway, hike, waterfall, lake, travel, view, ...","[mountain, sky, outdoor, nature.]",19,2020-08-25 10:17:09,rago-national-park,"{'lat': 67.4385104, 'lng': 16.0056756}","[establishment, park, point_of_interest, touri...",
...,...,...,...,...,...,...,...,...,...,...,...
207,QVFEbmRaeUdsNTZvR1BJX1BWYWdTM3JGRy1PMGNzSnk3dE...,https://www.instagram.com/p/CETr1iHDWOu/,Me par a mirar entre los rboles y la vi. Tan t...,[],"[tree, outdoor, nature.]",22,2020-08-25 09:39:40,,,,
208,QVFEbmRaeUdsNTZvR1BJX1BWYWdTM3JGRy1PMGNzSnk3dE...,https://www.instagram.com/p/CETkxEoJ5Ts/,What do I do with those spots? I am going to c...,"[vanlifegermany, vanlife, vanlifepoland, vanli...",[],8,2020-08-25 08:37:53,,,,
209,QVFEbmRaeUdsNTZvR1BJX1BWYWdTM3JGRy1PMGNzSnk3dE...,https://www.instagram.com/p/CETiOHHB9a7/,"What is she doing under there? It cant be fun,...",[],"[shoes, outdoor.]",16,2020-08-25 08:15:38,,,,
210,QVFEbmRaeUdsNTZvR1BJX1BWYWdTM3JGRy1PMGNzSnk3dE...,https://www.instagram.com/p/CC_qDrMHte8/,Stick and Poke Skull - - - - - - #tattoo #tatt...,"[tattoo, tattoos, skull, skulltattoo, stickand...",[],135,2020-07-23 18:27:55,gangelt,"{'lat': 50.9927163, 'lng': 5.997431}","[locality, political]",city


# To begin with parsed data

In [83]:
data_parsed_df = sddk.read_file("instagram_webscraping/posts_raw_1.json", "df", conf)
data_parsed_df.head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwOAgILJK/,Lugares de descanso y pernocta que marcan la d...,"[diasentremontaas, anayruben, subetealpaisaje,...","[mountain, sky, outdoor, nature.]",30,2020-08-25 10:17:58,,,,"[, ]"
1,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwMNxJWG6/,Pippa's first holibob...and our first camping ...,"[lakedistrict, t5, camping, campervan, hoilday...","[mountain, sky, outdoor, nature.]",23,2020-08-25 10:17:43,GB,"{'id': '113392233373458', 'has_public_page': T...",,"[, ]"
2,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwJWGpuv9/,#vanlife #camper #vwt #camping #homeiswhereyou...,"[vanlife, camper, vwt, camping, homeiswhereyou...",[indoor.],55,2020-08-25 10:17:19,US,"{'id': '212999109', 'has_public_page': True, '...",,"[, ]"
3,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwJONA_9T/,Back to work today and missing the sea views b...,"[vanlife, vanlifediaries, vanlifeuk, t4, vwt4,...","[one or more people, ocean, sky, cloud, outdoo...",36,2020-08-25 10:17:18,GB,"{'id': '258199373', 'has_public_page': True, '...",,"[, ]"
4,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwIInpCsg/,Being back in Norway meant shitty weather and ...,"[norway, hike, waterfall, lake, travel, view, ...","[mountain, sky, outdoor, nature.]",14,2020-08-25 10:17:09,,"{'id': '362528469', 'has_public_page': True, '...",,"[, ]"
5,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwHu_jlIo/,Visite de Rocamadour au top !! #rocamadour #d...,"[rocamadour, decouverte, campingcar, campingca...","[cloud, sky, outdoor.]",13,2020-08-25 10:17:06,FR,"{'id': '250378211', 'has_public_page': True, '...",,"[, ]"
6,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwGeMjO7O/,Golden hour is our happy hour This great cam...,"[InsiderTip, jaycoaustralia, camping, roadtrip...","[sky, outdoor, nature.]",18,2020-08-25 10:16:56,AU,"{'id': '236913728', 'has_public_page': True, '...",,"[, ]"
7,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwGUxJKif/,Deutschland-Tour Tag 5-7: Am Abend des fnften...,"[mecklenburgvorpommern, mecklenburgischeseenpl...","[food, indoor.]",60,2020-08-25 10:16:55,,,,"[, ]"
8,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwEV4pa-a/,A small sign of life from us - were fine . #c...,"[campershower, outdoorshower, campervan, campe...","[tree, outdoor, nature.]",111,2020-08-25 10:16:38,,,,"[, ]"
9,QVFCT1dLZGtyQksyekRPUlJ3LVlaTFA0WFdUQ010OE9yNl...,https://www.instagram.com/p/CETwCCdoOLr/,"()Wir sind nicht die Norm, wir sind die die an...","[frmehrrealittaufinstagram, formorerealityonin...","[sky, cloud, ocean, outdoor, water, nature.]",49,2020-08-25 10:16:19,NL,"{'id': '248392736', 'has_public_page': True, '...",,"[, ]"
