In [1]:
import csv
import json
import re
import time
import urllib.parse
from multiprocessing import Pool, Lock

import instascrape
from bs4 import BeautifulSoup
from dataclasses_serialization.json import JSONSerializer
from selenium import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

from datastructures import Location, Site

# Start a Chrome session (driver binary resolved by webdriver_manager) and load
# Instagram before adding cookies, so the cookies are set on that domain.
driver = webdriver.Chrome(ChromeDriverManager().install())
base_url = "https://www.instagram.com"
driver.get(base_url)

# Copy the cookies stored in cookies.json into the browser session.
with open("cookies.json", "r") as cookie_file:
    cookies = json.load(cookie_file)

for cookie in cookies:
    # The "sameSite" attribute is stripped before the cookie is handed to
    # Selenium (presumably the exported value is rejected by add_cookie --
    # TODO confirm). Cookies without that attribute are accepted as they are.
    cookie.pop("sameSite", None)
    driver.add_cookie(cookie)



Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Get LATEST driver version for 90.0.4430
Trying to download new driver from https://chromedriver.storage.googleapis.com/90.0.4430.24/chromedriver_linux64.zip
Driver has been saved in cache [/home/matas/.wdm/drivers/chromedriver/linux64/90.0.4430.24]


In [6]:
# Load the Instagram home page again (presumably so the cookies added above
# take effect -- TODO confirm).
driver.get(base_url)

# Get a list of all US Locations that Instagram supports

In [5]:
# Walk the paginated US location index and record one Location per link.
driver.get(urllib.parse.urljoin(base_url, "/explore/locations/US/united-states/"))
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(driver.page_source, "html.parser")

data = {}

while True:
    # The pagination link is looked up first: it tells us whether another page
    # follows once the links on the current page have been collected.
    next_page = soup.find("a", href=re.compile(r"\?page="))

    links = soup.find_all("a", href=re.compile(r"/explore/locations/"))
    for link in links:
        if link.text == "See More":
            continue
        # Keyed by the link text; the empty list is filled with Sites later.
        data[link.text] = Location(link.text, link["href"], [])

    if next_page is None:
        break

    print("Getting the next page:", next_page["href"])
    driver.get(urllib.parse.urljoin(base_url, next_page["href"]))
    soup = BeautifulSoup(driver.page_source, "html.parser")
    time.sleep(2)

# Remove the link to more locations; tolerate pages where it is not present.
data.pop("Locations", None)

Getting the next page: /explore/locations/US/united-states/?page=2
Getting the next page: /explore/locations/US/united-states/?page=3
Getting the next page: /explore/locations/US/united-states/?page=4
Getting the next page: /explore/locations/US/united-states/?page=5
Getting the next page: /explore/locations/US/united-states/?page=6
Getting the next page: /explore/locations/US/united-states/?page=7
Getting the next page: /explore/locations/US/united-states/?page=8
Getting the next page: /explore/locations/US/united-states/?page=9
Getting the next page: /explore/locations/US/united-states/?page=10
Getting the next page: /explore/locations/US/united-states/?page=11


# Get the Sites from each Location

In [4]:

def scrape_location(key: str) -> None:
    """Collect the Sites listed under one Location and checkpoint the result.

    Looks up ``data[key]``, opens its page in the shared ``driver`` and follows
    the ``?page=`` links until none is left. Every site link found is scraped
    with ``instascrape.Location``; a link whose scrape raises is reported with
    ``print`` and skipped. Each successfully scraped site is appended to
    ``location.sites`` unless an equal ``Site`` is already in the list.

    When all pages have been processed, the whole ``data`` dict is written to
    ``checkpoint_<location name>.json``.

    Args:
        key: Key of the Location in the module-level ``data`` dict.

    Raises:
        KeyError: If ``key`` is not present in ``data``.
    """
    location = data[key]
    print(location.name)

    driver.get(urllib.parse.urljoin(base_url, location.link))
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Compiled once instead of on every page; raw strings avoid the invalid
    # "\?" escape sequence.
    next_page_pattern = re.compile(r"\?page=")
    site_link_pattern = re.compile(r"/explore/locations/[0-9]*/")

    while True:
        # Looked up before the site loop; decides below whether to continue.
        next_page = soup.find("a", href=next_page_pattern)

        links = soup.find_all("a", href=site_link_pattern)
        for i, link in enumerate(links):
            # Progress marker on every fourth link.
            if i % 4 == 0:
                print("Site:", i)

            if link.text == "See More":
                continue

            temp = instascrape.Location(urllib.parse.urljoin(base_url, link["href"]))
            try:
                temp.scrape()
            except Exception as e:
                # Best effort: report the failure and move on to the next link.
                print("Failed to scrape", link["href"], e)
                continue

            site = Site(link.text, link["href"], temp.amount_of_posts, temp.longitude, temp.latitude)
            if site not in location.sites:
                location.sites.append(site)
            # Short pause between site requests.
            time.sleep(0.1)

        if next_page is None:
            break

        print("Getting the next page:", next_page["href"])
        driver.get(urllib.parse.urljoin(base_url, next_page["href"]))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        time.sleep(2)

    serialize_instagram_data(data, f"checkpoint_{location.name}.json")

In [5]:
# Only Chicago is scraped here. The wider list that was tried before is
# New York, Chicago, Los Angeles, Houston, Boston, Denver and Orlando.
for city in ("Chicago",):
    scrape_location(city)

Chicago
Site: 0
Site: 4
Site: 8
Site: 12
Site: 16
Site: 20
Site: 24
Site: 28
Site: 32
Site: 36
Site: 40
Site: 44
Site: 48
Site: 52
Failed to scrape /explore/locations/280362/lou-malnatis/ None is not a valid Instagram page. Please provide a valid argument.
Site: 56
Site: 60
Site: 64
Failed to scrape /explore/locations/31665/chicago-theatre/ None is not a valid Instagram page. Please provide a valid argument.
Site: 68
Site: 72
Failed to scrape /explore/locations/650788757/dylans-candy-bar-chicago/ None is not a valid Instagram page. Please provide a valid argument.
Site: 76
Site: 80
Site: 84
Site: 88
Site: 92
Failed to scrape /explore/locations/433887/rockit-bar-grill/ None is not a valid Instagram page. Please provide a valid argument.
Getting the next page: /explore/locations/c2438177/chicago-united-states/?page=2
Site: 0
Site: 4
Failed to scrape /explore/locations/234338124/kimpton-hotel-palomar-chicago/ None is not a valid Instagram page. Please provide a valid argument.
Site: 8
Sit

In [7]:
# Write the current contents of `data` to chicago.json.
serialize_instagram_data(data=data, path="chicago.json")

## Serialize / Deserialize the data

In [7]:
def serialize_instagram_data(data: dict, path: str) -> None:
    """Write ``data`` to ``path`` as JSON.

    ``data`` is first converted with ``JSONSerializer.serialize`` and the
    result is dumped with ``json.dump``.

    Args:
        data: The dict to store.
        path: Destination file; an existing file is overwritten.
    """
    # Serialize before opening the file: opening with "w" truncates it, so a
    # failing serialization would otherwise wipe out an existing checkpoint.
    payload = JSONSerializer.serialize(data)
    with open(path, "w") as file:
        json.dump(payload, file)

def deserialize_instagram_data(path: str) -> dict:
    """Load a JSON file written by ``serialize_instagram_data``.

    Args:
        path: JSON file to read.

    Returns:
        A dict with the same keys as the JSON object in the file, where each
        value has been converted with
        ``JSONSerializer.deserialize(Location, value)``.
    """
    # The context manager closes the file handle once the JSON is loaded.
    with open(path, "r") as file:
        raw = json.load(file)
    return {key: JSONSerializer.deserialize(Location, val) for key, val in raw.items()}

In [8]:
# Replace `data` with the contents of chicago.json.
data = deserialize_instagram_data(path="chicago.json")

# Count total posts in Chicago

In [9]:
# Add up the post counts of every site recorded for Chicago.
total_posts = sum(site.total_posts for site in data["Chicago"].sites)
print("There are", total_posts, "location-tagged Instagram posts in Chicago.")

There are 31525527 location-tagged Instagram posts in Chicago.


# Convert to CSV for ArcGIS Feature Class

In [15]:
# Export the Chicago sites to data.csv, one row per site.
# csv.writer quotes and escapes fields where needed, so names containing
# double quotes or commas no longer break the row layout.
# newline="" is what the csv module expects for files it writes to;
# lineterminator="\n" keeps the line endings of the original export.
with open("data.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, lineterminator="\n")
    writer.writerow(["name", "link", "total_posts", "lon", "lat"])
    for site in data["Chicago"].sites:
        writer.writerow([site.name, site.link, site.total_posts, site.lon, site.lat])

# Quick Visualization

In [56]:
from bokeh.plotting import figure, output_notebook, show
from bokeh.tile_providers import CARTODBPOSITRON, get_provider, OSM, WIKIMEDIA
from bokeh.models import HoverTool, ColumnDataSource
from pyproj import Proj, transform, Transformer


# Build the columns for bokeh: one entry per site that has coordinates.
# Longitude/latitude (EPSG:4326) are projected to EPSG:3857 for the tile map.
transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)

x_coords, y_coords, names, posts = [], [], [], []

for site in data["Chicago"].sites:
    # A site sitting at exactly (0, 0) has no coordinates and is left out.
    if site.lon != 0 or site.lat != 0:
        x, y = transformer.transform(site.lon, site.lat)
        x_coords.append(x)
        y_coords.append(y)
        names.append(site.name)
        posts.append(site.total_posts)

source = ColumnDataSource(
    data={"x": x_coords, "y": y_coords, "name": names, "total_posts": posts}
)

# Tooltip rows; the "@..." fields refer to the columns of `source`.
hover = HoverTool(
    tooltips=[
        ("(x,y)", "(@x, @y)"),
        ("Name", "@name"),
        ("Total Posts", "@total_posts"),
    ]
)

# Render inside the notebook.
output_notebook()

# Basemap tiles.
tile_provider = get_provider(WIKIMEDIA)

# The axis ranges are given in projected (EPSG:3857) coordinates.
p = figure(
    x_range=(-9780000, -9745000),
    y_range=(5130000, 5160000),
    x_axis_type="mercator",
    y_axis_type="mercator",
    plot_height=1000,
    plot_width=1000,
)
p.add_tools(hover)
p.add_tile(tile_provider)

# One translucent marker per site.
p.circle("x", "y", source=source, size=20, color="navy", alpha=0.5)

show(p)