# Topic of Investigation

Social media use in the United States

## Data Source

Instagram's Explore page, which shows location-tagged posts, scraped with a custom Selenium script.

## Methods

- Scrape Instagram's location-tagged posts with Selenium and calculate the total number of posts in each city that Instagram's Explore feature supports.

- Visualize the data on a basemap of Chicago using Bokeh.

- Compare the ratio of Instagram posts to the population of that city

In [2]:
from datastructures import Location, Site  # my own classes
from dataclasses_serialization.json import JSONSerializer  # allows serialization / deserialization of the classes

import instascrape
from bs4 import BeautifulSoup
from selenium import webdriver
import re
from webdriver_manager.chrome import ChromeDriverManager
import urllib.parse
import time
import json

# Start Chrome through the driver binary that webdriver_manager installs.
driver = webdriver.Chrome(ChromeDriverManager().install())
base_url = "https://www.instagram.com"
# Navigate to Instagram first: Selenium only accepts cookies for the domain
# of the page that is currently loaded.
driver.get(base_url)

# "Login" to Instagram by replaying cookies stored in cookies.json.
# The context manager closes the file handle instead of leaking it.
with open("cookies.json", "r") as cookie_file:
    cookies = json.load(cookie_file)

for cookie in cookies:
    # Strip the "sameSite" attribute before handing the cookie to Selenium
    # (presumably the exported values are rejected by the driver — confirm).
    # A default is supplied so cookies without that attribute no longer
    # raise KeyError.
    cookie.pop("sameSite", None)
    driver.add_cookie(cookie)



Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Get LATEST driver version for 90.0.4430
Trying to download new driver from https://chromedriver.storage.googleapis.com/90.0.4430.24/chromedriver_linux64.zip
Driver has been saved in cache [/home/matas/.wdm/drivers/chromedriver/linux64/90.0.4430.24]


In [12]:
# Load the Instagram home page again.
# NOTE(review): presumably re-run so the page reflects the cookies added above — confirm.
driver.get("https://www.instagram.com")

# 1. Get a list of all Locations that Instagram supports

In [5]:
# Crawl the paginated "United States" index and record every Location link.
driver.get(urllib.parse.urljoin(base_url, "/explore/locations/US/united-states/"))

data = {}  # maps a link's visible text -> Location(name, href, sites=[])

# Raw strings: "\?" in a normal string literal is an invalid escape sequence.
next_page_pattern = re.compile(r"\?page=")
location_pattern = re.compile(r"/explore/locations/")

while True:
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(driver.page_source, "html.parser")

    for link in soup.find_all("a", href=location_pattern):
        if link.text == "See More":  # pagination link, not a Location
            continue
        data[link.text] = Location(link.text, link["href"], [])

    next_page = soup.find("a", href=next_page_pattern)  # the "See More" link
    if next_page is None:
        break  # last page reached

    print("Getting the next page:", next_page["href"])
    driver.get(urllib.parse.urljoin(base_url, next_page["href"]))
    # Wait *before* the page source is read on the next iteration, so the
    # page has time to render; the original slept only after parsing.
    time.sleep(2)

# Remove the link to more locations, which is not itself a Location.
# pop() with a default does not fail if that link was never collected.
data.pop("Locations", None)

Getting the next page: /explore/locations/US/united-states/?page=2
Getting the next page: /explore/locations/US/united-states/?page=3
Getting the next page: /explore/locations/US/united-states/?page=4
Getting the next page: /explore/locations/US/united-states/?page=5
Getting the next page: /explore/locations/US/united-states/?page=6
Getting the next page: /explore/locations/US/united-states/?page=7
Getting the next page: /explore/locations/US/united-states/?page=8
Getting the next page: /explore/locations/US/united-states/?page=9
Getting the next page: /explore/locations/US/united-states/?page=10
Getting the next page: /explore/locations/US/united-states/?page=11


# Get the Sites from each Location

In [15]:

def scrape_location(key: str) -> None:
    """Scrape every Site listed under one Location and checkpoint the result.

    Walks the paginated listing for ``data[key]`` with the shared Selenium
    ``driver``, scrapes each linked site with ``instascrape`` and appends a
    ``Site`` to ``location.sites`` when it is not already present. When the
    last page has been processed, the whole ``data`` dict is written to
    ``checkpoint_<location name>.json``.

    Args:
        key: Key of the Location in the module-level ``data`` dict.

    Raises:
        KeyError: If ``key`` is not in ``data``.
    """
    location = data[key]
    print(location.name)

    # Raw strings: "\?" in a normal string literal is an invalid escape.
    site_pattern = re.compile(r"/explore/locations/[0-9]*/")
    next_page_pattern = re.compile(r"\?page=")

    driver.get(urllib.parse.urljoin(base_url, location.link))

    # instascrape issues its own HTTP requests, which do not share the
    # Selenium browser's login. Forward the browser's session cookie so
    # Instagram does not redirect those requests to the login page.
    headers = {"user-agent": driver.execute_script("return navigator.userAgent;")}
    session_cookie = driver.get_cookie("sessionid")
    if session_cookie is not None:
        headers["cookie"] = f"sessionid={session_cookie['value']};"

    while True:
        # Explicit parser keeps results independent of installed parsers.
        soup = BeautifulSoup(driver.page_source, "html.parser")

        for i, link in enumerate(soup.find_all("a", href=site_pattern)):
            if i % 4 == 0:
                print("Site:", i)  # coarse progress indicator

            if link.text == "See More":  # pagination link, not a site
                continue

            temp = instascrape.Location(urllib.parse.urljoin(base_url, link["href"]))
            try:
                temp.scrape(headers=headers)
            except Exception as e:
                # Best effort: report the failure and move on to the next site.
                print("Failed to scrape", link["href"], e)
                continue

            site = Site(link.text, link["href"], temp.amount_of_posts, temp.longitude, temp.latitude)
            if site not in location.sites:
                location.sites.append(site)
            time.sleep(0.1)  # small pause between site requests

        next_page = soup.find("a", href=next_page_pattern)
        if next_page is None:
            break  # last page reached

        print("Getting the next page:", next_page["href"])
        driver.get(urllib.parse.urljoin(base_url, next_page["href"]))
        # Wait before the source is read on the next iteration so the page
        # has time to render.
        time.sleep(2)

    serialize_instagram_data(data, f"checkpoint_{location.name}.json")

In [16]:
# Full list of cities for a complete run (kept commented out):
# for city in ["New York", "Chicago", "Los Angeles", "Houston", "Boston", "Denver", "Orlando"]:
# Currently only Houston is scraped; each name must be a key of `data`.
for city in ["Houston"]:
    scrape_location(city)

Houston
Site: 0
Failed to scrape /explore/locations/212962809/houston-texas/ Instagram is redirecting you to the login page instead of the page you are trying to scrape. This could be occuring because you made too many requests too quickly or are not logged into Instagram on your machine. Try passing a valid session ID to the scrape method as a cookie to bypass the login requirement
Failed to scrape /explore/locations/1098248/downtown-houston/ Instagram is redirecting you to the login page instead of the page you are trying to scrape. This could be occuring because you made too many requests too quickly or are not logged into Instagram on your machine. Try passing a valid session ID to the scrape method as a cookie to bypass the login requirement
Failed to scrape /explore/locations/7153/revention-music-center/ Instagram is redirecting you to the login page instead of the page you are trying to scrape. This could be occuring because you made too many requests too quickly or are not logg

KeyboardInterrupt: 

In [8]:
# Write the whole `data` dict to palo_alto.json.
# NOTE(review): serialize_instagram_data is defined in a later cell, which must have been run first.
serialize_instagram_data(data, "palo_alto.json")

## Serialize / Deserialize the data

In [10]:
def serialize_instagram_data(data: dict, path: str) -> None:
    """Write ``data`` to ``path`` as JSON.

    ``data`` is first converted with ``JSONSerializer.serialize`` and the
    result is encoded with the standard ``json`` module. An existing file
    at ``path`` is overwritten.
    """
    with open(path, "w") as out:
        payload = JSONSerializer.serialize(data)
        out.write(json.dumps(payload))

def deserialize_instagram_data(path: str) -> dict:
    """Load a JSON file written by ``serialize_instagram_data``.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A dict with the file's top-level keys, where each value has been
        rebuilt into a ``Location`` via ``JSONSerializer.deserialize``.
    """
    # The context manager closes the handle; the original left it open.
    with open(path, "r") as file:
        raw = json.load(file)

    # Build a fresh dict rather than rewriting `raw` while iterating over it.
    return {key: JSONSerializer.deserialize(Location, val) for key, val in raw.items()}

In [13]:
# Replace the in-memory `data` with the contents of palo_alto.json,
# so the analysis below can run without re-scraping.
data = deserialize_instagram_data("palo_alto.json")

# Count total locations

In [25]:
# Each key of `data` is one Location.
location_count = len(data)
print("There are", location_count, "scrapable Locations")

There are 962 scrapable Locations


# Count total sites

In [23]:
# Number of Site entries stored under the "Chicago" Location.
chicago_sites = data["Chicago"].sites
print("We scraped", len(chicago_sites), "sites")

We scraped 924 sites


# Count total posts in Chicago

In [24]:
# Add up the post counts of every Chicago site; `total_posts` is reused by a later cell.
total_posts = sum(site.total_posts for site in data["Chicago"].sites)
print("There are", f"{total_posts:,}", "location-tagged Instagram posts in Chicago.")

There are 31,525,527 location-tagged Instagram posts in Chicago.


# Convert to CSV for ArcGIS Feature Class

In [15]:
import csv  # imported here so this cell is self-contained

# Export the Chicago sites as data.csv with the columns name, link, total_posts, lon, lat.
# csv.writer handles quoting/escaping, so names containing quotes or commas
# still yield a valid file (fields are quoted only when required).
# newline="" is what the csv module expects of the file object;
# lineterminator="\n" keeps the original line endings.
with open("data.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, lineterminator="\n")
    writer.writerow(["name", "link", "total_posts", "lon", "lat"])
    writer.writerows(
        (site.name, site.link, site.total_posts, site.lon, site.lat)
        for site in data["Chicago"].sites
    )

# Quick Visualization

In [15]:
from bokeh.plotting import figure, output_notebook, show
from bokeh.tile_providers import CARTODBPOSITRON, get_provider, OSM, WIKIMEDIA
from bokeh.models import HoverTool, ColumnDataSource
from pyproj import Proj, transform, Transformer

# Coordinate transformer from EPSG:4326 to EPSG:3857; the plots below use mercator axes.
# always_xy=True makes transform() take and return coordinates in (x, y) = (lon, lat) order.
transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)

In [27]:
# Prepare data for bokeh: one list per ColumnDataSource column.
# Sites stored with lon == 0 and lat == 0 have no coordinates and are left out.
located_sites = [s for s in data["Chicago"].sites if not (s.lon == 0 and s.lat == 0)]
projected = [transformer.transform(s.lon, s.lat) for s in located_sites]

x_coords = [point[0] for point in projected]
y_coords = [point[1] for point in projected]
names = [s.name for s in located_sites]
posts = [s.total_posts for s in located_sites]

source = ColumnDataSource(
    data={"x": x_coords, "y": y_coords, "name": names, "total_posts": posts}
)
hover = HoverTool(
    tooltips=[
        ("(x,y)", "(@x, @y)"),
        ("Name", "@name"),
        ("Total Posts", "@total_posts"),
    ]
)

output_notebook()  # Render inside the notebook.

tile_provider = get_provider(WIKIMEDIA)  # Basemap tiles
p = figure(
    x_range=(-9780000, -9745000),
    y_range=(5130000, 5160000),
    x_axis_type="mercator",
    y_axis_type="mercator",
    plot_height=1000,
    plot_width=1000,
)
p.add_tools(hover)
p.add_tile(tile_provider)

# Every site is drawn with the same marker size.
p.circle("x", "y", source=source, size=10, color="navy", alpha=0.5)

show(p)

# Scale the circle size by the number of posts

In [28]:
# Prepare data for bokeh: one list per ColumnDataSource column.
# Sites stored with lon == 0 and lat == 0 have no coordinates and are left out.
located_sites = [s for s in data["Chicago"].sites if not (s.lon == 0 and s.lat == 0)]
projected = [transformer.transform(s.lon, s.lat) for s in located_sites]

x_coords = [point[0] for point in projected]
y_coords = [point[1] for point in projected]
names = [s.name for s in located_sites]
posts = [s.total_posts for s in located_sites]
# Marker size is the post count divided by 10,000.
circle_sizes = [count / 10_000 for count in posts]

source = ColumnDataSource(
    data={
        "x": x_coords,
        "y": y_coords,
        "name": names,
        "total_posts": posts,
        "circle_sizes": circle_sizes,
    }
)
hover = HoverTool(
    tooltips=[
        ("(x,y)", "(@x, @y)"),
        ("Name", "@name"),
        ("Total Posts", "@total_posts"),
    ]
)

output_notebook()  # Render inside the notebook.

tile_provider = get_provider(WIKIMEDIA)  # Basemap tiles
p = figure(
    x_range=(-9780000, -9745000),
    y_range=(5130000, 5160000),
    x_axis_type="mercator",
    y_axis_type="mercator",
    plot_height=1000,
    plot_width=1000,
)
p.add_tools(hover)
p.add_tile(tile_provider)

# size="circle_sizes" reads each marker's size from that source column.
p.circle("x", "y", source=source, size="circle_sizes", color="navy", alpha=0.5)

show(p)

# Remove "Chicago, Illinois" from the plot

In [29]:
# Prepare data for bokeh, leaving out the "Chicago, Illinois" entry
# (presumably a city-wide tag that dwarfs the other markers — confirm).
transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)


def _is_plottable(site):
    """Return True for sites that have coordinates and are not the city-wide entry."""
    has_coordinates = not (site.lon == 0 and site.lat == 0)
    return has_coordinates and site.name != "Chicago, Illinois"


plotted_sites = [s for s in data["Chicago"].sites if _is_plottable(s)]
projected = [transformer.transform(s.lon, s.lat) for s in plotted_sites]

x_coords = [point[0] for point in projected]
y_coords = [point[1] for point in projected]
names = [s.name for s in plotted_sites]
posts = [s.total_posts for s in plotted_sites]
# Marker size is the post count divided by 2000.
circle_sizes = [count / 2000 for count in posts]

source = ColumnDataSource(
    data={
        "x": x_coords,
        "y": y_coords,
        "name": names,
        "total_posts": posts,
        "circle_sizes": circle_sizes,
    }
)
hover = HoverTool(
    tooltips=[
        ("(x,y)", "(@x, @y)"),
        ("Name", "@name"),
        ("Total Posts", "@total_posts"),
    ]
)

output_notebook()  # Render inside the notebook.

tile_provider = get_provider(WIKIMEDIA)  # Basemap tiles
p = figure(
    x_range=(-9780000, -9745000),
    y_range=(5130000, 5160000),
    x_axis_type="mercator",
    y_axis_type="mercator",
    plot_height=1000,
    plot_width=1000,
)
p.add_tools(hover)
p.add_tile(tile_provider)

p.circle("x", "y", source=source, size="circle_sizes", color="navy", alpha=0.5)

show(p)

# Ratio of Posts to Population

In [38]:
# NOTE(review): 8,865,000 looks like a metro-area estimate rather than the city proper,
# while the Palo Alto figure used further down appears to be a city population —
# confirm both figures use the same basis before comparing the two ratios.
chicago_population = 8_865_000  # as of 2020
# `total_posts` is not computed in this cell; it must already exist from an earlier cell
# (presumably the Chicago total — verify the cell execution order).
print("The ratio of posts to population is", total_posts / chicago_population)

The ratio of posts to population is 3.556179018612521


# Do the same for Palo Alto

In [33]:
# Prepare data for bokeh: accumulate one list per ColumnDataSource column.
transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)

columns = {"x": [], "y": [], "name": [], "total_posts": [], "circle_sizes": []}

for entry in data["Palo Alto"].sites:
    missing_coordinates = entry.lon == 0 and entry.lat == 0
    is_citywide_entry = entry.name == "Palo Alto, California"
    if missing_coordinates or is_citywide_entry:
        continue  # nothing meaningful to draw for these

    projected_x, projected_y = transformer.transform(entry.lon, entry.lat)
    columns["x"].append(projected_x)
    columns["y"].append(projected_y)
    columns["name"].append(entry.name)
    columns["total_posts"].append(entry.total_posts)
    # Marker size is the post count divided by 500.
    columns["circle_sizes"].append(entry.total_posts / 500)

# Keep the same module-level names the other plotting cells define.
x_coords = columns["x"]
y_coords = columns["y"]
names = columns["name"]
posts = columns["total_posts"]
circle_sizes = columns["circle_sizes"]

source = ColumnDataSource(data=columns)
hover = HoverTool(
    tooltips=[
        ("(x,y)", "(@x, @y)"),
        ("Name", "@name"),
        ("Total Posts", "@total_posts"),
    ]
)

output_notebook()  # Render inside the notebook.

tile_provider = get_provider(WIKIMEDIA)  # Basemap tiles
p = figure(
    x_range=(-13601459.55, -13595673.83),
    y_range=(4497807.13, 4503100.55),
    x_axis_type="mercator",
    y_axis_type="mercator",
    plot_height=1000,
    plot_width=1000,
)
p.add_tools(hover)
p.add_tile(tile_provider)

p.circle("x", "y", source=source, size="circle_sizes", color="navy", alpha=0.5)

show(p)

In [34]:
# Total posts across all Palo Alto sites, divided by the population.
palo_alto_population = 66_573  # as of 2019
palo_alto_posts = sum(site.total_posts for site in data["Palo Alto"].sites)

print("The ratio of posts to population is", palo_alto_posts / palo_alto_population)

The ratio of posts to population is 14.09885388971505
