In [1]:
import instaloader
import re #regular expressions
import pandas as pd
import unicodedata
import numpy as np
from os import path
from os import listdir
from os.path import isfile, join
import os
import datetime
import warnings
import itertools
import time
from shutil import rmtree

In [2]:
# Read the user agent string that is later passed to Instaloader.
with open('instagram_mobile_user_agent.txt', 'r') as f:
    # Strip surrounding whitespace: a trailing newline (commonly appended by text
    # editors) would otherwise become part of the User-Agent header value.
    mobile_user_agent = f.read().strip()
mobile_user_agent

'Mozilla/5.0 (iPhone; CPU iPhone OS 15_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Instagram 248.1.0.13.112 (iPhone11,2; iOS 15_6_1; en_US; en-US; scale=3.00; 1125x2436; 390885564) NW/3'

In [3]:
# Get the Instaloader instance used by the rest of the notebook (as the global `L`).
# Use iphone user agent so requests to Instagram look less suspicious
L = instaloader.Instaloader(user_agent = mobile_user_agent)

# Login to Instagram using session file created with instaloader --login in terminal.
# Each person running the code needs to do this themselves (and replace the username below with their own).
# Instagram seems to require being logged in to access most information.
# Don't use an account you care a lot about - it could get banned due to scraping
L.load_session_from_file('kazimiersmith')

Loaded session from C:\Users\kas1112\AppData\Local\Instaloader\session-kazimiersmith.


In [4]:
# Load the full list of influencers. Later cells read its 'num_followers' and 'username' columns.
influencer_list_full = pd.read_csv('list_influencers.csv', encoding = 'utf-8')
# Alternative input file kept for reference (presumably a shorter list for quick test runs - TODO confirm)
#influencer_list_full = pd.read_csv('list_influencers_5.csv', encoding = 'utf-8')

In [5]:
# Sample for the initial regression of engagement on sponsorship: usernames of
# influencers with strictly more than 50,000 and strictly fewer than 200,000 followers.
follower_counts = influencer_list_full['num_followers']
in_follower_range = (follower_counts > 50000) & (follower_counts < 200000)
influencer_list = influencer_list_full.loc[in_follower_range, 'username']
# To use every influencer instead:
#influencer_list = influencer_list_full['username']

In [6]:
def user_to_json(influencer, num_posts, replace_json = True):
    """Save metadata for the first `num_posts` posts of `influencer` into the json subfolder.

    One file per post is written via L.save_metadata_json, named after the post's
    shortcode and placed in <current working directory>/json.

    Parameters:
        influencer: Instagram username whose posts are downloaded.
        num_posts: maximum number of posts taken from profile.get_posts().
        replace_json: if True (default), always write the metadata file. If False,
            skip posts whose '<shortcode>.json.xz' file already exists.
    """
    print('Downloading posts from', influencer)
    profile = instaloader.Profile.from_username(L.context, influencer)

    # TODO does get_posts return the most recent posts first?
    # NOTE(review): pinned posts may be listed before newer ones - confirm.
    posts = profile.get_posts()

    # The target folder does not change between posts, so compute it once.
    json_dir = os.path.join(os.getcwd(), 'json')

    for post in itertools.islice(posts, num_posts):
        # Do not call this variable `path`: that name would shadow the `path` module
        # imported from os, and `path.exists(...)` would then be looked up on a str
        # and raise AttributeError whenever replace_json is False.
        json_stem = os.path.join(json_dir, post.shortcode)
        # Assumes save_metadata_json appends '.json.xz' to the name it is given
        # (compressed JSON output) - TODO confirm for the installed version.
        if replace_json or not os.path.exists(json_stem + '.json.xz'):
            L.save_metadata_json(json_stem, post)

In [7]:
# Note: if the json subfolder exists from a previous run, it may contain files corresponding to posts that aren't in the five
# most recent posts of a given influencer anymore. So, delete the json subfolder
json_path = os.path.join(os.getcwd(), 'json')
if path.exists(json_path):
    # Delete exactly the folder that was checked above, not a separately spelled relative path
    rmtree(json_path)

In [8]:
# Save metadata for up to 5 posts of every selected influencer
# (user_to_json writes one file per post into the json subfolder).
for influencer in influencer_list:
    user_to_json(influencer, 5)

Downloading posts from fooddolls
json json json json json Downloading posts from nycfoodcoma
json json json json json Downloading posts from therealfoodrds
Downloading posts from dadaeats
json json json json json Downloading posts from wholesome_lee
json json json json json Downloading posts from upbeetandkaleingit
json json json json json Downloading posts from fitmittenkitchen
json json json json json Downloading posts from purely_elizabeth
json json json json json Downloading posts from healthymoodsf
json json json json json Downloading posts from thecuttingveg
json json json json json Downloading posts from mississippivegan
json json json json json Downloading posts from byanjushka
json json json json json Downloading posts from laurafruitfairy
json json json json json Downloading posts from lastingredient
json json json json json Downloading posts from localhaven
json json json json json Downloading posts from melissas_healthykitchen
json json json json json Downloading posts from

In [9]:
# Function to grab objects of interest from post object

def objects_from_post(post):
    """Collect the fields of interest from a post object into a flat dict.

    Parameters:
        post: object exposing the attributes read below (shortcode, date,
            owner_profile, owner_username, location, url, likes, comments,
            caption, caption_hashtags, caption_mentions, is_sponsored,
            sponsor_users) and a get_comments() method.

    Returns:
        dict with one entry per output column. Notes on individual keys:
        - 'shortcode' holds the full post URL, not the bare shortcode.
        - 'location_name', 'location_lat', 'location_lng' are NaN when the
          post has no location.
        - 'owner_comment_hashtags' is NaN when the post has no comments;
          otherwise it is the sorted list of unique hashtags (without '#')
          found in comments written by the post's owner (possibly empty).

    Any exception raised while reading the post's attributes propagates to
    the caller.
    """
    # Despite the key name, this is the post's URL built from its shortcode
    shortcode = 'https://www.instagram.com/p/' + post.shortcode

    # Date
    postdate = post.date

    # Profile
    profile = post.owner_profile

    # Username of post's owner
    profile_username = post.owner_username

    # Extract location (missing locations are recorded as NaN)
    location = post.location
    if location:
        loc_name = location.name
        loc_lng = location.lng
        loc_lat = location.lat
    else:
        loc_name = float('nan')
        loc_lng = float('nan')
        loc_lat = float('nan')

    # Extract image URL
    image_url = post.url

    # Number of likes
    likes_num = post.likes

    # Number of comments
    comments_num = post.comments

    # Caption, its hashtags, and the profiles mentioned in it
    caption = post.caption
    caption_hashtags = post.caption_hashtags
    caption_mention = post.caption_mentions

    # Whether the post is sponsored (i.e. "Paid partnership with...")
    sponsored = post.is_sponsored

    # List of the post's sponsors (usernames)
    sponsors = [p.username for p in post.sponsor_users]

    # Number of followers
    followers_num = profile.followers

    # Influencers sometimes put hashtags in a separate comment, usually the first comment on the post.
    # post.get_comments() does not necessarily return the chronologically earliest comment as the first
    # item, so instead of looking for the first comment we scan every comment written by the post's owner.
    if comments_num > 0:
        owner_comments = (c for c in post.get_comments() if c.owner.username == profile_username)
        # len(part) > 1 skips a bare '#', which would otherwise produce an empty hashtag
        owner_hashtags = {part[1:]
                          for c in owner_comments
                          for part in c.text.split()
                          if part.startswith('#') and len(part) > 1}
        # Sort so the output does not depend on set iteration order, which varies between runs
        owner_comment_hashtags_unique = sorted(owner_hashtags)
    else:
        owner_comment_hashtags_unique = float('nan')

    data = {'shortcode': shortcode,
            'date': postdate,
            'profile_username': profile_username,
            'location_name': loc_name,
            'location_lat': loc_lat,
            'location_lng': loc_lng,
            'image_url': image_url,
            'likes_num': likes_num,
            'comments_num': comments_num,
            'caption': caption,
            'caption_hashtags': caption_hashtags,
            'caption_mention': caption_mention,
            'sponsored': sponsored,
            'sponsors': sponsors,
            'followers_num': followers_num,
            'owner_comment_hashtags': owner_comment_hashtags_unique}

    return data

In [10]:
def empty_dict(shortcode):
    """Build a placeholder record for a post whose details could not be read.

    The result has the same keys, in the same order, as the dict returned by
    objects_from_post: 'shortcode' holds the given value and every other
    field is NaN.
    """
    missing_fields = ('date',
                      'profile_username',
                      'location_name',
                      'location_lat',
                      'location_lng',
                      'image_url',
                      'likes_num',
                      'comments_num',
                      'caption',
                      'caption_hashtags',
                      'caption_mention',
                      'sponsored',
                      'sponsors',
                      'followers_num',
                      'owner_comment_hashtags')

    placeholder = {'shortcode': shortcode}
    for field in missing_fields:
        placeholder[field] = float('nan')
    return placeholder

In [11]:
# Turn every saved post metadata file in the json subfolder into one row of data.csv
mypath = os.path.join(os.getcwd(), 'json')
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

# Only get jsonfiles (compressed metadata files end in '.xz').
# Sorted so the row order of the output does not depend on the directory listing order.
jsonfiles = sorted(f for f in onlyfiles if f.endswith('.xz'))

count = 0
list_dicts = []

for count, file in enumerate(jsonfiles, start = 1):
    print('Post {} of {}\n'.format(count, len(jsonfiles)))

    # Files are named '<shortcode>.json.xz' by the download step, so the file name gives a
    # fallback shortcode for the placeholder row in case the file itself cannot be loaded.
    shortcode = file.split('.')[0]

    # Any failure (unreadable file or an error while querying post details) yields a
    # placeholder row instead of aborting the whole run.
    try:
        post = instaloader.load_structure_from_file(L.context, os.path.join(mypath, file))
        shortcode = post.shortcode
        list_dicts.append(objects_from_post(post))
    except Exception as e:
        print('Error getting post information:', e)
        # Use the same URL form that objects_from_post stores under 'shortcode',
        # so the column is consistent between normal and placeholder rows.
        list_dicts.append(empty_dict('https://www.instagram.com/p/' + shortcode))

df = pd.DataFrame(list_dicts)
#df = df.join(pd.DataFrame({'company':[company] * len(list_dicts)}))
#df = df.join(pd.DataFrame({'type':[post_type] * len(list_dicts)}))

# Encoding needs to be UTF8-sig, otherwise apostrophes, emojis etc. get messed up
df.to_csv('data.csv', encoding = 'utf-8-sig') 

# We round up so we minus a day
#timeseries = df.groupby(df['date'].dt.round("D") - datetime.timedelta(days = 1)).sum()
#timeseries.to_csv(company + "/output/" + company + "_" + post_type + "_ts.csv")
# return df, timeseries

Post 1 of 255

Post 2 of 255

Post 3 of 255

Post 4 of 255

Post 5 of 255

Post 6 of 255

Post 7 of 255

Post 8 of 255

Post 9 of 255

Post 10 of 255

Post 11 of 255

Post 12 of 255

Post 13 of 255

Post 14 of 255

Post 15 of 255

Post 16 of 255

Post 17 of 255

Post 18 of 255

Post 19 of 255

Post 20 of 255

Post 21 of 255

Post 22 of 255

Post 23 of 255

Post 24 of 255

Post 25 of 255

Post 26 of 255

Post 27 of 255

Post 28 of 255

Post 29 of 255

Post 30 of 255

Post 31 of 255

Post 32 of 255

Post 33 of 255

Post 34 of 255

Post 35 of 255

Post 36 of 255

Post 37 of 255

Post 38 of 255

Post 39 of 255

Post 40 of 255

Post 41 of 255

Post 42 of 255

Post 43 of 255

Post 44 of 255

Post 45 of 255

Post 46 of 255

Post 47 of 255

Post 48 of 255

Post 49 of 255

Post 50 of 255

Post 51 of 255

Post 52 of 255

Post 53 of 255

Post 54 of 255

Post 55 of 255

Post 56 of 255

Post 57 of 255

Post 58 of 255

Post 59 of 255

Post 60 of 255

Post 61 of 255

Post 62 of 255

Post 63 of 255

P

JSON Query to explore/locations/2019740261589745/: HTTP error code 560. [retrying; skip with ^C]


Post 110 of 255

Post 111 of 255

Post 112 of 255

Post 113 of 255

Post 114 of 255

Post 115 of 255

Post 116 of 255

Post 117 of 255

Post 118 of 255

Post 119 of 255

Post 120 of 255

Post 121 of 255

Post 122 of 255

Post 123 of 255

Post 124 of 255

Post 125 of 255

Post 126 of 255

Post 127 of 255

Post 128 of 255

Post 129 of 255

Post 130 of 255

Post 131 of 255

Post 132 of 255

Post 133 of 255

Post 134 of 255

Post 135 of 255

Post 136 of 255

Post 137 of 255

Post 138 of 255

Post 139 of 255

Post 140 of 255

Post 141 of 255

Post 142 of 255

Post 143 of 255

Post 144 of 255

Post 145 of 255

Post 146 of 255

Post 147 of 255

Post 148 of 255

Post 149 of 255

Post 150 of 255

Post 151 of 255

Post 152 of 255

Post 153 of 255

Post 154 of 255

Post 155 of 255

Post 156 of 255

Post 157 of 255

Post 158 of 255

Post 159 of 255

Post 160 of 255

Post 161 of 255

Post 162 of 255

Post 163 of 255

Post 164 of 255

Post 165 of 255

Post 166 of 255

Post 167 of 255

Post 168 of 25