In [1]:
from tqdm.notebook import tqdm
import pickle
from instascrape import Profile, scrape_posts 
from selenium.webdriver import Chrome
from collections import defaultdict
import pandas as pd
import numpy as np
import os
import time
import random

## Loading post objects

In [5]:
def getPosts(accountNames, driverpath="/Users/inotin/Dropbox/DS/Personal/imageRecognition/chromedriver",
             sessionId = '17483229%3AsvgO9DB4dOd9T8%3A0', numOfPosts = 300):
    """
    Collect Post objects for every account in accountNames and pickle the result.

    Input:
    accountNames (list(str)): list of instagram account names to be scraped
    driverpath (str):         path to the Selenium Chrome driver executable
    sessionId (str):          Session ID which is accepted by Instagram and allows scraping the data.
                              It can be found in the cookies section of the developer tools of your
                              browser with an opened instagram page
    numOfPosts (int):         Number of Post objects to be downloaded for each account

    Output:
    defaultdict(list) of the form {'account1': [[Post1, Post2, ...]], 'account2': [[Post1, ...]], ...}
    Each value is a one-element list wrapping the list of posts, so the posts of an
    account are reached with result[account][0].
    An empty accountNames returns an empty dictionary without starting a browser
    or writing a file.

    Side effects:
    Writes 'postsList_<unix time>_<account names joined by "_">.pkl' into the current
    working directory and prints the number of posts loaded per account.

    NOTE(review): the default driverpath is an absolute path on one machine and the
    default sessionId is a real-looking session cookie committed to the notebook.
    Both defaults are kept only for backward compatibility; pass them explicitly
    (e.g. read from os.environ) and rotate the committed cookie.
    """
    postsDct = defaultdict(list)
    accountNames = list(accountNames)
    if not accountNames:
        # Nothing to scrape: do not launch Chrome and do not write an empty pickle.
        return postsDct

    headers = {"user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36 Edg/87.0.664.57",
           "cookie": f"sessionid={sessionId};"}
    # Computed once, before the loop, so it is always defined when the file is written.
    fileNameTail = '_'.join(accountNames)

    webdriver = Chrome(driverpath)
    try:
        for account in accountNames:
            profile = Profile(account)
            profile.scrape(headers=headers)
            posts = profile.get_posts(amount=numOfPosts, webdriver=webdriver, login_first=True)
            # The posts list is appended as a single element; callers index it with [0].
            postsDct[account].append(posts)
            print(f"Posts loaded for {account}:", len(posts))
    finally:
        # Always release the browser, even if scraping one of the accounts fails.
        webdriver.quit()

    with open(f'postsList_{int(time.time())}_{fileNameTail}.pkl', 'wb') as f:
        pickle.dump(postsDct, f)

    return postsDct

In [6]:
def scrapePosts(account, posts,
                SESSIONID = '17483229%3AOHlfpimTdGEoN4%3A21',
                additionalSessionIDs = ['17483229%3AOHlfpimTdGEoN4%3A21']):
    """
    Scrape every Post in posts and return the collected metadata as a dataframe.

    Input:
    account (str):               account name, used only in the names of the saved files
    posts (list(Post)):          list of Post objects
    SESSIONID (str):             Session ID which is accepted by Instagram and allows scraping the data.
                                 It can be found in the cookies section of the developer tools of your
                                 browser with an opened instagram page
    additionalSessionIDs (list): list of additional session ids (strings). Once at least 5 posts
                                 have been skipped, every further failure switches the cookie to
                                 a randomly chosen id from this list. May be empty or None, in
                                 which case the current session id is kept.

    Output:
    pandas dataframe with one row per successfully scraped post: the keys of
    post.to_dict() plus a 'loadTimestamp' column. Keys missing for a post become NaN.

    Side effects:
    Sleeps a random time (< 5 s) before each request, writes a checkpoint pickle
    '<unix time>_<account>_<i>.pkl' at every post index that is a non-zero multiple of 100,
    and writes the full dataframe to '<unix time>_<account>.pkl' at the end.

    NOTE(review): the default session ids look like real cookies committed to the
    notebook. They are kept only for backward compatibility; pass them explicitly
    (e.g. read from os.environ) and rotate the committed values.
    """
    userAgent = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36 Edg/87.0.664.57"

    def buildHeaders(sessionId):
        # Single place that knows the header layout (initial cookie and rotated cookie).
        return {"user-agent": userAgent, "cookie": f"sessionid={sessionId};"}

    # Copy so the shared default list is never touched; tolerate None.
    fallbackIds = list(additionalSessionIDs or [])
    headers = buildHeaders(SESSIONID)
    # One dict per post: rows stay aligned even if posts expose different keys.
    records = []
    cntSkipped = 0 #Counter of skipped posts

    for i, post in tqdm(enumerate(posts), total = len(posts)):
        try:
            time.sleep(5*random.random()) #This adds some randomness which is useful to avoid blocking from Instagram
            post.scrape(headers=headers)
            record = dict(post.to_dict())
            record['loadTimestamp'] = time.time() #Current time to know when it was loaded
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops the loop.
            cntSkipped += 1
            print(f'skipped post {i}: {err!r}')
            print(f'skipped in total: {cntSkipped}')
            if cntSkipped >= 5 and fallbackIds:
                #If Instagram rejects requests, I try to change Session ID
                headers = buildHeaders(random.choice(fallbackIds))
            continue

        records.append(record)

        if i != 0 and i % 100 == 0: #Every 100 scraped posts are saved separately, kind of checkpoints
            # Name computed once so the printed name is exactly the file written.
            checkpointName = f'{int(time.time())}_{account}_{i}.pkl'
            try:
                pd.DataFrame(records).to_pickle(checkpointName)
                print(f'{checkpointName} saved successfully')
            except Exception as err:
                # A failed checkpoint is not a skipped post: report it and keep scraping.
                print(f'checkpoint {checkpointName} could not be saved: {err!r}')

    dfM = pd.DataFrame(records)
    dfM.to_pickle(f'{int(time.time())}_{account}.pkl') #This is the entire dataframe
    return dfM

In [11]:
# Accounts to collect posts from; 400 posts are requested per account.
listOfAccounts = [
    'visitcalifornia',
    'visitseattle',
    'japantravelcom',
    'visit_singapore',
]
postsDictionary = getPosts(accountNames=listOfAccounts, numOfPosts=400)

Posts loaded for visitcalifornia: 400
Posts loaded for visitseattle: 400
Posts loaded for japantravelcom: 400
Posts loaded for visit_singapore: 400


In [3]:
# # Load previously saved posts (only unpickle files you created yourself: pickle.load can execute arbitrary code)
# with open('postsList_1622978031_visitcalifornia_visitseattle_japantravelcom_visit_singapore.pkl', 'rb') as f:
#     postsDictionary = pickle.load(f)

In [None]:
# Scrape the selected accounts and combine their metadata into one dataframe.
# pd.concat returns a new frame instead of modifying its inputs, so the frames
# are collected first and concatenated once; the result must be assigned.
accountsToScrape = ('japantravelcom', 'visit_singapore')
scrapedFrames = [
    scrapePosts(account, postsDictionary[account][0])  # [0]: getPosts wraps each posts list in a list
    for account in postsDictionary
    if account in accountsToScrape
]
# pd.concat raises on an empty list, so fall back to an empty frame.
dfComplete = pd.concat(scrapedFrames) if scrapedFrames else pd.DataFrame()
dfComplete

HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))

skipped in total: 1
skipped in total: 2
skipped in total: 3
skipped in total: 4
