In [200]:
from selenium.webdriver import Chrome # !pip install selenium
from random import random
import time
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from uuid import uuid4
import tensorflow as tf #!pip install tensorflow
import matplotlib.pyplot as plt
from PIL import Image
from tensorflow.keras.applications import MobileNetV2
import numpy as np
import os
import pandas as pd

In [162]:
def get_posts(hashtags, n, browser):
    start = time.time()
    # List to hold dictionaries with necessary identifying information for each instagram photo
    posts = []
    
    # Iterate through each hashtag
    for hashtag in hashtags:
        # URL for that hashtag
        url = f"https://www.instagram.com/explore/tags/{hashtag}/"
        browser.get(url)
        post = "https://www.instagram.com/p/"
        
        # empty lists to hold information from that hashtag
        post_links = []
        images = []
        # Run until you have collected n images
        while len(post_links) < n or len(images) < n:

            img_src = [
                img.get_attribute('src')
                for img in browser.find_elements_by_css_selector('article img')
            ]
            links = [
                a.get_attribute('href') for a in browser.find_elements_by_tag_name('a')
            ]

            for link in links:
                if post in link and link not in post_links and len(post_links) < n:
                    post_links.append(link)
            for image in img_src:
                if image not in images and len(images) < n:
                    images.append(image)

            scroll_down = 'window.scrollTo(0, document.body.scrollHeight);'
            browser.execute_script(scroll_down)
            time.sleep(1 + (random() * 5))

        # Create a list of dictionaries containing the link to each post
        # a link the the .jpg version of that image
        # and the hashtag that was used to search it
        posts += [{'post_link': post_links[i],
                'image': images[i],
                'search_hashtag': hashtag} for i in range(len(post_links))]
    end = time.time()
    print(f'Runtime: {end - start} to get {n*len(hashtags)} posts')
    return posts

def get_meta(posts, browser):
    start = time.time()
    # create a list of indices of which dictionaries need to be dropped
    posts_update = []
    # Iterate through each post in list
    for post in posts:
        # assign the url of the post
        url = post['post_link']
        tags = []
        try:
            response = requests.get(url)
            data = response.text
            soup = BeautifulSoup(data, 'html.parser')
            head = soup.find('head')
            hashtags = head.find_all('meta', attrs={'property':'instapp:hashtags'})
            for hashtag in hashtags:
                tags.append(hashtag.get('content'))
            
            # check if tags is empty
            if not tags:
                # remove that post from the list
                print('post had no hashtags')
                
            else:
                # add entry to dictionary for that post
                post['tags'] = tags
                
                # Check if post is a photo or video and only keep photos
                try:
                    browser.get(url)
                    likes = int(browser.find_element_by_xpath("""/html/body/div[1]/section/main/div/div[1]/article/div[3]/section[2]/div/div[1]/button/span""").text)
                    # add number of likes to dictionary
                    post['likes'] = likes
                    # add datetime of post to dictionary
                    posts_update.append(post)
                    post['datetime'] = browser.find_element_by_xpath("""/html/body/div[1]/section/main/div/div[1]/article/div[3]/div[2]/a/time""").get_attribute('datetime')
                except:
                    # That post was a link to a video, not an image
                    print('post is video, not image')
                # pause before making next request    
                time.sleep(1 + (random() * 5))
        except:
            print('failed url: {}'.format(url))
    end = time.time()
    print(f'Runtime: {end - start} to get metadata for {len(posts_update)} posts')
    return posts_update

def get_images(posts):
    start = time.time()
    # rare occurance of NoneType in 'image' observation
    # Create exception to handle those instances and remove those posts
    posts_update = []
    """Download images from given url and add their names to dictionary
    note to self: Write a way to check to make sure a folder for that hashtag category exists"""
    for post in posts:
        uuid = uuid4()
        try:
            urlretrieve(str(post['image']), f"/Users/kacikus/Dropbox/Thinkful_Data_Science_Projects/Capstone4/images/{uuid}.jpg")
            name = f"{uuid}.jpg"
            post['name'] = name  
            posts_update.append(post)      
        except:
            print('exception encountered')
            pass
        
    end = time.time()
    print(f'Runtime: {end - start} to download {len(posts_update)} posts')
    return posts_update

def prepare_images(posts, neural_network, height=160, width=160):
    start = time.time()
    for post in posts:
        path = f"/Users/kacikus/Dropbox/Thinkful_Data_Science_Projects/Capstone4/images/{post['name']}"
        trash = f"/Users/kacikus/.Trash/{post['name']}"
        img = tf.io.read_file(path)
        img = tf.image.decode_image(img)
        img = tf.cast(img, tf.float32)
        img = (img/127.5) - 1
        img = tf.image.resize(img, (height, width))
        # reshape grayscale images to match dimensions of color images
        if img.shape != (160, 160, 3):
            img = tf.concat([img, img, img], axis = 2)
        post['pic'] = img
        # delete image to save space
        os.remove(path)
        # delete image from trash
        #os.remove(trash)
        
        # get vector of 1280 deep features 
        img_np = np.expand_dims(img.numpy(), axis=0)
        deep_features = neural_network.predict(img_np)[0]
        post['deep_features'] = deep_features
    end = time.time()
    print(f'Runtime: {end - start} to process images and get deep features')
    return posts

def get_data(hashtags, n, browser):
    posts = get_posts(hashtags, n, browser)
    posts = get_meta(posts, browser)
    return pd.DataFrame(posts)

In [201]:
# create Neural Network
img_shape = (160, 160, 3)

# create the base model from the pre-trained model MobileNet V2
base_model = MobileNetV2(input_shape = img_shape, include_top = False,
                        weights = 'imagenet')
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

neural_network = tf.keras.Sequential([
    base_model,
    global_average_layer
])

In [None]:
df1 = pd.DataFrame(posts)
df1.to_pickle('/Users/kacikus/Dropbox/Thinkful_Data_Science_Projects/Capstone4/processed_data.pkl')

# Collect new data
1. Adjust hashtags
2. Change 'n' to equal how many images per hashtag you want to collect

In [99]:
# collect data
hashtags = ['fitness','food','nature','mechanicalkeyboard']
browser = Chrome(executable_path='/Users/kacikus/Dropbox/Thinkful_Data_Science_Projects/Capstone4/chromedriver 2')
df = get_data(hashtags,n = 500, browser)
#df = pd.concat([df1,df2]).drop(columns=['name']).drop_duplicates(subset='post_link').reset_index(drop=True)

Runtime: 5192.160176038742 to get 2000 posts
post is video, not image
post is video, not image
post had no hashtags
post had no hashtags
post is video, not image
post had no hashtags
post is video, not image
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post is video, not image
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post is video, not image
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post ha

post is video, not image
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post is video, not image
post had no hashtags
post had no hashtags
post is video, not image
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post is video, not image
post is video, not image
post had no hashtags
post had no hashtags
post is video, not image
post is video, not image
post had no hashtags
post is video, not image
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post is video, not image
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post is video, not image
post had no hashtags
post had no hashtags
post had no hashtags
post had no hashtags
post is video, not image
post is video, not image
post is video, not image
post is video, not image
post is video, not image
post had no hashtags
post had no hashtags
post is video, n

In [167]:
#df = pd.read_csv('/Users/kacikus/Dropbox/Thinkful_Data_Science_Projects/Capstone4/data.csv')
# convert dataframe back to list of dictionaries
posts = [df.iloc[i].to_dict() for i in range(len(df))]
# split up into lists of 50 entries at a time or fewer
chunks = [posts[x:x+100] for x in range(0, len(posts), 100)]

In [168]:
for chunk in chunks:
    chunk = get_images(chunk)
    chunk = prepare_images(chunk, neural_network = neural_network)

Runtime: 7.024293899536133 to download 100 posts
Runtime: 9.778025150299072 to process images and get deep features
Runtime: 7.958451986312866 to download 100 posts
Runtime: 8.393651962280273 to process images and get deep features
Runtime: 5.816488027572632 to download 100 posts
Runtime: 8.176470756530762 to process images and get deep features
Runtime: 6.480806112289429 to download 100 posts
Runtime: 7.971073150634766 to process images and get deep features
Runtime: 6.768360137939453 to download 100 posts
Runtime: 8.108715057373047 to process images and get deep features
Runtime: 6.763459205627441 to download 100 posts
Runtime: 8.102815866470337 to process images and get deep features
Runtime: 6.6255106925964355 to download 100 posts
Runtime: 8.04637622833252 to process images and get deep features
Runtime: 8.78427004814148 to download 100 posts
Runtime: 7.874860048294067 to process images and get deep features
Runtime: 7.53360915184021 to download 100 posts
Runtime: 7.54395103454589

In [169]:
# join chunks into single list again
posts = [post for chunk in chunks for post in chunk]

In [194]:
# Load previous data
df_prev = pd.read_pickle('/Users/kacikus/Dropbox/Thinkful_Data_Science_Projects/Capstone4/processed_data.pkl')
# convert posts to df
df_new = pd.DataFrame(posts).drop(columns='name')

In [195]:
df_prev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3748 entries, 0 to 3747
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   post_link       3748 non-null   object
 1   image           3748 non-null   object
 2   search_hashtag  3748 non-null   object
 3   tags            3748 non-null   object
 4   likes           3748 non-null   int64 
 5   datetime        3748 non-null   object
 6   pic             3748 non-null   object
 7   deep_features   3748 non-null   object
dtypes: int64(1), object(7)
memory usage: 234.4+ KB


In [196]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1576 entries, 0 to 1575
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   post_link       1576 non-null   object
 1   image           1575 non-null   object
 2   search_hashtag  1576 non-null   object
 3   tags            1576 non-null   object
 4   likes           1576 non-null   int64 
 5   datetime        1576 non-null   object
 6   pic             1575 non-null   object
 7   deep_features   1575 non-null   object
dtypes: int64(1), object(7)
memory usage: 98.6+ KB


In [197]:
# Convert posts back to dataframe
# Some of the posts still had the unecessary 'Unnamed' colunn, so drop that as well
# drop name column since we have deleted the pictures
df = pd.concat([df_prev, df_new]).reset_index(drop = True)

In [198]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5324 entries, 0 to 5323
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   post_link       5324 non-null   object
 1   image           5323 non-null   object
 2   search_hashtag  5324 non-null   object
 3   tags            5324 non-null   object
 4   likes           5324 non-null   int64 
 5   datetime        5324 non-null   object
 6   pic             5323 non-null   object
 7   deep_features   5323 non-null   object
dtypes: int64(1), object(7)
memory usage: 332.9+ KB


In [199]:
# save as pickle
name = 'update_processed_data.pkl'
df.to_pickle(f'/Users/kacikus/Dropbox/Thinkful_Data_Science_Projects/Capstone4/{name}')

In [94]:
df['tags']

0       ['cats_of_instagram', 'catstagramcat', 'admire...
1       ['catsofinstagram', 'cats_of_instagram', 'catl...
2       ['blackandwhitecatsofinstagram', 'petsofinstag...
3       ['cats', 'cattos', 'cats_of_instagram', 'sadca...
4       ['miaou', 'catlife', 'queenpika', 'cat', 'chat...
                              ...                        
3743    ['tomandjame', 'future', 'amsterdamdanceevent'...
3744    ['少女前線', 'cute', 'girlsfrontline', 'animegirl'...
3745    ['codingbootcamp', 'startuplife', 'dev', 'codi...
3746    ['sea', 'vendée', 'departementvendee', 'boat',...
3747    ['computers', 'puzzle', 'puzzles', 'technology...
Name: tags, Length: 3748, dtype: object