## In this notebook, images and their corresponding metadata are organized. We take stock of the images that actually exist on disk, combine them with the available metadata and the scraped follower counts, and, after merging and dropping duplicate images, obtain 7702 images in total.

In [10]:
import pandas as pd
import numpy as np
import os
from PIL import Image
import json
from pandas.io.json import json_normalize
import ast

# Root folder that holds the museum sub-folders. The trailing slash is required:
# later cells build paths by plain string concatenation (IMAGE_DIR + folder name).
# NOTE(review): the recorded output of the listing cell prints "./data/data/..."
# paths, so it appears to come from a run with a different IMAGE_DIR — confirm
# before relying on the recorded counts.
IMAGE_DIR = "./images/training/resized/"

### Dataframe (df_imagename) of all existing images: 11181 Images


In [11]:
# List the museum sub-folders and collect the name of every file they contain.
# `im_dirs` keeps the raw listing because the metadata-loading cell iterates it.
im_dirs = os.listdir(IMAGE_DIR)

folder = []
for f in im_dirs:
    museum_path = os.path.join(IMAGE_DIR, f)
    # Skip macOS's .DS_Store and any other stray file: os.listdir() on a
    # non-directory raises NotADirectoryError.
    if f == '.DS_Store' or not os.path.isdir(museum_path):
        continue
    print(IMAGE_DIR+f)
    # extend() grows the list in place instead of copying it on every pass.
    folder.extend(os.listdir(museum_path))

# df_imagename : Dataframe of existing images
# NOTE(review): every directory entry is counted, not only image files. The
# metadata-loading cell opens "<folder>/<folder>.json" under the same IMAGE_DIR,
# so those JSON files are presumably part of this count — confirm.
df_imagename = pd.DataFrame({"filename": folder})
print("Number of existing images: {}".format(df_imagename.filename.size))
# Last expression of the cell so the preview is actually rendered.
df_imagename.head()

./data/data/cablausanne
./data/data/elyseemusee
./data/data/espacearlaud
./data/data/hermitage_lausanne
./data/data/mcbalausanne
./data/data/mudaclausanne
./data/data/olympicmuseum
Number of existing images: 11181


In [12]:
# Takes metadata for museum and returns a dataframe
def load_metadata(file, folder):
    """Load one museum's metadata JSON into a flat DataFrame.

    Parameters
    ----------
    file : file-like object
        Open, readable handle on the JSON document. It is parsed with
        ``json.load`` and is not closed here; closing is the caller's job.
        Presumably the document is a list of post records — confirm.
    folder : str
        Museum name, stored verbatim in a new ``museum`` column.

    Returns
    -------
    pandas.DataFrame
        The normalized records with nested keys flattened into dotted column
        names, ``id`` renamed to ``insta_id``, the unused columns removed and
        ``display_url`` reduced to the part after its last ``/``.

    Raises
    ------
    KeyError
        If the records carry no ``display_url`` key.
    """
    data = json.load(file)

    # Prefer the top-level pandas.json_normalize; fall back to the old
    # pandas.io.json location only on pandas versions that predate it.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    df = normalize(data)
    df['museum'] = folder
    # index=str turns the row labels into strings, exactly as before.
    df = df.rename(index=str, columns={"id": "insta_id"})

    # errors='ignore': a column only exists when at least one record carries the
    # key (e.g. no 'video_view_count' without videos), so a missing column must
    # not abort the load with a KeyError.
    unused_columns = ['comments_disabled', 'edge_media_preview_like.count',
       'edge_media_to_caption.edges', 'edge_media_to_comment.count', 'is_video', 'thumbnail_resources', 'thumbnail_src', 'urls',
       'video_view_count']
    df = df.drop(labels = unused_columns, axis = 1, errors = 'ignore')

    # Keep only the last path component of the URL.
    df['display_url'] = df['display_url'].str.split('/').str[-1]
    return df
       

### Dataframe (df) of images in metadata: Metadata for 8362 images


In [21]:
# Load every museum's metadata file and stack the results into one table.

# Labels written as the CSV header.
# NOTE(review): to_csv(header=...) relabels columns purely by position, so this
# assumes load_metadata() returns exactly these ten columns in this order —
# confirm against the JSON schema and the installed pandas version.
columns = ['height',
           'width',
           'filename',
           'liked_count',
           'insta_id',
           'user_id',
           'shortcode',
           'tags',
           'timestamp',
           'museum']

frames = []
for folder in im_dirs:
    if folder == ".DS_Store":
        continue
    print("Loading {} images".format(folder))
    meta_path = "{image_dir}{folder}/{folder}.json".format(image_dir=IMAGE_DIR, folder = folder)
    # The context manager closes each handle; the bare open() used to leak one
    # file descriptor per museum.
    with open(meta_path) as meta_file:
        frames.append(load_metadata(meta_file, folder))

if not frames:
    raise ValueError("No museum folders found in {}".format(IMAGE_DIR))

# One concat at the end instead of re-copying the accumulated frame per museum.
df = pd.concat(frames, ignore_index = True)

df.to_csv('./images/training/data/merged_metadata.csv', header = columns)
print("Number of images in metadata: {}".format(df.shortcode.size))
# Last expression of the cell so the preview is actually rendered.
df.head()

Loading cablausanne images
Loading elyseemusee images
Loading espacearlaud images
Loading hermitage_lausanne images
Loading mcbalausanne images
Loading mudaclausanne images
Loading olympicmuseum images
Number of images in metadata: 8362


## Script for scraping follower counts. Some of the shortcodes used were not valid, possibly because the images were removed.

In [14]:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from lxml import html

import csv
def write_out_csv(data, filename_base, fieldnames):
    """Write dict rows to ``<filename_base>.csv`` with a header line.

    Parameters
    ----------
    data : iterable of dict
        Rows to write; each dict is keyed by the names in ``fieldnames``.
    filename_base : str
        Output path without the ``.csv`` extension, which is appended here.
    fieldnames : sequence of str
        Column names, in output order; also written as the header row.
    """
    print("Writing to output file %s.csv" % filename_base)
    # newline="" is what the csv module asks for: the writer emits its own
    # "\r\n" terminators, and without it Windows text mode turns them into
    # "\r\r\n", i.e. a blank line after every row.
    with open("%s.csv" % filename_base, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
            
def scrape_followers(lst, output_filename):
    """Scrape a follower count for each Instagram shortcode and save them.

    Uses the module-level ``browser`` and ``wait`` objects, which must be
    created before the call (the commented-out cell below builds them with
    ``webdriver.Chrome()`` and ``WebDriverWait(browser, 15)``).

    Parameters
    ----------
    lst : iterable of str
        Shortcodes; each is appended to ``https://instagram.com/p/``.
    output_filename : str
        Base name (without ``.csv``) handed to ``write_out_csv``.

    Side effects
    ------------
    Closes ``browser``, prints the list of shortcodes that failed and writes
    the ``shortcode``/``followers`` rows collected so far, even when the loop
    is aborted (e.g. with Ctrl-C).
    """
    instagram_data = []
    error_sc = []
    try:
        for code in lst:
            url = "https://instagram.com/p/" + code
            try:
                browser.get(url)
                # NOTE(review): presumably the poster's username link in the
                # post header; the class names are site-generated — confirm.
                elem = wait.until(
                    EC.element_to_be_clickable(
                        (By.XPATH, '//div[@class = "e1e1d"]//a[@class = "FPmhX notranslate nJAzx"]')
                    )
                )

                elem.click()
                # Wait for the page reached by the click before reading it.
                wait.until(
                    EC.element_to_be_clickable((By.XPATH, '//div[@class = "v9tJq "]'))
                )
                # find_element(By.XPATH, ...) replaces find_element_by_xpath,
                # which Selenium 4 removed.
                el = browser.find_element(By.XPATH, "//*")
                parser = html.fromstring(el.get_attribute("outerHTML"))
                # The title attribute carries the count with thousands
                # separators, which are stripped before int().
                raw_followers = parser.xpath(
                    './/ul[@class="k9GMp "]/li[position()=2]//span[@class = "g47SY "]/@title'
                )[0].replace(",", "")

                instagram_data.append({"shortcode": code, "followers": int(raw_followers)})
            except Exception:
                # Best effort: remember the shortcode and carry on. Exception
                # (not a bare except) lets KeyboardInterrupt/SystemExit stop
                # the run.
                error_sc.append(code)
    finally:
        try:
            browser.close()
        finally:
            # Persist whatever was scraped even if closing the browser fails.
            fields = ["shortcode", "followers"]
            print(error_sc)
            write_out_csv(instagram_data, "{}".format(output_filename), fields)


In [None]:
# Uncomment the code below to run scraping for a list of shortcodes
# Load the shortcodes of images for which the followers was not scraped

# with open('error_sc4.txt', 'r') as f:
#     error_sc4 = ast.literal_eval(f.read())
    
# print(len(error_sc4))

# browser = webdriver.Chrome()
# wait = WebDriverWait(browser, 15)
# scrape_followers(error_sc3, "followers4")

### Dataframe (df_followers) of follower counts per shortcode: 8138 rows, of which 8068 shortcodes are unique


In [24]:
# The commented-out lines below concatenate the four scraper outputs into
# scraped_follower_counts.csv; they are kept for reference only.
# lst_followers = [pd.read_csv("followers.csv"), pd.read_csv("followers2.csv"), pd.read_csv("followers3.csv"), pd.read_csv("followers4.csv")]
# df_followers = pd.concat(lst_followers, ignore_index = True)
# df_followers.to_csv("scraped_follower_counts.csv")

# df_followers: scraped follower counts, df_images: merged metadata
# (read in that order).
df_followers, df_images = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./images/training/data/scraped_follower_counts.csv",
        "./images/training/data/merged_metadata.csv",
    )
)

In [25]:
# Row counts of the three inputs before any merging.
print("Number of Follower counts", df_followers.shortcode.size)
print("Number of Follower counts based on unique shortcodes", df_followers.shortcode.unique().size)
print("Number of Images with metadata", df_images.shortcode.size)
# len() counts rows. DataFrame.size counts cells (rows x columns) and only gave
# the row count because df_imagename has a single column.
print("Number of actual Images", len(df_imagename))

Number of Follower counts 8138
Number of Follower counts based on unique shortcodes 8068
Number of Images with metadata 8362
Number of actual Images 11181


### Dataframe (df_final): merge metadata with scraped followers counts.

In [26]:
# Default (inner) join on shortcode: only shortcodes present in both the
# follower counts and the metadata survive. A shortcode repeated on either side
# produces repeated rows here.
df_final = pd.merge(df_followers, df_images, on="shortcode")

In [27]:
# Total vs. distinct filenames, before and after joining the follower counts.
filename_reports = (
    ("From Metadata - Number of unique filenames: {}", df_images.filename.unique()),
    ("From Metadata - Number of filenames: {}", df_images.filename),
    ("Metadata + Followers - Number of unique filenames : {}", df_final.filename.unique()),
    ("Metadata + Followers - Number of filenames: {}", df_final.filename),
)
for template, names in filename_reports:
    print(template.format(names.size))

From Metadata - Number of unique filenames: 8292
From Metadata - Number of filenames: 8362
Metadata + Followers - Number of unique filenames : 8068
Metadata + Followers - Number of filenames: 8280


In [28]:
# Keep one row per shortcode, then score each row as likes per follower.
df_final = df_final.drop_duplicates(subset = ["shortcode"])
df_final['score'] = df_final.liked_count/df_final.followers

# A positive like count over 0 followers divides to +inf; drop those rows.
# (0/0 gives NaN and is kept, as before.) .copy() makes the filtered frame
# independent, so adding a column below does not trigger SettingWithCopyWarning.
df_final = df_final[df_final['score'] != float('inf')].copy()

# Series.min()/max() skip NaN; the builtin min()/max() return order-dependent
# results as soon as one NaN is present.
score_min = df_final.score.min()
score_max = df_final.score.max()
print("min: {}, max: {}".format(score_min, score_max))

# Min-max scaling of the score to the [0, 1] range.
df_final['norm_score'] = (df_final['score'] - score_min)/(score_max - score_min)
print("normalized - min: {}, max: {}".format(df_final.norm_score.min(), df_final.norm_score.max()))

df_final.head()

min: 0.0, max: 5.5
normalized - min: 0.0, max: 1.0


Unnamed: 0,Unnamed: 0_x,shortcode,followers,Unnamed: 0_y,height,width,filename,liked_count,insta_id,user_id,tags,timestamp,museum,score,norm_score
0,0,BpmIyI4D1B7,1027,0,1080,1080,43375222_360744377830252_9185063361956096357_n...,7,1902246522487001211,1807804349,"['drawing', 'drawingfreemovement', 'lesartsdes...",1540985478,cablausanne,0.006816,0.001239
4,1,BpmI8l3j0Hj,1027,1,1350,1080,42976666_185886285649615_1276219388339275027_n...,15,1902247240811889123,1807804349,"['drawing', 'drawingfreemovement', 'lesartsdes...",1540985563,cablausanne,0.014606,0.002656
8,2,BpmI3oUDa4D,1027,2,1080,1080,43191815_456314964775007_4913857480498242612_n...,13,1902246899840036355,1807804349,"['drawing', 'drawingfreemovement', 'lesartsdes...",1540985523,cablausanne,0.012658,0.002301
12,3,BpmIpHdD4LG,1027,3,1080,1080,43913715_324577288333877_7336630694004952687_n...,6,1902245902484996806,1807804349,"['drawing', 'drawingfreemovement', 'lesartsdes...",1540985404,cablausanne,0.005842,0.001062
16,4,BpmItgcDxEu,1027,4,1080,1080,44253033_301748290647731_1721608056805115343_n...,6,1902246204189643054,1807804349,"['drawing', 'drawingfreemovement', 'lesartsdes...",1540985440,cablausanne,0.005842,0.001062


### Dataframe (df_final) -- metadata rows merged with the images that actually exist: 7702 images

In [30]:
# Inner join on filename: only rows whose filename appears in the listing of
# existing files remain.
df_final = pd.merge(df_final, df_imagename, on="filename")

# Distinct vs. total filenames in the listing of existing files.
existing_names = df_imagename.filename
print(existing_names.unique().size)
print(existing_names.size)

# Highest score first, keep the reporting columns, then keep the first
# (best-scored) row for each filename.
kept_columns = ['filename', 'museum', 'score', 'liked_count', 'followers', 'norm_score']
df_final = (
    df_final
    .sort_values(by = "score", ascending=False)[kept_columns]
    .drop_duplicates(subset = "filename")
)
print("Number of existing images merged with metadata: {}".format(df_final.filename.size))
df_final.to_csv('./images/training/data/image_data_final.csv')

11161
11181
Number of existing images merged with metadata: 7702


In [28]:
# Reload the saved table from disk. read_csv is a pandas module-level function,
# not a DataFrame method, so `df_final.read_csv(...)` raised AttributeError.
df_final = pd.read_csv('./images/training/data/image_data_final.csv')

In [33]:
# Row count of the final table.
# NOTE(review): this used to read `df`, the metadata frame from the loading cell
# (8362 rows in the recorded run), yet the recorded output is 7702, the size of
# df_final. The old result presumably relied on `df` having been reassigned in a
# cell that no longer exists — confirm.
df_final.filename.size

7702

In [31]:
# Scraped follower counts, re-read from disk; the cell displays the first rows.
df_followers = pd.read_csv(
    "./images/training/data/scraped_follower_counts.csv"
)
df_followers.head(n=5)

Unnamed: 0.1,Unnamed: 0,shortcode,followers
0,0,BpmIyI4D1B7,1027
1,1,BpmI8l3j0Hj,1027
2,2,BpmI3oUDa4D,1027
3,3,BpmIpHdD4LG,1027
4,4,BpmItgcDxEu,1027


In [32]:
# Merged metadata, re-read from disk; the cell displays the first rows.
df_images = pd.read_csv(
    "./images/training/data/merged_metadata.csv"
)
df_images.head(n=5)

Unnamed: 0.1,Unnamed: 0,height,width,filename,liked_count,insta_id,user_id,shortcode,tags,timestamp,museum
0,0,1080,1080,43375222_360744377830252_9185063361956096357_n...,7,1902246522487001211,1807804349,BpmIyI4D1B7,"['drawing', 'drawingfreemovement', 'lesartsdes...",1540985478,cablausanne
1,1,1350,1080,42976666_185886285649615_1276219388339275027_n...,15,1902247240811889123,1807804349,BpmI8l3j0Hj,"['drawing', 'drawingfreemovement', 'lesartsdes...",1540985563,cablausanne
2,2,1080,1080,43191815_456314964775007_4913857480498242612_n...,13,1902246899840036355,1807804349,BpmI3oUDa4D,"['drawing', 'drawingfreemovement', 'lesartsdes...",1540985523,cablausanne
3,3,1080,1080,43913715_324577288333877_7336630694004952687_n...,6,1902245902484996806,1807804349,BpmIpHdD4LG,"['drawing', 'drawingfreemovement', 'lesartsdes...",1540985404,cablausanne
4,4,1080,1080,44253033_301748290647731_1721608056805115343_n...,6,1902246204189643054,1807804349,BpmItgcDxEu,"['drawing', 'drawingfreemovement', 'lesartsdes...",1540985440,cablausanne


In [34]:
# Final table, re-read from disk: images that exist and have both metadata and
# a follower count. The cell displays the first rows.
df_final = pd.read_csv(
    './images/training/data/image_data_final.csv'
)
df_final.head(n=5)

Unnamed: 0.1,Unnamed: 0,filename,museum,score,liked_count,followers,norm_score
0,0,10643895_351134671757704_1580281853_n.jpg,olympicmuseum,5.5,11,2,1.0
1,1,21689448_159316801317448_8056597609155919872_n...,hermitage_lausanne,4.0,60,15,0.727273
2,2,38481848_182395105874863_2128305798684606464_n...,olympicmuseum,3.142857,22,7,0.571429
3,3,42186745_148035649484120_738291455511040130_n.jpg,olympicmuseum,2.375,19,8,0.431818
4,4,13712125_1103474473079329_975621396_n.jpg,olympicmuseum,2.107143,59,28,0.383117
