# Basic Outline of the project:

**Goal:**
Increase turnout of young black people in the 2020 Georgia runoff election.  

**How:**
Identify local IG influencers in majority black zip codes and ask them to post pro-voting messages to their IG pages 

**Steps to identify local IG influencers:**
* go to http://zipatlas.com/us/ga/zip-code-comparison/percentage-black-population.htm, download the main table
    * for now, we're just using a single page of the table (the code below loads page 2 of the listing, i.e. rows 101-200).
* Calculate total black population in each zip code, then sort by black population descending.
* take the first 20 zip codes (for now, could be more later), search the Yelp API, download the first 100 locations for each zip code.
    * Remove duplicates
    * Remove chains
* Enter each location's name in the IG search bar, copy the first url that shows up.
    * Drop urls to hashtags, locations, etc. 
* Search the IG handles you've collected, download the JSON file for the profile, collect relevant information, calculate influencer scores.
    * Drop private accounts, accounts with no posts, and accounts that were deleted between when you first got the handle and when you download the profile information (weird, but it happened).  If there are other corner cases to deal with, we'll cross that bridge when we get to it.
    

In [157]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

import re

import time
import random

from datetime import datetime, timedelta
import json

import os

import numpy as np

from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 

import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
#from nltk.corpus import wordnet

import string

stop_words = stopwords.words('english')

import spacy
nlp = spacy.load('en_core_web_md')

pd.set_option('display.max_rows', None)


**Get the table from zip atlas**
http://zipatlas.com/us/ga/zip-code-comparison/percentage-black-population.htm

In [2]:
# Note the ".2" before ".htm": this URL appears to be the second page of the
# listing (the rows shown in the output below start at #101).
url = 'http://zipatlas.com/us/ga/zip-code-comparison/percentage-black-population.2.htm'

# read_html returns a list with every table on the page whose text matches "Location".
zip_tables = pd.read_html(url, match="Location")

In [60]:
len(zip_tables)

4

**Need to iterate through all the tables returned.  It turns out the table we want is indexed as 3.**

In [4]:
zip_tables[3].head(10)

Unnamed: 0,0,1,2,3,4,5,6
0,#,Zip Code,Location,City,Population,% Blacks,National Rank
1,101.,31714,"31.720230, -83.693798","Ashburn, Georgia",6938,50.18 %,"#1,290"
2,102.,30429,"32.160670, -81.939957","Hagan, Georgia",568,49.82 %,"#1,307"
3,103.,30441,"32.797066, -82.224962","Midville, Georgia",2034,49.70 %,"#1,311"
4,104.,31405,"32.034832, -81.164132","Savannah, Georgia",32887,49.36 %,"#1,328"
5,105.,31803,"32.304765, -84.521477","Buena Vista, Georgia",4706,49.15 %,"#1,335"
6,106.,30438,"32.165611, -82.020521","Manassas, Georgia",222,48.64 %,"#1,350"
7,107.,31078,"32.694909, -84.074538","Roberta, Georgia",3658,48.60 %,"#1,357"
8,108.,30456,"32.973657, -81.817412","Sardis, Georgia",2044,48.18 %,"#1,383"
9,109.,31779,"31.112560, -84.180066","Pelham, Georgia",9344,47.97 %,"#1,393"


# Need to set first row as the column names

In [5]:
# Row 0 of the scraped table holds the real header text (see the head() output above).
col_names = zip_tables[3].loc[0,:]

In [6]:
# Map every placeholder column label to the header text stored in row 0.
rename_dict = {
    placeholder: col_names[position]
    for position, placeholder in enumerate(zip_tables[3].columns)
}

In [7]:
# Build a fresh frame: apply the real header names, then discard row 0 (the header row itself).
black_zip_codes_df = (
    zip_tables[3]
    .rename(columns=rename_dict)
    .drop([0])
)
black_zip_codes_df.head(10)

Unnamed: 0,#,Zip Code,Location,City,Population,% Blacks,National Rank
1,101.0,31714,"31.720230, -83.693798","Ashburn, Georgia",6938,50.18 %,"#1,290"
2,102.0,30429,"32.160670, -81.939957","Hagan, Georgia",568,49.82 %,"#1,307"
3,103.0,30441,"32.797066, -82.224962","Midville, Georgia",2034,49.70 %,"#1,311"
4,104.0,31405,"32.034832, -81.164132","Savannah, Georgia",32887,49.36 %,"#1,328"
5,105.0,31803,"32.304765, -84.521477","Buena Vista, Georgia",4706,49.15 %,"#1,335"
6,106.0,30438,"32.165611, -82.020521","Manassas, Georgia",222,48.64 %,"#1,350"
7,107.0,31078,"32.694909, -84.074538","Roberta, Georgia",3658,48.60 %,"#1,357"
8,108.0,30456,"32.973657, -81.817412","Sardis, Georgia",2044,48.18 %,"#1,383"
9,109.0,31779,"31.112560, -84.180066","Pelham, Georgia",9344,47.97 %,"#1,393"
10,110.0,31643,"30.770770, -83.558433","Quitman, Georgia",9387,47.74 %,"#1,402"


**Are the population values strings or integers?**

In [8]:
type(black_zip_codes_df.loc[3, "Population"])

str

**Need to cast the Population column as integers**

In [9]:
# The scraped values are strings (checked in the cell above); cast them so numeric comparisons work.
black_zip_codes_df['Population'] = black_zip_codes_df['Population'].astype(int)

In [10]:
black_zip_codes_df['Population'].describe()

count      100.000000
mean      9034.420000
std      10652.452133
min        222.000000
25%       1913.750000
50%       4551.000000
75%      11907.000000
max      42232.000000
Name: Population, dtype: float64

**figure out where to threshold the zip codes so there's sufficient population to contact**

In [12]:
population = black_zip_codes_df['Population']
pop_mean, pop_sd = population.mean(), population.std()

# Number of zip codes whose population is strictly above each candidate cutoff.
size_above = {
    cutoff: len(black_zip_codes_df[population > cutoff])
    for cutoff in (1000, 2500, 5000)
}
one_k = size_above[1000]
five_k = size_above[5000]

print(f"mean: {pop_mean}. std = {pop_sd}.")
print(f"total population list: {len(black_zip_codes_df)}")
for cutoff in (1000, 2500, 5000):
    print(f"total size, subtracting zip codes with less than {cutoff} people: {size_above[cutoff]}.")


mean: 9034.42. std = 10652.452133104416.
total population list: 100
total size, subtracting zip codes with less than 1000 people: 87.
total size, subtracting zip codes with less than 2500 people: 65.
total size, subtracting zip codes with less than 5000 people: 47.


In [13]:
# Discard (in place) every zip code whose total population is 5,000 or fewer.
is_small = black_zip_codes_df['Population'] <= 5000
indices_to_drop_5k = black_zip_codes_df.index[is_small]
black_zip_codes_df.drop(index=indices_to_drop_5k, inplace=True)


In [14]:
black_zip_codes_df.head()

Unnamed: 0,#,Zip Code,Location,City,Population,% Blacks,National Rank
1,101.0,31714,"31.720230, -83.693798","Ashburn, Georgia",6938,50.18 %,"#1,290"
4,104.0,31405,"32.034832, -81.164132","Savannah, Georgia",32887,49.36 %,"#1,328"
9,109.0,31779,"31.112560, -84.180066","Pelham, Georgia",9344,47.97 %,"#1,393"
10,110.0,31643,"30.770770, -83.558433","Quitman, Georgia",9387,47.74 %,"#1,402"
11,111.0,31211,"32.912983, -83.624917","Macon, Georgia",15876,47.49 %,"#1,414"


In [66]:
len(black_zip_codes_df)

47

# We decided to rank the zip codes by total black population, so we're calculating that, then sorting the dataframe by total black population, descending.

In [16]:
# '% Blacks' arrives as text such as "50.18 %". Strip whitespace and the percent sign
# instead of slicing off a fixed two characters, so "50.18%" parses as well.
# The dtype guard makes the cell safe to re-run once the column is already numeric.
if black_zip_codes_df['% Blacks'].dtype == object:
    black_zip_codes_df['% Blacks'] = (
        black_zip_codes_df['% Blacks']
        .str.strip()
        .str.rstrip('%')
        .astype(float)
        / 100
    )

# Estimated number of black residents = total population x black share.
black_zip_codes_df["Black Population"] = black_zip_codes_df["Population"] * black_zip_codes_df["% Blacks"]
black_zip_codes_df.sort_values(by=['Black Population'], ascending=False, inplace=True)

In [19]:
black_zip_codes_df.head(20)

Unnamed: 0,index,#,Zip Code,Location,City,Population,% Blacks,National Rank,Black Population
0,51,151.0,31061,"33.059367, -83.221587","Milledgeville, Georgia",39231,0.417,"#1,748",16359.327
1,4,104.0,31405,"32.034832, -81.164132","Savannah, Georgia",32887,0.4936,"#1,328",16233.0232
2,66,166.0,31707,"31.525260, -84.301465","Albany, Georgia",41502,0.3879,"#1,936",16098.6258
3,36,136.0,31313,"31.865374, -81.577900","Hinesville, Georgia",35885,0.4431,"#1,602",15900.6435
4,81,181.0,30236,"33.521612, -84.328432","Jonesboro, Georgia",42232,0.3648,"#2,102",15406.2336
5,78,178.0,30223,"33.287652, -84.274587","Griffin, Georgia",34765,0.3742,"#2,031",13009.063
6,40,140.0,30904,"33.478289, -82.014102","Augusta, Georgia",28323,0.4334,"#1,649",12275.1882
7,65,165.0,30297,"33.612121, -84.373929","Forest Park, Georgia",29757,0.3889,"#1,931",11572.4973
8,60,160.0,31021,"32.493414, -82.943064","Dublin, Georgia",27457,0.402,"#1,839",11037.714
9,89,189.0,30008,"33.897634, -84.589572","Marietta, Georgia",31330,0.352,"#2,194",11028.16


In [18]:
# Renumber rows 0..n-1 in sorted order; the previous row labels are kept in a new 'index' column.
black_zip_codes_df = black_zip_codes_df.reset_index()

In [22]:
black_zip_codes_df.columns

Index(['Zip Code', 'Location', 'City', 'Population', '% Blacks',
       'National Rank', 'Black Population'],
      dtype='object')

In [21]:
# errors='ignore' keeps this cell re-runnable: a second run finds the columns already gone
# and would otherwise raise KeyError.
black_zip_codes_df.drop(columns=['index', '#'], inplace=True, errors='ignore')

In [23]:
black_zip_codes_df.head()

Unnamed: 0,Zip Code,Location,City,Population,% Blacks,National Rank,Black Population
0,31061,"33.059367, -83.221587","Milledgeville, Georgia",39231,0.417,"#1,748",16359.327
1,31405,"32.034832, -81.164132","Savannah, Georgia",32887,0.4936,"#1,328",16233.0232
2,31707,"31.525260, -84.301465","Albany, Georgia",41502,0.3879,"#1,936",16098.6258
3,31313,"31.865374, -81.577900","Hinesville, Georgia",35885,0.4431,"#1,602",15900.6435
4,30236,"33.521612, -84.328432","Jonesboro, Georgia",42232,0.3648,"#2,102",15406.2336


In [24]:
black_zip_codes_df.loc[21:,:]

Unnamed: 0,Zip Code,Location,City,Population,% Blacks,National Rank,Black Population
21,30308,"33.770877, -84.377873","Atlanta, Georgia",11796,0.4634,"#1,480",5466.2664
22,31408,"32.126446, -81.201570","Savannah, Georgia",11131,0.4739,"#1,421",5274.9809
23,31791,"31.528624, -83.894073","Sylvester, Georgia",12927,0.385,"#1,954",4976.895
24,31029,"33.034238, -83.928676","Forsyth, Georgia",13839,0.3555,"#2,164",4919.7645
25,30401,"32.611263, -82.351410","Swainsboro, Georgia",12624,0.3772,"#2,007",4761.7728
26,30273,"33.582010, -84.270013","Rex, Georgia",11412,0.4165,"#1,751",4753.098
27,31779,"31.112560, -84.180066","Pelham, Georgia",9344,0.4797,"#1,393",4482.3168
28,31643,"30.770770, -83.558433","Quitman, Georgia",9387,0.4774,"#1,402",4481.3538
29,30204,"33.060720, -84.130363","Barnesville, Georgia",12240,0.3486,"#2,225",4266.864
30,30642,"33.576400, -83.207308","Greensboro, Georgia",9885,0.4181,"#1,734",4132.9185


# Use Yelp API to Download Businesses from the zip codes we want
Info on requesting API key and search parameters here: https://www.yelp.com/developers/documentation/v3/get_started
Tutorial here: https://python.gotrained.com/yelp-fusion-api-tutorial/
https://www.yelp.com/developers

In [25]:
# Yelp Fusion credentials. Prefer setting YELP_CLIENT_ID / YELP_API_KEY in the environment
# over pasting real keys into the notebook; the literals below are only placeholders.
client_id = os.environ.get('YELP_CLIENT_ID', 'my_client_id')
api_key = os.environ.get('YELP_API_KEY', 'my_api_key')
yelp_url = 'https://api.yelp.com/v3'
business_search_url = 'https://api.yelp.com/v3/businesses/search'

headers = {'Authorization': 'Bearer %s' % api_key}

YELP_PAGE_SIZE = 50       # results requested per call
YELP_PAGES_PER_ZIP = 2    # 2 pages x 50 results = first 100 businesses per zip code


def _fetch_yelp_page(location, offset):
    """Return the list of business records for one page of Yelp search results.

    Raises requests.HTTPError on a non-2xx response so that a bad key or a rate
    limit is reported as such instead of surfacing later as a KeyError.
    """
    params = {'location': location, 'limit': YELP_PAGE_SIZE, 'offset': offset}
    response = requests.get(business_search_url, params=params, headers=headers, timeout=30)
    response.raise_for_status()
    return json.loads(response.text).get('businesses', [])


def _summarize_businesses(businesses):
    """Collapse Yelp business records into the parallel lists stored in business_dict."""
    return {
        "names": [business['name'] for business in businesses],
        "review_counts": [business['review_count'] for business in businesses],
        "ratings": [business['rating'] for business in businesses],
        "listed_zip_codes": [business['location']['zip_code'] for business in businesses],
        # Category titles are flattened to one string, e.g. "Bakeries; Sandwiches; ".
        "categories": [
            ''.join(category['title'] + '; ' for category in business['categories'])
            for business in businesses
        ],
    }


business_dict = {}

# This run covers the rows labelled 21 onward (.loc slicing is label-based and inclusive).
# Change the slice below to query a different set of zip codes.
for _, row in black_zip_codes_df.loc[21:,:].iterrows():

    location = row['City'].replace("Georgia", "GA") + " " + row['Zip Code']

    businesses = []
    for page in range(YELP_PAGES_PER_ZIP):
        businesses.extend(_fetch_yelp_page(location, page * YELP_PAGE_SIZE))
        # No documented need to throttle was found; pausing between calls is just caution.
        time.sleep(random.randint(50,150)/100)

    business_dict[row['Zip Code']] = _summarize_businesses(businesses)



In [28]:
business_dict['30308']

{'names': ["Poor Calvin's",
  "Mary Mac's Tea Room",
  'Aviva by Kameel - Atlanta',
  'Atlanta Breakfast Club',
  'South City Kitchen Midtown',
  'Herban Fix - Vegan Kitchen',
  "Papi's Cuban & Caribbean Grill",
  'Georgia Aquarium',
  'Two Urban Licks',
  'The Vortex Bar And Grill - Midtown',
  'Cypress Street Pint & Plate',
  'Highland Bakery',
  'Ecco Midtown',
  'Torched Hop Brewing',
  'Flying Biscuit Café - Midtown',
  "Gus's World Famous Fried Chicken",
  'Alma Cocina - Downtown',
  'The Food Shoppe',
  'Joy Cafe',
  'Bon Ton',
  'Sweet Hut Bakery & Cafe',
  'Babs',
  'Barcelona Inman Park',
  'Piedmont Park',
  'Botiwalla',
  'National Center For Civil and Human Rights',
  'Negril ATL',
  'World of Coca-Cola',
  'Kevin Rathbun Steak',
  'Superica',
  'BeetleCat',
  'Bulla Gastrobar',
  'Atlanta Botanical Garden',
  '26 Thai Kitchen & Bar - Midtown',
  'The Lawrence',
  'Dancing Goats Coffee Bar',
  'Red Phone Booth',
  "Ray's in the City",
  'BoccaLupo',
  'Rreal Tacos',
  'Hon

# Double check: do the keys of business_dict match up with the zip codes in descending order of Black Population?

In [29]:
black_zip_codes_df['Zip Code']

0     31061
1     31405
2     31707
3     31313
4     30236
5     30223
6     30904
7     30297
8     31021
9     30008
10    30458
11    31792
12    31093
13    30260
14    31015
15    30241
16    31211
17    30824
18    31501
19    30213
20    30467
21    30308
22    31408
23    31791
24    31029
25    30401
26    30273
27    31779
28    31643
29    30204
30    30642
31    30313
32    30673
33    31315
34    31714
35    30442
36    30453
37    31096
38    31833
39    30817
40    31092
41    30002
42    31320
43    30457
44    31816
45    31331
46    31305
Name: Zip Code, dtype: object

In [30]:
business_dict.keys()

dict_keys(['30308', '31408', '31791', '31029', '30401', '30273', '31779', '31643', '30204', '30642', '30313', '30673', '31315', '31714', '30442', '30453', '31096', '31833', '30817', '31092', '30002', '31320', '30457', '31816', '31331', '31305'])

# Looks like it, yes

# Boiling down the listings from Yelp into one dictionary that shows how frequently the names turned up in our Yelp api calls.  

In [33]:
# Flatten the per-zip-code Yelp results into two parallel lists.
all_names = []
all_categories = []

for zip_code in business_dict:
    all_names.extend(business_dict[zip_code]['names'])
    all_categories.extend(business_dict[zip_code]['categories'])

# When a name occurs more than once, the category string of its last occurrence wins.
yelp_category_dict = dict(zip(all_names, all_categories))

# Count how often each name turned up, in a single pass. Calling all_names.count(x) for
# every element rescans the whole list each time. Keys keep first-seen order.
name_frequency = {}
for listed_name in all_names:
    name_frequency[listed_name] = name_frequency.get(listed_name, 0) + 1




In [44]:
len(all_names)

1027

In [61]:
len(name_frequency)

782

# Filter out chains before scraping IG

In [38]:
# Fast-food restaurant listing; its 'name' column is used below as the list of chain names.
kaggle_restaurants = pd.read_csv(r'FastFoodRestaurants.csv')
kaggle_restaurants.head()


Unnamed: 0,address,city,country,keys,latitude,longitude,name,postalCode,province,websites
0,324 Main St,Massena,US,us/ny/massena/324mainst/-1161002137,44.9213,-74.89021,McDonald's,13662,NY,"http://mcdonalds.com,http://www.mcdonalds.com/..."
1,530 Clinton Ave,Washington Court House,US,us/oh/washingtoncourthouse/530clintonave/-7914...,39.53255,-83.44526,Wendy's,43160,OH,http://www.wendys.com
2,408 Market Square Dr,Maysville,US,us/ky/maysville/408marketsquaredr/1051460804,38.62736,-83.79141,Frisch's Big Boy,41056,KY,"http://www.frischs.com,https://www.frischs.com..."
3,6098 State Highway 37,Massena,US,us/ny/massena/6098statehighway37/-1161002137,44.95008,-74.84553,McDonald's,13662,NY,"http://mcdonalds.com,http://www.mcdonalds.com/..."
4,139 Columbus Rd,Athens,US,us/oh/athens/139columbusrd/990890980,39.35155,-82.09728,OMG! Rotisserie,45701,OH,"http://www.omgrotisserie.com,http://omgrotisse..."


In [39]:
# Distinct restaurant names from the listing, as a plain Python list.
kaggle_restaurant_names = kaggle_restaurants['name'].unique().tolist()


In [40]:
# Keep only names that do not exactly match a known chain name. A set gives constant-time
# lookups; testing against the list rescanned every chain name for every key.
chain_names = set(kaggle_restaurant_names)
name_frequency = {key: val for key, val in name_frequency.items() if key not in chain_names}

In [41]:
len(name_frequency)

782

# Logging into IG & collecting IG handles

If you're unfamiliar with Selenium, here's a good place to start: https://selenium-python.readthedocs.io/.
There's an executable you have to download and put in your working directory - a quick google search will get you there.  Or I think if you run this code without the .exe file, the error tells you where to download it from.

In [95]:
def start_selenium():
    """Launch Chrome via chromedriver with the 'enable-automation' switch excluded.

    Returns the new webdriver.Chrome instance.
    """
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ['enable-automation'])
    browser = webdriver.Chrome(r'path_to_chromedriver', options=options)
    return browser

In [77]:
def slow_type(element, message):
    """Send `message` to `element` one character at a time.

    After each character the function sleeps for a random 0.05-0.25 seconds.
    """
    for keystroke in message:
        element.send_keys(keystroke)
        pause_seconds = random.randint(5, 25) / 100
        time.sleep(pause_seconds)

In [98]:
def IG_login(driver, username=None, password=None):
    """Log in to Instagram in `driver` and wait for the search bar to appear.

    Parameters
    ----------
    driver : Selenium WebDriver
        Browser session to drive (e.g. the object returned by start_selenium()).
    username, password : str, optional
        Instagram credentials. When omitted they are read from the IG_USERNAME /
        IG_PASSWORD environment variables, falling back to placeholder literals,
        so real credentials never need to be saved in the notebook.

    Returns
    -------
    None. Explicit-wait timeouts are reported with print() rather than raised;
    a missing login field still raises NoSuchElementException from find_element.
    """
    if username is None:
        username = os.environ.get("IG_USERNAME", "ig_username")
    if password is None:
        password = os.environ.get("IG_PASSWORD", "ig_password")

    popup_button_xpath = '/html/body/div[5]/div/div/div/div[3]/button[2]'
    search_bar_xpath = '//*[@id="react-root"]/section/nav/div[2]/div/div/div[2]/input'

    driver.get("https://www.instagram.com/")

    driver.implicitly_wait(20)

    try:
        WebDriverWait(driver, 13).until(EC.presence_of_element_located((By.NAME, "username")))
    except TimeoutException:
        print("couldn't find username textbox")

    # find_element(By..., ...) replaces the find_element_by_* helpers, which newer
    # Selenium releases no longer provide.
    input_username = driver.find_element(By.CSS_SELECTOR, '#loginForm > div > div:nth-child(1) > div > label > input')
    slow_type(input_username, username)

    time.sleep(random.uniform(1,3.3))

    input_password = driver.find_element(By.CSS_SELECTOR, '#loginForm > div > div:nth-child(2) > div > label > input')
    slow_type(input_password, password)

    time.sleep(random.uniform(.5,2.5))
    driver.find_element(By.CSS_SELECTOR, '#loginForm > div > div:nth-child(3) > button').click()

    # A popup may appear after login; click its second button if it becomes clickable.
    try:
        popup_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, popup_button_xpath)))
        popup_button.click()
    except TimeoutException:
        print("popup 2 didn't show")

    driver.implicitly_wait(5)

    # The search bar being present is taken as the signal that login has finished.
    try:
        WebDriverWait(driver, 13).until(EC.presence_of_element_located((By.XPATH, search_bar_xpath)))
    except TimeoutException:
        print("Can't find search bar.  Not sure what to do...")


# Scrape IG Handles

In [116]:
# Start a browser session and log in.
driver = start_selenium()

time.sleep(2)

IG_login(driver)

SEARCH_BOX_XPATH = '//*[@id="react-root"]/section/nav/div[2]/div/div/div[2]/input'
FIRST_RESULT_XPATH = '//*[@id="react-root"]/section/nav/div[2]/div/div/div[2]/div[3]/div/div[2]/div/div[2]/a'

dt0 = datetime.now()

# One row per Yelp name: [name, yelp frequency, IG handle, IG url].
data_list = []
names_list = list(name_frequency.keys())

# try/finally guarantees the browser is shut down even if a lookup raises part-way
# through the list; rows gathered so far remain available in data_list.
try:
    for name in names_list:
        search_box = driver.find_element(By.XPATH, SEARCH_BOX_XPATH)
        slow_type(search_box, name)
        # Give the search dropdown time to populate.
        time.sleep(random.randint(250,450)/100)
        try:
            first_profile = driver.find_element(By.XPATH, FIRST_RESULT_XPATH)

            url = first_profile.get_attribute("href")

            # The handle is the last path segment of a url that ends in "/".
            data_list.append([name, name_frequency[name], url.split('/')[-2], url])
        except NoSuchElementException:
            data_list.append([name, name_frequency[name], '', "nothing appeard in IG search"])

        search_box.clear()
        time.sleep(random.randint(113,369)/100)
finally:
    driver.quit()

print("Elapsed time to find ", len(name_frequency), " IG handles and urls: ", str(datetime.now() - dt0))

influencers_df = pd.DataFrame(data_list, columns = ["name", "yelp_search_frequency", "ig_handle", "ig_url"])

Name:  Two Urban Licks
found username:  the-swing-overlook-of-downtown-atlanta-skyline
Name:  The Vortex Bar And Grill - Midtown
found username:  thevortexbarandgrill
Name:  Cypress Street Pint & Plate
found username:  cypress-street-pint-and-plate
Name:  Highland Bakery
found username:  highlandbakery
Name:  Ecco Midtown
found username:  ecco-midtown
Name:  Torched Hop Brewing
found username:  torchedhopbrewing
Name:  Flying Biscuit Café - Midtown
found username:  flying-biscuit-cafe-midtown
Name:  Gus's World Famous Fried Chicken
found username:  guss-world-famous-fried-chicken
Name:  Alma Cocina - Downtown
found username:  alma-restaurant-b61-bar
Name:  The Food Shoppe
found username:  thefoodshoppett
Name:  Joy Cafe
found username:  joycafeatl
Name:  Bon Ton
found username:  bontonstudio
Name:  Sweet Hut Bakery & Cafe
found username:  mysweethutbnc
Name:  Babs
found username:  babs
Name:  Barcelona Inman Park
found username:  barcelonainmanparkatl
Elapsed time to find  782  IG hand

In [163]:
influencers_df.head()

Unnamed: 0,name,yelp_search_frequency,ig_handle,ig_url
0,Two Urban Licks,1,the-swing-overlook-of-downtown-atlanta-skyline,https://www.instagram.com/explore/locations/24...
1,The Vortex Bar And Grill - Midtown,2,thevortexbarandgrill,https://www.instagram.com/explore/tags/thevort...
2,Cypress Street Pint & Plate,2,cypress-street-pint-and-plate,https://www.instagram.com/explore/locations/36...
3,Highland Bakery,1,highlandbakery,https://www.instagram.com/highlandbakery/
4,Ecco Midtown,2,ecco-midtown,https://www.instagram.com/explore/locations/79...


# Remove duplicate handles

In [119]:
# Keep the first row for each distinct ig_handle and renumber the index from 0.
influencers_df.drop_duplicates(subset=['ig_handle'], inplace=True, ignore_index=True)

# How to test for influence?
* TDS: Engagement = (Total Number of Comments + Total Number of Likes) / (Number of Posts * Number of Followers) * 100
    * https://medium.com/@kevinjnguyen/using-python-to-calculate-instagram-engagement-percentage-subtle-clothing-collection-99284dc750c2


# Remove bad links (links that don't point to a profile) and rows with no links.

In [120]:
print("influencers_df length pre-culling: ", len(influencers_df))

# A usable row links to a profile (not an /explore/ tag or location page) and has a
# non-empty handle. One boolean mask replaces three successive index-based drops, which
# could raise KeyError if a row matched more than one rule; na=False also tolerates a
# missing url.
points_to_explore = influencers_df['ig_url'].str.contains("/explore/", na=False)
has_handle = influencers_df['ig_handle'].notna() & (influencers_df['ig_handle'] != '')

# .copy() gives an independent frame so later column assignments do not warn.
profiles_df = influencers_df[~points_to_explore & has_handle].copy()

print("profiles_df length post-culling: ", len(profiles_df))

influencers_df length pre-culling:  15
profiles_df length post-culling:  5


# Tasks:
* Flag private profiles and profiles with no posts to remove later.
* Fuzzy match the yelp name and the IG name so we can threshold out bad names.  
* Collect full_name, number of followers, number of posts, category_name, business_category_name, overall_category_name

In [124]:
# Look up each business's Yelp category string by name (raises KeyError for an unknown name).
profiles_df['yelp_category'] = profiles_df['name'].apply(lambda x: yelp_category_dict[x])

In [125]:
profiles_df

Unnamed: 0,name,yelp_search_frequency,ig_handle,ig_url,yelp_category
0,Highland Bakery,1,highlandbakery,https://www.instagram.com/highlandbakery/,Bakeries; Breakfast & Brunch; Sandwiches;
1,The Food Shoppe,2,thefoodshoppett,https://www.instagram.com/thefoodshoppett/,Cajun/Creole; Breakfast & Brunch;
2,Joy Cafe,1,joycafeatl,https://www.instagram.com/joycafeatl/,Breakfast & Brunch; Cafes; Southern;
3,Bon Ton,2,bontonstudio,https://www.instagram.com/bontonstudio/,Cajun/Creole; Seafood; Tiki Bars;
4,Sweet Hut Bakery & Cafe,2,mysweethutbnc,https://www.instagram.com/mysweethutbnc/,Coffee & Tea; Bakeries; Sandwiches;


In [122]:
# Renumber rows 0..n-1 after the culling above; drop=True discards the old labels.
profiles_df.reset_index(inplace=True, drop=True)
profiles_df.head()

Unnamed: 0,name,yelp_search_frequency,ig_handle,ig_url
0,Highland Bakery,1,highlandbakery,https://www.instagram.com/highlandbakery/
1,The Food Shoppe,2,thefoodshoppett,https://www.instagram.com/thefoodshoppett/
2,Joy Cafe,1,joycafeatl,https://www.instagram.com/joycafeatl/
3,Bon Ton,2,bontonstudio,https://www.instagram.com/bontonstudio/
4,Sweet Hut Bakery & Cafe,2,mysweethutbnc,https://www.instagram.com/mysweethutbnc/


# Now, to gather data on the IG accounts

In [142]:
def preprocess_str(name_str):
    """Normalize a business or profile name before fuzzy comparison.

    Lower-cases the text, replaces HTML entities (e.g. "&amp;") with a space,
    collapses whitespace, tokenizes with nltk's word_tokenize, removes
    punctuation tokens and English stop words, and returns the remaining
    tokens joined by single spaces.
    """
    new_str = name_str.lower()
    # Match one entity at a time. The previous greedy '&.*;' removed everything between
    # the first '&' and the last ';', so "a &amp; b &amp; c" lost the "b".
    new_str = re.sub(r'&[#\w]+;', ' ', new_str)
    new_str = re.sub(r'\s+', ' ', new_str)
    words = word_tokenize(new_str)
    # string.punctuation is a str, so this is a substring test: it drops single
    # punctuation characters (and any token that is a run of adjacent characters in it).
    words = [
        word for word in words
        if word not in string.punctuation and word not in stop_words
    ]
    return ' '.join(words)

In [144]:
def compare_fuzzy_strings(name_1, name_2):
    """Return fuzz.ratio for the two names after each is cleaned by preprocess_str."""
    cleaned_first, cleaned_second = preprocess_str(name_1), preprocess_str(name_2)
    return fuzz.ratio(cleaned_first, cleaned_second)


In [149]:
# Influencer scores for bad profiles - private, deleted, no posts, no followers - are -1.
# A real account could get an influencer score of 0, and -1 differentiates between them.
def _placeholder_metrics(label):
    """Return the column values recorded for a profile that cannot be scored."""
    return {
        'influencer_score': -1,
        'ig_full_name': label,
        'name_match_score': 0,
        'total_followers': 0,
        'total_posts': 0,
        'category_name': 'null',
        'business_category_name': 'null',
        'overall_category_name': 'null',
    }


def _profile_metrics(ig_profile, yelp_name):
    """Turn one parsed profile payload into the column values for its row."""
    if len(ig_profile) == 0:
        return _placeholder_metrics("Deleted Account")

    user = ig_profile['graphql']['user']
    if user['is_private']:
        return _placeholder_metrics("Private Account")

    timeline = user['edge_owner_to_timeline_media']
    if timeline['count'] == 0:
        # Previously labelled "No Followers", but this branch tests the post count.
        return _placeholder_metrics("No Posts")

    # Likes + comments for each post included in the payload.
    engagement_list = [
        edge['node']['edge_liked_by']['count'] + edge['node']['edge_media_to_comment']['count']
        for edge in timeline['edges']
    ]
    followers = user['edge_followed_by']['count']

    # Guard both divisions: no followers, or a payload that reports posts but lists none.
    if followers > 0 and engagement_list:
        engagement_avg = sum(engagement_list)/len(engagement_list)
        influencer_score = round((engagement_avg/followers)*100,2)
    else:
        influencer_score = -1

    return {
        'influencer_score': influencer_score,
        'ig_full_name': user['full_name'],
        'name_match_score': compare_fuzzy_strings(user['full_name'], yelp_name),
        'total_followers': followers,
        'total_posts': timeline['count'],
        'category_name': user['category_name'],
        'business_category_name': user['business_category_name'],
        'overall_category_name': user['overall_category_name'],
    }


def scrape_ig_accounts(df):
    """Fetch each profile's JSON from Instagram and add metric columns to `df` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'ig_url' (profile url) and 'name' (Yelp business name) columns.

    Adds the columns influencer_score, ig_full_name, name_match_score,
    total_followers, total_posts, category_name, business_category_name and
    overall_category_name. influencer_score is the average (likes + comments)
    per listed post divided by followers, times 100, rounded to 2 decimals;
    it is -1 for profiles that cannot be scored.

    Returns None. The browser is shut down even if scraping raises, in which
    case `df` is left without the new columns.
    """
    magic_token = '?__a=1'
    collected = []

    driver = start_selenium()
    try:
        IG_login(driver)

        for _, row in df.iterrows():
            driver.get(row['ig_url'] + magic_token)

            # Read the page once, then cut out the outermost {...} span and parse it as JSON.
            page_source = driver.page_source
            first_bracket = page_source.find('{')
            last_bracket = page_source.rfind('}')
            ig_profile = json.loads(page_source[first_bracket:last_bracket+1])

            collected.append(_profile_metrics(ig_profile, row['name']))

            time.sleep(random.randint(50,450)/100)
    finally:
        driver.quit()

    for column in ('influencer_score', 'ig_full_name', 'name_match_score',
                   'total_followers', 'total_posts', 'category_name',
                   'business_category_name', 'overall_category_name'):
        df[column] = [metrics[column] for metrics in collected]


In [150]:
scrape_ig_accounts(profiles_df)

# See if it came back alright - looks like it did!

In [164]:
profiles_df.head()

Unnamed: 0,name,yelp_search_frequency,ig_handle,ig_url,yelp_category,influencer_score,ig_full_name,name_match_score,total_followers,total_posts,category_name,business_category_name,overall_category_name
1,The Food Shoppe,2,thefoodshoppett,https://www.instagram.com/thefoodshoppett/,Cajun/Creole; Breakfast & Brunch;,3.23,The Food Shoppe,100,532,79,Food &amp; Beverage,Food &amp; Personal Goods,
3,Bon Ton,2,bontonstudio,https://www.instagram.com/bontonstudio/,Cajun/Creole; Seafood; Tiki Bars;,1.45,BON TON STUDIO | home + baby,45,7778,1092,Shopping &amp; Retail,Personal Goods &amp; General Merchandise Stores,
0,Highland Bakery,1,highlandbakery,https://www.instagram.com/highlandbakery/,Bakeries; Breakfast & Brunch; Sandwiches;,1.24,Highland Bakery &amp; Kitchen,79,3335,421,,Restaurants,
2,Joy Cafe,1,joycafeatl,https://www.instagram.com/joycafeatl/,Breakfast & Brunch; Cafes; Southern;,0.57,Joy Cafe,100,10435,649,Restaurant,Restaurants,


# Post-process, filter, and export the installment


In [154]:
def post_process_output_IG_metrics(df, file_name):
    """Filter `df` in place, rank it by influencer score, and export it to Excel.

    Rows are removed when influencer_score is -1 (unscorable profile) or when
    total_followers is below 500. The surviving rows are sorted by
    influencer_score, highest first, and written to `file_name` without the
    index column. The caller's DataFrame is modified.
    """
    print("Starting length: ", len(df))

    unscorable = df['influencer_score'] == -1
    under_500_followers = df['total_followers'] < 500
    rejected_labels = df.index[unscorable | under_500_followers]
    df.drop(index=rejected_labels, inplace=True)

    print("Final length: ", len(df))

    df.sort_values(by=['influencer_score'], ascending=False, inplace=True)
    df.to_excel(file_name, index=False)

In [None]:
# Placeholder output directory; replace it with a real folder before running.
filepath = r'path_to_file_location'

# Filters and sorts profiles_df in place, then writes the result to the Excel file.
post_process_output_IG_metrics(profiles_df, os.path.join(filepath, 'IG handles to reach out to Aug 14.xlsx') )
