## Querying Twitter API
In this notebook we will use the usernames gained from the previous data analysis, and query Twitter for user data

In [1]:
import os
import re
import datetime as dt
import time
import random
import json

# data science
import math
import numpy as np
import pandas as pd

In [2]:
# configurations

# Allow multiple outputs for each cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# display all columns
pd.set_option('display.max_columns', None)

# suppress auto-conversion to scientific notation
pd.set_option('display.precision', 6)

# Twitter

In [9]:
import tweepy
import config

Source: https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-users-lookup
<div>
    <br>
    <img src='Images/lookup.png' style="width:800px">
    <br>
    <img src='Images/lookup2.png' style="width:400px">
</div>

## Obtaining User Access Token
Our user dataset contains 249,844 unique usernames. Under the limit of 300, it would take too long to get all the user data. Obtaining user access token will triple the limit. (See screenshot above)

Source: http://kinocksebastian.blogspot.com/2015/04/how-to-get-access-token-using-tweepy-in.html

In [28]:
# # authenticating twitter consumer key
# auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_secret)
# auth.secure=True
# authUrl = auth.get_authorization_url()

# # go to this url
# print("Please Visit This link and authorize the app ==> " + authUrl)
# print("Enter The Authorization PIN")

In [29]:
# pin = "8470794"

# # set authorization PIN
# token = auth.get_access_token(verifier=pin)

In [30]:
# # write access tokens to file
# accessTokenFile = open("config.py","a")
# accessTokenFile.write(f'user_access_token = "{token[0]}"\n')
# accessTokenFile.write(f'user_access_token_secret = "{token[1]}"\n')
# accessTokenFile.close()

## App Authentication

Switched from app authentication to user authentication to increase the API request limit

In [31]:
# app auth
# auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_secret)
# auth.set_access_token(config.access_token, config.access_token_secret)

# user auth
auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_secret)
auth.set_access_token(config.user_access_token, config.user_access_token_secret)

In [32]:
# Construct the API instance
api = tweepy.API(auth)

## Query Twitter for User Data

In [10]:
users = pd.read_csv('Data/users.csv')

In [33]:
# this function will query Twitter via User/Lookup method 100 users per request, and make 900 requests
# after 900 requests, the function will sleep for 15 minutes as not to exceed the API limit

def queryTwitter(df, start=0, cols):
    end = start + 100
    while True:
        for i in range(899):
            print(f'downloading: {start} ~ {end}')
            
            # query for user objects
            q = df.iloc[start:end]['screenName'].tolist()
            user_objects = api.lookup_users(screen_names=q)
            
            # store all returned user objects in dict; screenName as key and data as value
            users = {user.screen_name:user._json for user in user_objects}
            
            dfx = pd.DataFrame()
            dfx['screenName'] = [name for name in users.keys()]
            for col in cols:
                dfx[col] = [user[col] for user in users.values()]

            with open('Data/user_data.csv', 'a') as f:
                if os.path.isfile('Data/user_data.csv'):
                    dfx.to_csv(f, index=False, header=False)
                else:
                    dfx.to_csv(f, index=False)

            # "If a requested user is unknown, suspended, or deleted [...] not be returned.."
            # see screenshot above
            # note users that weren't found
            notFound = [missing for missing in q if missing not in users.keys()]

            with open('Data/not_found.txt', 'a') as txtfile:
                for i in notFound:
                    txtfile.write(str(i) + '\n')
            
            print('\n'.join([user.screen_name for user in user_objects]), '\n')
            
            start += 100
            if end == len(df):
                print(f'end: {end}')
                return
            if (end + 100) <= len(df):
                end += 100
            elif (end + 100) > len(df):
                end = len(df)
        print(f'Entering sleep: {dt.datetime.now()}\n')
        time.sleep(15 * 60)

In [12]:
len(users)
users.drop_duplicates(inplace=True)
len(users)

241003

241003

In [35]:
cols = ["id_str", "name", "screen_name", "location", "description", "url", "followers_count", 
        "friends_count", "created_at", "favourites_count", "verified", "statuses_count", "lang"]

In [36]:
queryTwitter(users, 0, cols)

---download starting---
downloading: 0 ~ 100
ahmediaTV
DemerisePotvin
TheDawnStott
scottygirl2014
silveriaalison
calrican
traugott_sarah
SchifanoRaelene
BrunusCutis
RDonSteele
_standeliver
european_4
FrancescaBiller
emilyjoypoetry
frozenjo
num1_fan
DougsMom204
page_lie
1mimi4ever
justshocked
ProphetPhella
JessLivMo
collegeclasses5
J_R_Kestner
nvygrl1
GLucivero
IndigoRave
roadgearsun
AliBerlow
SWSupportLV
missremington
rachelndeleon
Meamoore1391
RibaSusan
ChanaDoreaux
BJFrezell
CatherinesIdaho
BekLiza
spikybluealien
cnntoday
javerriaawan
jicReneeMerling
CDNHero
KarenCallesoe
TamDawg79
stcks4
krakmcr
TeeMichelle57
JerusalemFury
downrightdpad
BaptizeFeminism
CCWC_
adelesammarco
NathaliaVasquez
TeutaIllyri
shnikies78
ctlss
Imlooney2M
brooketaggart
wendy08724
icarusfactor
Doris49360681
CamRooMom
iamthatgirl
plainpersin
klkolz
tealraspberry
AnitaSurridge
shinefiercely
emergingW
jandennis1955
BasedAlcatraz
blaiserbeam
savvysuzee
mzelma
4seasonspix
L2MyDaughter
TimCushing
BitchMusic
aaronMCN
A

# Check output

In [5]:
file = open('Data/not_found.txt', 'r', newline='\n')
missing = file.read()
missing = missing.split('\n')

In [6]:
print(f'users not found:{len(missing)}\n')
print('\n'.join(missing))

users not found:106232

JesusPrepper74
ForEverBrenn
dbehan79
VirginiusPrimus
womanontheleft
JC4ever
BRellator
genesis427427
ThalesLives
IE_ANTIFA
Anonfernow
havanaftells
realchrisdantes
CleanGirlSoap
PeterDa22559210
rkeough
ruthayyy
SocialNet_Newz
mysteri00013322
youraposer
sharyalabluff
juno814
darkhorse0102
MiamiBestOf
Benjamin_fredd
LibertarianQn
dirtywater22
ironmanKFF
HunterAwilliam
SuitsGraphics
NewsJunki3
rockykistner1
mellgibsom
NewbieER12
heathermagrath7
ImacatchuS
_LCMB_
lea_majesta
sucks_cnn
HearMyVoice45
SexyJesusX2
emmontoya_
patrick_hilmar
DPSisler
realMichalDinal
ArcJoanof
Berning2020
AutomobilN
morgainmcgovern
PamBiancoUSN
Ignorantconfide
hillbillyspider
CheriRamone
persiStanceinc
FourStarNews
_OffPitch
wbeer
JennyGu_Yingjie
HeSaidThat2
SnapsxxCom
pornhubviid
NeenNeenloom
drays56
AnaPatchouli
renee_papesh
Ronni_see
Twinkle_Towne
DBGrinberg
_Rabble_Rabble
ShelleyResists
mark11_15_
sansanguzman
drewgioh
Andrea8008135
WalkerKorea
CelebNewsFast
DrHarryLyme
TrendStyleDaily
A

In [7]:
missing = pd.DataFrame(missing, columns=['usersNotFound'])

Unnamed: 0,usersNotFound
0,JesusPrepper74
1,ForEverBrenn
2,dbehan79
3,VirginiusPrimus
4,womanontheleft
5,JC4ever
6,BRellator
7,genesis427427
8,ThalesLives
9,IE_ANTIFA


In [8]:
len(missing)
missing.dropna(inplace=True)
len(missing)

106232

106232

In [9]:
missing.to_csv('Data/not_found.csv',index=False)

In [13]:
df = pd.read_csv('Data/user_data.csv', encoding='utf-8')
df.head()

Unnamed: 0,screenName,id_str,name,screen_name,location,description,url,followers_count,friends_count,created_at,favourites_count,verified,statuses_count,lang
0,ahmediaTV,29976609,"Ahmed Tharwat, ثروت",ahmediaTV,https://huzzaz.com/collection/,independent TV Host/producer \nhttps://t.co/mi...,https://t.co/JMRyd5shPM,1084,1031,Thu Apr 09 12:50:17 +0000 2009,1921,False,40471,en
1,DemerisePotvin,936019403367419909,DemiP.,DemerisePotvin,,,,1,31,Wed Nov 29 23:49:51 +0000 2017,12,False,35,en
2,TheDawnStott,612216797,Dawn Stott,TheDawnStott,America,nothing up my sleeve,http://t.co/5FMszbb5yu,487,1075,Tue Jun 19 02:59:59 +0000 2012,7979,False,11848,en
3,scottygirl2014,856719098,Debra - Siren Sword of Truth,scottygirl2014,Titanic’s Tomb,Paralegal; Activist/Writer; Investigative Rese...,https://t.co/3QQGxz00F6,3410,5002,Mon Oct 01 17:22:33 +0000 2012,123706,False,103238,en
4,silveriaalison,819111315721494528,Go Float Yourself,silveriaalison,"Springfield, OR","VETERAN, Writer, Love is my religion! Living w...",https://t.co/5MrCiBGDCE,1722,1557,Wed Jan 11 09:18:50 +0000 2017,31516,False,25325,en


In [14]:
len(df)
df.drop_duplicates(inplace=True, subset='screenName', keep='last')
len(df)

87245905

211839

In [15]:
df.reset_index(drop=True, inplace=True)
df.head(20)

Unnamed: 0,screenName,id_str,name,screen_name,location,description,url,followers_count,friends_count,created_at,favourites_count,verified,statuses_count,lang
0,rougbaisers,190672282,j’aime,rougbaisers,Quelque part,"Je suis ici avec vous, pour l'instant",,1723,2417,Tue Sep 14 15:10:30 +0000 2010,1747,False,204451,en
1,CalumSPlath,61073310,Calum Sherwood,CalumSPlath,London (E17) / Gateshead (NE9),Sylvia Plath enthusiast. Originally from the N...,https://t.co/yELmZKm2Y0,5708,2176,Wed Jul 29 01:10:37 +0000 2009,71775,False,99881,en
2,scfuckers,880227723494477829,Scfuckers.com,scfuckers,add my snapchat - DirectPorn,visit https://t.co/EYipnQWzZl if you are serio...,https://t.co/5qswIAS5fJ,13462,0,Thu Jun 29 00:53:37 +0000 2017,0,False,43457,en
3,shoegal27,20514782,Karina Thomas,shoegal27,,B2B marketer and part time #DDMIX instructor w...,https://t.co/ladqhZ5Dyw,174,392,Tue Feb 10 14:04:49 +0000 2009,1773,False,688,en
4,BasedHowardBeal,888919789065973761,Jeremy Maddux,BasedHowardBeal,,"Host of The Quiet Place\n\nChaotic Neutral, De...",,138,736,Sun Jul 23 00:32:47 +0000 2017,4544,False,4202,en
5,SANEvents2017,368962215,Book Sophia Nelson,SANEvents2017,Nationwide,2018 is here! It's time to Go from your Now to...,https://t.co/qUaJMvwrFX,1041,117,Tue Sep 06 14:41:00 +0000 2011,1947,False,12122,en
6,Cathy_Harri,2392922407,Cathy Harris,Cathy_Harri,,Slightly less curious then a cat.,,602,929,Sun Mar 16 15:52:49 +0000 2014,11697,False,12942,en
7,lucycarin,18765167,lucycarin,lucycarin,"Ponca City, OK",It isn't braggin if you can do it. I’m a new O...,,1248,380,Thu Jan 08 14:38:31 +0000 2009,170422,False,112047,en
8,christine_w56,782959125076414464,"Christine Whybrow:& 𝔶𝔢𝔱, ℑ 𝔭𝔢𝔯𝔰𝔦𝔰𝔱!",christine_w56,,Electrical Engineer(HONS). Now studying Comp.S...,,741,691,Mon Oct 03 15:02:56 +0000 2016,21158,False,24816,en
9,HasiNasiBB,919986694597644295,K. A.,HasiNasiBB,"California, U.S.A.",,,1,3,Mon Oct 16 18:01:35 +0000 2017,0,False,26,en


In [16]:
df.to_csv('Data/users2.csv', index=False, encoding='utf-8')