# Twitter API Scraping

### Packages to load

In [57]:
import tweepy
import configparser
import pandas as pd
import numpy as np
import json
from twython import Twython

### Other stuff to load

In [58]:
# nothing

### Loading credentials and Twython authentication

In [59]:
with open("../../../config_files/credentials.json") as infile:
    credentials = json.load(infile)

twitter_client = Twython(credentials["api_key"],
                         credentials["api_secret"],
                         credentials["access_token"],
                         credentials["access_token_secret"])

### Data gathering test from Twython

In [60]:
# trying to gather from Twython

keywords = '#iPhone14Pro'

twitter_client.search(q=keywords)

{'statuses': [{'created_at': 'Thu Sep 15 14:57:52 +0000 2022',
   'id': 1570426680224935937,
   'id_str': '1570426680224935937',
   'text': 'RT @SPIDEReGAMES: #CONCOURS \n🎁 Tentez de gagner un IPhone 14 Pro !\n\nPour participer : Like + RT + follow \n@SPIDEReGAMES \n@levraizarkx &amp; @Ma…',
   'truncated': False,
   'entities': {'hashtags': [{'text': 'CONCOURS', 'indices': [18, 27]}],
    'symbols': [],
    'user_mentions': [{'screen_name': 'SPIDEReGAMES',
      'name': 'Spider Games',
      'id': 1100741938184142850,
      'id_str': '1100741938184142850',
      'indices': [3, 16]},
     {'screen_name': 'SPIDEReGAMES',
      'name': 'Spider Games',
      'id': 1100741938184142850,
      'id_str': '1100741938184142850',
      'indices': [106, 119]},
     {'screen_name': 'levraizarkx',
      'name': 'ZarKx \uea00',
      'id': 1550562701243518977,
      'id_str': '1550562701243518977',
      'indices': [121, 133]}],
    'urls': []},
   'metadata': {'iso_language_code': 'fr', 'result_typ

### Loading credentials and Tweepy authentication

In [61]:
# read configs

config = configparser.ConfigParser()
config.read('../../../config_files/config.ini')

api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']

access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']

In [62]:
# authentication

auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

### Data gathering from Twitter API

#### Option 1

In [7]:
# search by user

# user = 'veritasium'
# limit=300
# tweets = tweepy.Cursor(api.user_timeline, screen_name=user, count=200, tweet_mode='extended').items(limit)

# search by keyword or hashtag (works too for @user)
keywords = "#iPhone14Pro -filter:retweets"
limit = 10000
tweets = tweepy.Cursor(
    api.search_tweets, q=keywords, count=100, lang="en", tweet_mode="extended").items(limit)

# create DataFrame
columns = [
    "User",
    "Tweet_time",
    "Tweet",
    "Hashtag",
    "Retweet",
    "Favorite",
    "Join_time",
    "Follower",
    "Friend",
]

data = []

for tweet in tweets:
    data.append(
        [
            tweet.user.screen_name,
            tweet.created_at,
            tweet.full_text,
            tweet.entities["hashtags"],
            tweet.retweet_count,
            tweet.favorite_count,
            tweet.user.created_at,
            tweet.user.followers_count,
            tweet.user.friends_count,
        ]
    )

df = pd.DataFrame(data, columns=columns)

df.head()

Unnamed: 0,User,Tweet_time,Tweet,Hashtag,Retweet,Favorite,Join_time,Follower,Friend
0,mivhunter26,2022-09-15 13:38:05+00:00,This is what I’m excited about today: \n\n#iPh...,"[{'text': 'iPhone14Pro', 'indices': [40, 52]},...",0,0,2008-03-28 06:29:59+00:00,135,271
1,ElartistaKha,2022-09-15 13:37:25+00:00,"I like older women, attracted to older woman, ...","[{'text': 'shirt', 'indices': [164, 170]}, {'t...",0,0,2021-10-19 11:18:03+00:00,0,8
2,Moto760,2022-09-15 13:36:40+00:00,"Well, finally! I got a ship notification. Too...","[{'text': 'Apple', 'indices': [64, 70]}, {'tex...",0,0,2020-04-20 12:27:35+00:00,1275,283
3,MrGfUnK,2022-09-15 13:35:07+00:00,My #iPhone14ProMax &amp; #iPhone14Pro have bee...,"[{'text': 'iPhone14ProMax', 'indices': [3, 18]...",0,0,2009-10-31 20:33:37+00:00,333,411
4,_Apps4World_,2022-09-15 13:32:41+00:00,🚀 Launcher for iOS 16 Lock Screen 🔥\n\nFull Sw...,"[{'text': 'iPhone14Pro', 'indices': [88, 100]}...",0,0,2021-10-10 14:15:29+00:00,16,0


#### Option 2

In [8]:
# gathering data from API

keywords = '#iPhone14Pro'
limit = 10000

tweets = tweepy.Cursor(api.search_tweets, q=keywords, lang='en', count=100,
                       tweet_mode='extended').items(limit)

# creating DataFrame

columns = ['User', 'DateTime', 'Tweet']
data = []

for tweet in tweets:
    data.append([tweet.user.screen_name, tweet.created_at, tweet.full_text])
    
df = pd.DataFrame(data, columns=columns)

df.head()

Unnamed: 0,User,DateTime,Tweet
0,mivhunter26,2022-09-15 13:38:05+00:00,This is what I’m excited about today: \n\n#iPh...
1,osanagikazami,2022-09-15 13:37:57+00:00,RT @explicitgzz: COURTSIDE OUT NOW ON ALL PLAT...
2,yamamiharuna,2022-09-15 13:37:57+00:00,RT @explicitgzz: COURTSIDE OUT NOW ON ALL PLAT...
3,ElartistaKha,2022-09-15 13:37:25+00:00,"I like older women, attracted to older woman, ..."
4,Moto760,2022-09-15 13:36:40+00:00,"Well, finally! I got a ship notification. Too..."


### Test

In [63]:
# RUN THIS IS IT WORKED
# df.to_csv('../data/raw/TEST50K.csv

### Reading DataFrame for iPhone 14 Pro

In [9]:
#df.to_csv('../data/raw/iPhone14Prox10k_RAW_2.csv)

In [10]:
df14pro = pd.read_csv('../data/raw/iPhone14Prox10k_RAW.csv')
df14pro.head()

Unnamed: 0.1,Unnamed: 0,User,DateTime,Tweet
0,0,1202tung,2022-09-14 14:48:27+00:00,RT @insanetweet: iPhone 8 while upgrading to i...
1,1,Rushike74038427,2022-09-14 14:48:05+00:00,I want this case @Mrwhosetheboss \n#iphone14pr...
2,2,papsobu,2022-09-14 14:47:40+00:00,RT @insanetweet: iPhone 8 while upgrading to i...
3,3,_kayrozenlang,2022-09-14 14:47:27+00:00,RT @btslabs_global: 🏆Win iPhone14 Pro\n\nWith ...
4,4,SarenDebnath,2022-09-14 14:46:41+00:00,RT @btslabs_global: 🏆Win iPhone14 Pro\n\nWith ...


In [11]:
df14pro['User'].value_counts()

thebbmafrica       63
ChronicallyTG      41
FaizanA17120449    40
XXFemale1          39
sgmobmart          34
                   ..
p_929303            1
bonxcviii           1
twtfnov             1
axososad            1
Getpaidrealquic     1
Name: User, Length: 8703, dtype: int64

### Cleaning Tweet Column

In [12]:
df14pro['Tweet'][0]

'RT @insanetweet: iPhone 8 while upgrading to iOS 16\n\n#iOS16 #iPhone14 #iPhone14Pro \n https://t.co/IOpAaBnHus'

In [13]:
"""import re
import string

text = df14pro['Tweet'][0]
remove = string.punctuation
remove = remove.replace('#', '')
pattern = r"[{}]".format(remove)

re.sub(pattern, '', text)"""

'import re\nimport string\n\ntext = df14pro[\'Tweet\'][0]\nremove = string.punctuation\nremove = remove.replace(\'#\', \'\')\npattern = r"[{}]".format(remove)\n\nre.sub(pattern, \'\', text)'

In [14]:
# removing punctuations and lowercase normalization
import string
text = df14pro['Tweet'][0]
text_clean1 = ''.join([i.lower() for i in text if i not in string.punctuation])
text_clean1

'rt insanetweet iphone 8 while upgrading to ios 16\n\nios16 iphone14 iphone14pro \n httpstcoiopaabnhus'

In [15]:
# removing \n
import re
text_clean2 = re.sub('\n', ' ' , text_clean1)
text_clean2

'rt insanetweet iphone 8 while upgrading to ios 16  ios16 iphone14 iphone14pro   httpstcoiopaabnhus'

In [16]:
# removing url
text_clean3 = result = re.sub(r'http\S+', '', text_clean2)
text_clean3

'rt insanetweet iphone 8 while upgrading to ios 16  ios16 iphone14 iphone14pro   '

In [17]:
# importing stopwords
import nltk
from nltk.corpus import stopwords
cachedStopWords = stopwords.words('english')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [18]:
# removing stopwords
text_clean4 = ' '.join([word for word in text_clean3.split() if word not in stopwords.words('english')])
print(text)

RT @insanetweet: iPhone 8 while upgrading to iOS 16

#iOS16 #iPhone14 #iPhone14Pro 
 https://t.co/IOpAaBnHus


In [19]:
# counting single words within a string
def word_count(str):
    counts = dict()
    words = str.split()
    
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    return counts

print(word_count(text_clean4))

{'rt': 1, 'insanetweet': 1, 'iphone': 1, '8': 1, 'upgrading': 1, 'ios': 1, '16': 1, 'ios16': 1, 'iphone14': 1, 'iphone14pro': 1}


### Creating boolean for retweets

In [20]:
df14pro.head()

Unnamed: 0.1,Unnamed: 0,User,DateTime,Tweet
0,0,1202tung,2022-09-14 14:48:27+00:00,RT @insanetweet: iPhone 8 while upgrading to i...
1,1,Rushike74038427,2022-09-14 14:48:05+00:00,I want this case @Mrwhosetheboss \n#iphone14pr...
2,2,papsobu,2022-09-14 14:47:40+00:00,RT @insanetweet: iPhone 8 while upgrading to i...
3,3,_kayrozenlang,2022-09-14 14:47:27+00:00,RT @btslabs_global: 🏆Win iPhone14 Pro\n\nWith ...
4,4,SarenDebnath,2022-09-14 14:46:41+00:00,RT @btslabs_global: 🏆Win iPhone14 Pro\n\nWith ...


In [21]:
df14pro['Retweet'] = df14pro['Tweet'].str.contains('RT @')
df14pro

Unnamed: 0.1,Unnamed: 0,User,DateTime,Tweet,Retweet
0,0,1202tung,2022-09-14 14:48:27+00:00,RT @insanetweet: iPhone 8 while upgrading to i...,True
1,1,Rushike74038427,2022-09-14 14:48:05+00:00,I want this case @Mrwhosetheboss \n#iphone14pr...,False
2,2,papsobu,2022-09-14 14:47:40+00:00,RT @insanetweet: iPhone 8 while upgrading to i...,True
3,3,_kayrozenlang,2022-09-14 14:47:27+00:00,RT @btslabs_global: 🏆Win iPhone14 Pro\n\nWith ...,True
4,4,SarenDebnath,2022-09-14 14:46:41+00:00,RT @btslabs_global: 🏆Win iPhone14 Pro\n\nWith ...,True
...,...,...,...,...,...
9995,9995,techonthego,2022-09-10 08:54:17+00:00,Pre-order the amazing new iPhone 14 Pro on the...,False
9996,9996,PappannaG,2022-09-10 08:52:39+00:00,"RT @Drife_official: iPhone users, assemble?! 😅...",True
9997,9997,dynamo_mano,2022-09-10 08:52:24+00:00,"RT @Jazzy4Tech_: Yesterday, I went to Croma re...",True
9998,9998,spotter81au,2022-09-10 08:52:07+00:00,Perfect! #iPhone14pro #dynamicisland https://t...,False


In [22]:
df14pro['Retweet'].value_counts()

True     7967
False    2033
Name: Retweet, dtype: int64

In [23]:
df14pro['Tweet'][1]

'I want this case @Mrwhosetheboss \n#iphone14pro #iphone14procase #iphone14ProMax https://t.co/vkJB0EICT1'

In [24]:
df14pro['Retweet'][1]

False

In [25]:
df14pro['Tweet'] = df14pro['Tweet'].str.lower()
df14pro

Unnamed: 0.1,Unnamed: 0,User,DateTime,Tweet,Retweet
0,0,1202tung,2022-09-14 14:48:27+00:00,rt @insanetweet: iphone 8 while upgrading to i...,True
1,1,Rushike74038427,2022-09-14 14:48:05+00:00,i want this case @mrwhosetheboss \n#iphone14pr...,False
2,2,papsobu,2022-09-14 14:47:40+00:00,rt @insanetweet: iphone 8 while upgrading to i...,True
3,3,_kayrozenlang,2022-09-14 14:47:27+00:00,rt @btslabs_global: 🏆win iphone14 pro\n\nwith ...,True
4,4,SarenDebnath,2022-09-14 14:46:41+00:00,rt @btslabs_global: 🏆win iphone14 pro\n\nwith ...,True
...,...,...,...,...,...
9995,9995,techonthego,2022-09-10 08:54:17+00:00,pre-order the amazing new iphone 14 pro on the...,False
9996,9996,PappannaG,2022-09-10 08:52:39+00:00,"rt @drife_official: iphone users, assemble?! 😅...",True
9997,9997,dynamo_mano,2022-09-10 08:52:24+00:00,"rt @jazzy4tech_: yesterday, i went to croma re...",True
9998,9998,spotter81au,2022-09-10 08:52:07+00:00,perfect! #iphone14pro #dynamicisland https://t...,False


In [26]:
#from functools import reduce
#filters = [("#iphone14", False), ("#iphone14pro", True)]
#df14pro['#iphone14'] = reduce(lambda df14pro, f: df14pro[df14pro.str.contains(f[0]) == f[1]], filters, df14pro)

In [27]:
#df14pro['#iphone14'] = df14pro[(df14pro['#iphone14'].str.contains('#iphone14')) & (~df14pro['#iphone14'].str.contains('upgrading'))]

In [28]:
#df14pro['#iphone14'] = df14pro[(df14pro.str.contains('#iphone14') == True) and (df14pro.str.contains('#iphone14pro') == False)]
#df14pro

In [29]:
#df14pro['#iphone14'] = np.where(df14pro['Tweet'].eq('iphone14'), 'true', 'false')
#df14pro

In [30]:
#df14pro['#iphone14'].value_counts()

In [31]:
df14pro['#iphone14'] = df14pro['Tweet'].apply(lambda x: '#iphone14' in x and 'iphone14pro' not in x)
df14pro

Unnamed: 0.1,Unnamed: 0,User,DateTime,Tweet,Retweet,#iphone14
0,0,1202tung,2022-09-14 14:48:27+00:00,rt @insanetweet: iphone 8 while upgrading to i...,True,False
1,1,Rushike74038427,2022-09-14 14:48:05+00:00,i want this case @mrwhosetheboss \n#iphone14pr...,False,False
2,2,papsobu,2022-09-14 14:47:40+00:00,rt @insanetweet: iphone 8 while upgrading to i...,True,False
3,3,_kayrozenlang,2022-09-14 14:47:27+00:00,rt @btslabs_global: 🏆win iphone14 pro\n\nwith ...,True,False
4,4,SarenDebnath,2022-09-14 14:46:41+00:00,rt @btslabs_global: 🏆win iphone14 pro\n\nwith ...,True,False
...,...,...,...,...,...,...
9995,9995,techonthego,2022-09-10 08:54:17+00:00,pre-order the amazing new iphone 14 pro on the...,False,False
9996,9996,PappannaG,2022-09-10 08:52:39+00:00,"rt @drife_official: iphone users, assemble?! 😅...",True,False
9997,9997,dynamo_mano,2022-09-10 08:52:24+00:00,"rt @jazzy4tech_: yesterday, i went to croma re...",True,False
9998,9998,spotter81au,2022-09-10 08:52:07+00:00,perfect! #iphone14pro #dynamicisland https://t...,False,False


In [32]:
df14pro['#iphone14'].value_counts()

False    9879
True      121
Name: #iphone14, dtype: int64

In [33]:
df14pro['#iphone14pro'] = df14pro['Tweet'].str.contains('#iphone14pro')
df14pro

Unnamed: 0.1,Unnamed: 0,User,DateTime,Tweet,Retweet,#iphone14,#iphone14pro
0,0,1202tung,2022-09-14 14:48:27+00:00,rt @insanetweet: iphone 8 while upgrading to i...,True,False,True
1,1,Rushike74038427,2022-09-14 14:48:05+00:00,i want this case @mrwhosetheboss \n#iphone14pr...,False,False,True
2,2,papsobu,2022-09-14 14:47:40+00:00,rt @insanetweet: iphone 8 while upgrading to i...,True,False,True
3,3,_kayrozenlang,2022-09-14 14:47:27+00:00,rt @btslabs_global: 🏆win iphone14 pro\n\nwith ...,True,False,True
4,4,SarenDebnath,2022-09-14 14:46:41+00:00,rt @btslabs_global: 🏆win iphone14 pro\n\nwith ...,True,False,True
...,...,...,...,...,...,...,...
9995,9995,techonthego,2022-09-10 08:54:17+00:00,pre-order the amazing new iphone 14 pro on the...,False,False,True
9996,9996,PappannaG,2022-09-10 08:52:39+00:00,"rt @drife_official: iphone users, assemble?! 😅...",True,False,True
9997,9997,dynamo_mano,2022-09-10 08:52:24+00:00,"rt @jazzy4tech_: yesterday, i went to croma re...",True,False,True
9998,9998,spotter81au,2022-09-10 08:52:07+00:00,perfect! #iphone14pro #dynamicisland https://t...,False,False,True


In [43]:
df14pro['#iphone14pro'].value_counts()

True     8103
False    1897
Name: #iphone14pro, dtype: int64

In [54]:
df = df14pro[df14pro['#iphone14pro'] == False]
df

Unnamed: 0.1,Unnamed: 0,User,DateTime,Tweet,Retweet,#iphone14,#iphone14pro
10,10,kamanmpo,2022-09-14 14:43:33+00:00,rt @ksmolka: who has an iphone 14 pro right no...,True,False,False
12,12,MrAyitey,2022-09-14 14:41:51+00:00,rt @ksmolka: who has an iphone 14 pro right no...,True,False,False
20,20,parvaneh172,2022-09-14 14:38:13+00:00,rt @ksmolka: who has an iphone 14 pro right no...,True,False,False
23,23,EkeneChinedum,2022-09-14 14:37:45+00:00,rt @olacokers: apple have successfully given m...,True,False,False
36,36,D4_concept,2022-09-14 14:32:58+00:00,rt @kellypencil1: i wonder why this my work di...,True,False,False
...,...,...,...,...,...,...,...
9958,9958,JosephKorex,2022-09-10 09:11:50+00:00,rt @pinetworkark: breaking news! \n\na merchan...,True,False,False
9968,9968,Beardedelf1,2022-09-10 09:05:30+00:00,rt @kellypencil1: i wonder why this my work di...,True,False,False
9986,9986,mi_konstantin,2022-09-10 08:57:38+00:00,rt @ld_vova: i redesigned the iphone 14 pro wa...,True,False,False
9989,9989,theoddgirl18,2022-09-10 08:56:36+00:00,rt @idevicehelpus: ios 16 rc &amp; final relea...,True,False,False


In [56]:
df['Tweet'][10]

'rt @ksmolka: who has an iphone 14 pro right now? need to test this on device asap!\n\n🏝️ hit the island - our game concept for iphone 14 pro,…'

### Overlap between sets