# Twitter Data Fetch
https://developer.twitter.com/en/docs/twitter-api/early-access

### Import relevant libraries

In [1]:
import requests
import os
import json
import re

import time
import datetime

from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd

### Set static variables

In [4]:
# Bearer token for the Twitter API. It is read from the environment so the
# secret never has to be pasted into (and saved with) the notebook. Set
# TWITTER_BEARER_TOKEN before starting the kernel; if it is unset the value is
# None and create_headers() would send the literal header "Bearer None".
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
# Folder that the raw response dumps are written to (machine-specific path).
file_path = r"C:\Users\jawo19ad\Dropbox (CBS)\Master thesis data\Twitter Fetch"
# Endpoint that every request in this notebook is sent to.
search_url = "https://api.twitter.com/2/tweets/search/all"
# Keyword clause shared by the queries: refugee/migrant/immigrant terms,
# "asylum seeker(s)", and "displaced/stateless people/person(s)".
refugee_qualifier = "(refugee OR refugees OR migrant OR migrants OR immigrant OR immigrants OR (asylum (seeker OR seekers)) OR ((displaced OR stateless) (people OR person OR persons)))"

### Define Functions

In [5]:
def create_headers(bearer_token):
    """Return the HTTP header dict carrying ``bearer_token`` as a Bearer credential."""
    return {"Authorization": f"Bearer {bearer_token}"}


def connect_to_endpoint(url, headers, params):
    """Send a GET request and return the decoded JSON body.

    A 429 (rate limit) response is retried once after waiting until the time
    given in the ``x-rate-limit-reset`` response header. A 503 (service
    unavailable) response is retried once after a five minute pause.

    Parameters
    ----------
    url : str
        Endpoint to query.
    headers : dict
        HTTP headers, e.g. the result of ``create_headers``.
    params : dict
        Query-string parameters passed through to ``requests``.

    Returns
    -------
    The value of ``response.json()`` for the final 200 response.

    Raises
    ------
    Exception
        With ``(status_code, response_text)`` as arguments when the final
        response, after any retry, is not 200.
    """
    response = requests.request("GET", url, headers=headers, params=params)
    print(response.status_code)
    #print(response.headers)

    if response.status_code == 429:
        # The header holds the reset time as epoch seconds. Clamp at zero:
        # if the reset moment has already passed the difference is negative,
        # and time.sleep() raises ValueError for negative values.
        time_to_sleep = max(float(response.headers["x-rate-limit-reset"]) - time.time(), 0.0)
        print(f"Rate limit reached. Sleeping for {time_to_sleep} seconds...")
        time.sleep(time_to_sleep)

        response = requests.request("GET", url, headers=headers, params=params)
        print(response.status_code)

    if response.status_code == 503:
        # Five minutes, matching the message below. Previously the wait was
        # assigned but never applied, so the retry fired immediately.
        time_to_sleep = 300
        print("Service Unavailable exception caught. Sleeping for 5 minutes then trying again.")
        time.sleep(time_to_sleep)

        response = requests.request("GET", url, headers=headers, params=params)
        print(response.status_code)

    # Checked after every retry path, so a failed retry surfaces as an
    # exception instead of being parsed as if it were a result page.
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)

    return response.json()

def write_to_json(response):
    """Append ``response`` as a single JSON line to the path in the global ``file``."""
    with open(file, mode="a") as sink:
        serialized = json.dumps(response)
        sink.write(serialized + "\n")

def main():
    """Fetch one page of results, persist it, and advance the pagination state.

    Reads the globals ``bearer_token``, ``search_url`` and ``query_params``.
    When the response carries ``meta.next_token`` it is stored in
    ``query_params`` for the next call; otherwise the global ``next_page`` is
    set to False so the calling loop stops.
    """
    global next_page

    page = connect_to_endpoint(search_url, create_headers(bearer_token), query_params)
    write_to_json(page)

    try:
        upcoming_token = page["meta"]["next_token"]
    except KeyError:
        # No token in the response: this was the last page.
        next_page = False
        print("No further tokens available. End of Process.")
    else:
        query_params["next_token"] = upcoming_token
        print(f'Next Token: {query_params["next_token"]}')
        # Short pause between consecutive page requests.
        time.sleep(1)
        
def get_hashtag_counts(json):
    """Count how often each hashtag ``tag`` occurs across ``json["data"]``.

    Tweets without ``entities`` / ``hashtags`` are skipped. If an entry of a
    tweet's hashtag list lacks ``tag``, the rest of that tweet's list is
    skipped as well. Returns a ``defaultdict(int)`` keyed by tag.
    """
    tally = defaultdict(int)

    for item in json["data"]:
        try:
            for entry in item["entities"]["hashtags"]:
                tally[entry["tag"]] += 1
        except KeyError:
            # Missing key somewhere in this tweet: move on to the next one.
            pass

    return tally

def get_annotation_counts(json):
    """Count how often each annotation ``normalized_text`` occurs across ``json["data"]``.

    Tweets without ``entities`` / ``annotations`` are skipped. If an entry of
    a tweet's annotation list lacks ``normalized_text``, the rest of that
    tweet's list is skipped as well. Returns a ``defaultdict(int)``.
    """
    tally = defaultdict(int)

    for item in json["data"]:
        try:
            for entry in item["entities"]["annotations"]:
                tally[entry["normalized_text"]] += 1
        except KeyError:
            # Missing key somewhere in this tweet: move on to the next one.
            pass

    return tally

def load_full_json(file):
    """Read a line-delimited dump of API responses and merge all pages.

    Each non-blank line of ``file`` is parsed as one JSON response. The
    ``data`` lists of all pages are concatenated under ``"data"`` and the
    ``includes.users`` lists under ``"users"``.

    Parameters
    ----------
    file : str
        Path to the dump file (one JSON document per line).

    Returns
    -------
    dict
        ``{"data": [...], "users": [...]}``.
    """
    json_complete = {"data": [], "users": []}

    with open(file, "r") as json_file:
        for line in json_file:
            # Tolerate blank lines (e.g. a trailing newline at the end).
            if not line.strip():
                continue
            page = json.loads(line)
            # A page may lack "data" or "includes"/"users" (for instance a
            # response that matched no tweets); treat those as empty rather
            # than aborting the whole load with a KeyError.
            json_complete["data"].extend(page.get("data", []))
            json_complete["users"].extend(page.get("includes", {}).get("users", []))

    return json_complete

# General Query

In [31]:
# Pagination state. main() writes the token of the following page into
# query_params and sets next_page to False once no token is returned.
next_token = None
next_page = True
ctr = 0

# Request parameters for the search: the shared keyword clause, English
# tweets only, retweets excluded, within the given time window.
query_params = {
    'query': f'{refugee_qualifier} -is:retweet lang:en',
    'start_time': "2019-12-01T00:00:00Z",
    'end_time': "2020-01-01T00:00:00.00Z",
    'max_results': 500, #max value is 500
    'tweet.fields': 'id,text,author_id,created_at,entities,geo,in_reply_to_user_id,lang,public_metrics,referenced_tweets,source,withheld',
    'expansions': 'author_id,geo.place_id',
    'user.fields': 'id,name,username,created_at,description,entities,location,public_metrics,url,verified',
    'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
    'next_token': next_token,
}

# Output file: <file_path>\<event>_<start>_<end>.txt, with ":" replaced by
# "-" in the file name part.
event_name = r"\all_refugee_12"
file_ID = f'{event_name}_{query_params["start_time"]}_{query_params["end_time"]}.txt'.replace(":","-")
file = f"{file_path}{file_ID}"


print("Starting...")
# Fetch page after page until main() reports that no further page exists.
while next_page:
    main()
    ctr += 1
    # Pages fetched times the page size, i.e. an upper bound on the number of
    # tweets, since a page may hold fewer than max_results items.
    print(f'Retrieved Tweets: {query_params["max_results"] * ctr}')

Starting...
200
Next Token: b26v89c19zqg8o3fo6yevi58t4fl1ign30g6a3lqijwql
Retrieved Tweets: 500
200
Next Token: b26v89c19zqg8o3fo6yevi4y0of1nqw7vsyh2gmqljm9p
Retrieved Tweets: 1000
200
Next Token: b26v89c19zqg8o3fo6yevi4n9s3g3m6yg85t18tn0x94t
Retrieved Tweets: 1500
200
Next Token: b26v89c19zqg8o3fo6yevi4civk1nm44hpofrjss1o1vh
Retrieved Tweets: 2000
200
Next Token: b26v89c19zqg8o3fo6yevi41tgs6jpad351x4dtplxlrx
Retrieved Tweets: 2500
200
Next Token: b26v89c19zqg8o3fo6yevi3r2k8xh2eksrshyy5un81od
Retrieved Tweets: 3000
200
Next Token: b26v89c19zqg8o3fo6yevg1q2lpj4v0265jiq4qc53urh
Retrieved Tweets: 3500
200
Next Token: b26v89c19zqg8o3fo6yevg1fd6q1nr97culv02lwms5tp
Retrieved Tweets: 4000
200
Next Token: b26v89c19zqg8o3fo6yevg1f2kkvezr5cw310slj4cwot
Retrieved Tweets: 4500
200
Next Token: b26v89c19zqg8o3fo6yevg14d6nslm1ow5ww3puqiwgal
Retrieved Tweets: 5000
200
Next Token: b26v89c19zqg8o3fo6yevg0tntll8u1wac424gqcaykcd
Retrieved Tweets: 5500
200
Next Token: b26v89c19zqg8o3fo6yevg0izxg74lrsw1lilw

Retrieved Tweets: 48000
200
Next Token: b26v89c19zqg8o3fo6yegivzlj765hux4jm9hswhu7mnx
Retrieved Tweets: 48500
200
Next Token: b26v89c19zqg8o3fo6yeggtylk863z0vklxz20umrpmd9
Retrieved Tweets: 49000
200
Next Token: b26v89c19zqg8o3fo6yeggtnw5ocikkd4po63tlhr5b3x
Retrieved Tweets: 49500
200
Next Token: b26v89c19zqg8o3fo6yeggtniihyy91i2l3ut63vk1if1
Retrieved Tweets: 50000
200
Next Token: b26v89c19zqg8o3fo6yeggtcul2o619dx9gz122ytx3zx
Retrieved Tweets: 50500
200
Next Token: b26v89c19zqg8o3fo6yeggt256y01fobcisjtrit5dhml
Retrieved Tweets: 51000
200
Next Token: b26v89c19zqg8o3fo6yeggsrhbv1mlmhl4htoc5nt6we5
Retrieved Tweets: 51500
200
Next Token: b26v89c19zqg8o3fo6yeggsr6qcut3gksm0azjl4id18d
Retrieved Tweets: 52000
200
Next Token: b26v89c19zqg8o3fo6yeggsgkdwa54tmsrqm1nt7qazjx
Retrieved Tweets: 52500
200
Next Token: b26v89c19zqg8o3fo6yeggs5y0l3pbwpqb9e5b8478eil
Retrieved Tweets: 53000
200
Next Token: b26v89c19zqg8o3fo6yeggs5qe6xa6xh6slz6yn8ja3r1
Retrieved Tweets: 53500
200
Next Token: b26v89c19zqg8o

200
Next Token: b26v89c19zqg8o3fo6ye1fg9qferbg22lumrpi3wjs6t9
Retrieved Tweets: 96000
200
Next Token: b26v89c19zqg8o3fo6ye1ffz41gjardjtcm58gn9g6xdp
Retrieved Tweets: 96500
200
Next Token: b26v89c19zqg8o3fo6ye1ffytegim5ckftfaj9p7kpr3x
Retrieved Tweets: 97000
200
Next Token: b26v89c19zqg8o3fo6ye1ffo2ik8zj26fv7x9rsd8y7p9
Retrieved Tweets: 97500
200
Next Token: b26v89c19zqg8o3fo6ye1ffdbkyi40dkdnorlvp16okfx
Retrieved Tweets: 98000
200
Next Token: b26v89c19zqg8o3fo6ye1ff2j72rzehya4sqi1y81mkxp
Retrieved Tweets: 98500
200
Next Token: b26v89c19zqg8o3fo6ye1ferqqf0jw779t9t8ac4meu0t
Retrieved Tweets: 99000
200
Next Token: b26v89c19zqg8o3fo6ye1dcqqrg81d3hsp2n583ledqbh
Retrieved Tweets: 99500
200
Next Token: b26v89c19zqg8o3fo6ye1dcfybnd6pik4lmc07e2br7nh
Retrieved Tweets: 100000
200
Next Token: b26v89c19zqg8o3fo6ye1dc55wa0tbqv6no0cyqch6kxp
Retrieved Tweets: 100500
200
Next Token: b26v89c19zqg8o3fo6ye1dbudieapbts0u0sebtg3n7gd
Retrieved Tweets: 101000
200
Next Token: b26v89c19zqg8o3fo6ye1dbjjjjq45plfg6

Retrieved Tweets: 143000
200
Next Token: b26v89c19zqg8o3fo6vi26dymeb7jnh4hn23ykosaojul
Retrieved Tweets: 143500
200
Next Token: b26v89c19zqg8o3fo6vi26dyesrimhaxukk1d0f5d429p
Retrieved Tweets: 144000
200
Next Token: b26v89c19zqg8o3fo6vi24bxkvv5t6fneebevc75yxzlp
Retrieved Tweets: 144500
200
Next Token: b26v89c19zqg8o3fo6vi24bxdbe45r8wshk32x2p556d9
Retrieved Tweets: 145000
200
Next Token: b26v89c19zqg8o3fo6vi24bmsghtfn38mwnb50zz709z1
Retrieved Tweets: 145500
200
Next Token: b26v89c19zqg8o3fo6vi24bmje90ujz64lf7l1db4uc8t
Retrieved Tweets: 146000
200
Next Token: b26v89c19zqg8o3fo6vi24bbyipco3xjcy0g4kacdq9vh
Retrieved Tweets: 146500
200
Next Token: b26v89c19zqg8o3fo6vi24b1c694zjoc19jsqrgijp9q5
Retrieved Tweets: 147000
200
Next Token: b26v89c19zqg8o3fo6vi24b133de2e19dksyrdwupb9fh
Retrieved Tweets: 147500
200
Next Token: b26v89c19zqg8o3fo6vi24aqgpmq9vla9slv8vo60pf25
Retrieved Tweets: 148000
200
Next Token: b26v89c19zqg8o3fo6vi24aq7l93dcnt794ncqzybrv5p
Retrieved Tweets: 148500
200
Next Token: b2

Retrieved Tweets: 190000
200
Next Token: b26v89c19zqg8o3fo6vi1pdngxlhda2opfs2uygt8bd6l
Retrieved Tweets: 190500
200
Next Token: b26v89c19zqg8o3fo6vi1pdnate7cb5epwnbzp0irkbct
Retrieved Tweets: 191000
200
Next Token: b26v89c19zqg8o3fo6vi1pdcpzrs87nz2ljlzbfct7tz1
Retrieved Tweets: 191500
200
Next Token: b26v89c19zqg8o3fo6vi1pdcidt7h0zl199mluzitdpx9
Retrieved Tweets: 192000
200
Next Token: b26v89c19zqg8o3fo6vi1pd1w1zrno7jmtno99985077h
Retrieved Tweets: 192500
200
Next Token: b26v89c19zqg8o3fo6vi1pd1my1jz5g41qqwf3rtevg59
Retrieved Tweets: 193000
200
Next Token: b26v89c19zqg8o3fo6vi1pcr23zv0q7emups6wj5r2lj1
Retrieved Tweets: 193500
200
Next Token: b26v89c19zqg8o3fo6vi1pcr0lst8fi6npknlg2sib0jh
Retrieved Tweets: 194000
200
Next Token: b26v89c19zqg8o3fo6vi1pcquivqo0l4p10x1exsnr0xp
Retrieved Tweets: 194500
200
Next Token: b26v89c19zqg8o3fo6vi1pcg85zw85nzkekcyqwkpv531
Retrieved Tweets: 195000
200
Next Token: b26v89c19zqg8o3fo6vi1pcfz1mepgpztd7w8vckka9od
Retrieved Tweets: 195500
200
Next Token: b2

200
Next Token: b26v89c19zqg8o3fo6vhmq6hj6r1qg1l6jo5wcqf2kn7h
Retrieved Tweets: 237500
200
Next Token: b26v89c19zqg8o3fo6vhmq6ha1y8a2x3gis0qg2c7714t
Retrieved Tweets: 238000
200
Next Token: b26v89c19zqg8o3fo6vhmq66np2ajw2p48vmj8609e9od
Retrieved Tweets: 238500
200
Next Token: b26v89c19zqg8o3fo6vhmq5w1ctiqqpgvrqh43aucbb7h
Retrieved Tweets: 239000
200
Next Token: b26v89c19zqg8o3fo6vhmq5vsa5jm489vza5utflyeav1
Retrieved Tweets: 239500
200
Next Token: b26v89c19zqg8o3fo6vhmq5l7fgos0shz65tw8kdymt8d
Retrieved Tweets: 240000
200
Next Token: b26v89c19zqg8o3fo6vhmq5kzthwhe1gtlwea4hxxedfh
Retrieved Tweets: 240500
200
Next Token: b26v89c19zqg8o3fo6vhmq5adhw324yyh02l2cusj21z1
Retrieved Tweets: 241000
200
Next Token: b26v89c19zqg8o3fo6vhmq5a5w4tvkvn8b8x62nnk2r99
Retrieved Tweets: 241500
200
Next Token: b26v89c19zqg8o3fo6vhmq4zl1vkr3t4adcz02ocntjst
Retrieved Tweets: 242000
200
Next Token: b26v89c19zqg8o3fo6vhmq4zdgz98wi2dl771k6900399
Retrieved Tweets: 242500
200
Next Token: b26v89c19zqg8o3fo6vhmq4osnc

Retrieved Tweets: 284500
200
Next Token: b26v89c19zqg8o3fo6vh7v6dkpopz4qx3l27jbo9ba8hp
Retrieved Tweets: 285000
200
Next Token: b26v89c19zqg8o3fo6vh7v62vbk0qfl1tvunb8a5bd9q5
Retrieved Tweets: 285500
200
Next Token: b26v89c19zqg8o3fo6vh7v62j7059lobv0f10emy00rjx
Retrieved Tweets: 286000
200
Next Token: b26v89c19zqg8o3fo6vh7t41m9a37wjvf9s0opm3loi9p
Retrieved Tweets: 286500
200
Next Token: b26v89c19zqg8o3fo6vh7t3qyd4o1eeecgxwkcp04tnjx
Retrieved Tweets: 287000
200
Next Token: b26v89c19zqg8o3fo6vh7t3qp96oz278rgoty1tlhux6l
Retrieved Tweets: 287500
200
Next Token: b26v89c19zqg8o3fo6vh7t3g1eqr2tvfepubaswn915dp
Retrieved Tweets: 288000
200
Next Token: b26v89c19zqg8o3fo6vh7t35f1uuftxb8kvnotybhe7b1
Retrieved Tweets: 288500
200
Next Token: b26v89c19zqg8o3fo6vh7t355yc6aeecx3bzv6jzyhunx
Retrieved Tweets: 289000
200
Next Token: b26v89c19zqg8o3fo6vh7t2umks40oi65vfe8pb0q4325
Retrieved Tweets: 289500
200
Next Token: b26v89c19zqg8o3fo6vh7t2ugjs54hlyl3e59mlgbo67x
Retrieved Tweets: 290000
200
Next Token: b2

Retrieved Tweets: 331500
200
Next Token: b26v89c19zqg8o3fo6vh7igeyl8ao930232vao1mfkinx
Retrieved Tweets: 332000
200
Next Token: b26v89c19zqg8o3fo6vh7igesiqj0qx9a6k4dahsx8dbx
Retrieved Tweets: 332500
200
Next Token: b26v89c19zqg8o3fo6vh7ig4985tmk3l2ney0u71j56yl
Retrieved Tweets: 333000
200
Next Token: b26v89c19zqg8o3fo6vh7ig436b8xwxbcwq98ve8v1c3h
Retrieved Tweets: 333500
200
Next Token: b26v89c19zqg8o3fo6vh7iftju91k2bftb984s7ehqv0d
Retrieved Tweets: 334000
200
Next Token: b26v89c19zqg8o3fo6vh7iftf9qiuci4v177i34timwzh
Retrieved Tweets: 334500
200
Next Token: b26v89c19zqg8o3fo6vh7ift97oh6y6nabb42jnyyupod
Retrieved Tweets: 335000
200
Next Token: b26v89c19zqg8o3fo6vh7ifipwgo2v1er875j6h0njwxp
Retrieved Tweets: 335500
200
Next Token: b26v89c19zqg8o3fo6vh7ifilbykg6tgsd2cbisarn919
Retrieved Tweets: 336000
200
Next Token: b26v89c19zqg8o3fo6vh7ifigs35hqe3im4m8ozp2vz3x
Retrieved Tweets: 336500
200
Next Token: b26v89c19zqg8o3fo6vh7if7yyffs1sqnm4hkhs992hrx
Retrieved Tweets: 337000
200
Next Token: b2

200
Next Token: b26v89c19zqg8o3fo6vgspm8fr6zjven4mmuv3vk47a0t
Retrieved Tweets: 379000
200
Next Token: b26v89c19zqg8o3fo6vgspm8b6ouvj57nmakskr9w0w3h
Retrieved Tweets: 379500
200
Next Token: b26v89c19zqg8o3fo6vgspm86lyt02gevdwu1pvhdutfh
Retrieved Tweets: 380000
200
Next Token: b26v89c19zqg8o3fo6vgsplxnb6h8jjjmxmh6scwkslml
Retrieved Tweets: 380500
200
Next Token: b26v89c19zqg8o3fo6vgsplxh9jl3b545dh49c7v6l9bx
Retrieved Tweets: 381000
200
Next Token: b26v89c19zqg8o3fo6vgsplxcolxy5phxd4m9ialpchod
Retrieved Tweets: 381500
200
Next Token: b26v89c19zqg8o3fo6vgsplmuuy63gmh77ry7w49a8ad9
Retrieved Tweets: 382000
200
Next Token: b26v89c19zqg8o3fo6vgsplmnb4emsvfbeictcprbhh19
Retrieved Tweets: 382500
200
Next Token: b26v89c19zqg8o3fo6vgsplc5ho81y705kt6ncmprku4d
Retrieved Tweets: 383000
200
Next Token: b26v89c19zqg8o3fo6vgsplc0wy9h9wc3s9jc2t1998xp
Retrieved Tweets: 383500
200
Next Token: b26v89c19zqg8o3fo6vgsplbuvj52apvkacb00t7ph4zh
Retrieved Tweets: 384000
200
Next Token: b26v89c19zqg8o3fo6vgspl1d1g

Retrieved Tweets: 426000
200
Next Token: b26v89c19zqg8o3fo6vgsh1iuqqb4ng3kbjkf4plidvgd
Retrieved Tweets: 426500
200
Next Token: b26v89c19zqg8o3fo6vgsh188e9usue4a6y52ohgbuum5
Retrieved Tweets: 427000
200
Next Token: b26v89c19zqg8o3fo6vgsh180rvl51onqpwq9xmjytif1
Retrieved Tweets: 427500
200
Next Token: b26v89c19zqg8o3fo6vgsez75e9qpy16qnnlj17msktbx
Retrieved Tweets: 428000
200
Next Token: b26v89c19zqg8o3fo6vgseywkiqee6sdlf5oe7y07md19
Retrieved Tweets: 428500
200
Next Token: b26v89c19zqg8o3fo6vgseywbf7jrqj2fg0cx2pzzuhkt
Retrieved Tweets: 429000
200
Next Token: b26v89c19zqg8o3fo6vgseylnj25mgagffpezijukvxx9
Retrieved Tweets: 429500
200
Next Token: b26v89c19zqg8o3fo6vgseyay6n59lh5qpcl3oomool19
Retrieved Tweets: 430000
200
Next Token: b26v89c19zqg8o3fo6vgseyam1g2uidbf3i35bvbn1fnh
Retrieved Tweets: 430500
200
Next Token: b26v89c19zqg8o3fo6vgsexzy5ijqez7zbkfdirxumhz1
Retrieved Tweets: 431000
200
Next Token: b26v89c19zqg8o3fo6vgsexp8rdq6h4mzbscsae11swhp
Retrieved Tweets: 431500
200
Next Token: b2

Retrieved Tweets: 473000
200
Next Token: b26v89c19zqg8o3fo6vgdhunx0so8m9nx73bj6h0tu2yl
Retrieved Tweets: 473500
200
Next Token: b26v89c19zqg8o3fo6vgdhunpfh3zqjqowqrtzlkwqz5p
Retrieved Tweets: 474000
200
Next Token: b26v89c19zqg8o3fo6vgdhud342z2iy7pcz5mxqshro1p
Retrieved Tweets: 474500
200
Next Token: b26v89c19zqg8o3fo6vgdhu2gr72fj2deeui2cm13li7x
Retrieved Tweets: 475000
200
Next Token: b26v89c19zqg8o3fo6vgdhu29460x4uaqic2714j5n0xp
Retrieved Tweets: 475500
200
Next Token: b26v89c19zqg8o3fo6vgdhtrl8vh8g7py0bt1a5gv53zx
Retrieved Tweets: 476000
200
Next Token: b26v89c19zqg8o3fo6vgdhtgxeut5xq73wre44fbmd75p
Retrieved Tweets: 476500
200
Next Token: b26v89c19zqg8o3fo6vgdhtgmspltoh8hxmvc6shhu0l9
Retrieved Tweets: 477000
200
Next Token: b26v89c19zqg8o3fo6vgdfrfpuzn1em85za4n334bhym5
Retrieved Tweets: 477500
200
Next Token: b26v89c19zqg8o3fo6vgdfr50fsqqv4pzc4ikxyhkodq5
Retrieved Tweets: 478000
200
Next Token: b26v89c19zqg8o3fo6vgdfr4pt0htve4v1xdtort1vc71
Retrieved Tweets: 478500
200
Next Token: b2

In [8]:
# Read back every page that was appended to `file` (the path built in the
# General Query cell) and merge them into one {"data": [...], "users": [...]} dict.
json_complete = load_full_json(file)

In [10]:
# Flatten the nested tweet dicts into one row per tweet; nested keys become
# dotted column names such as "entities.hashtags" or "public_metrics.like_count".
df = pd.json_normalize(json_complete["data"])

In [13]:
# Sanity check: (rows, columns) of the flattened tweet frame.
df.shape

(907653, 22)

In [14]:
# Preview the first rows of the flattened tweet frame.
df.head()

Unnamed: 0,referenced_tweets,in_reply_to_user_id,id,source,author_id,created_at,lang,text,entities.mentions,entities.urls,...,public_metrics.like_count,public_metrics.quote_count,entities.annotations,entities.hashtags,geo.place_id,entities.cashtags,geo.coordinates.type,geo.coordinates.coordinates,withheld.copyright,withheld.country_codes
0,"[{'type': 'replied_to', 'id': '138828099577906...",2355254546,1388282011945635841,Twitter Web App,1100105913988452352,2021-04-30T23:59:59.000Z,en,@ZachG932 Was referring to separate data. \n\n...,"[{'start': 0, 'end': 9, 'username': 'ZachG932'}]","[{'start': 249, 'end': 272, 'url': 'https://t....",...,0,0,,,,,,,,
1,"[{'type': 'replied_to', 'id': '138826549999844...",2361224263,1388282010481926145,Twitter for iPhone,133064856,2021-04-30T23:59:58.000Z,en,@DailySignal @Heritage Using immigrants to fur...,"[{'start': 0, 'end': 12, 'username': 'DailySig...",,...,0,0,,,,,,,,
2,"[{'type': 'replied_to', 'id': '138822635060398...",2973870195,1388281984066142211,Twitter for iPhone,30561192,2021-04-30T23:59:52.000Z,en,@RepBuddyCarter @HouseGOP He hasn’t in all thi...,"[{'start': 0, 'end': 15, 'username': 'RepBuddy...",,...,0,0,"[{'start': 73, 'end': 77, 'probability': 0.981...",,,,,,,
3,"[{'type': 'replied_to', 'id': '138827850929928...",1980011,1388281972410126337,TweetDeck,745699970905341953,2021-04-30T23:59:49.000Z,en,@Random_Factor @KorpsPropaganda I... shit you'...,"[{'start': 0, 'end': 14, 'username': 'Random_F...",,...,6,0,,,,,,,,
4,"[{'type': 'replied_to', 'id': '138776016425466...",1108440621503647749,1388281961328922624,Twitter for iPad,17132840,2021-04-30T23:59:47.000Z,en,@tim_wheel @robreiner You aren’t a natural bor...,"[{'start': 0, 'end': 10, 'username': 'tim_whee...",,...,0,0,"[{'start': 211, 'end': 213, 'probability': 0.9...",,,,,,,


---

The code below adds newly fetched data to the overall dataset in a more time-efficient manner.

In [15]:
from tqdm import tqdm


def _lowercased_entity_values(tweet, entity_type, value_key):
    """Return the lower-cased ``value_key`` of every ``entity_type`` entry of a tweet.

    Gives None when the tweet has no ``entities``, no such entity type, or an
    entry without ``value_key``.
    """
    try:
        return [entry[value_key].lower() for entry in tweet["entities"][entity_type]]
    except KeyError:
        return None


# One row per tweet: [id, hashtags, mentions, annotations].
entities = list()
for tweet in tqdm(json_complete["data"]):
    tweet_id = tweet["id"]
    hashtags = _lowercased_entity_values(tweet, "hashtags", "tag")
    mentions = _lowercased_entity_values(tweet, "mentions", "username")
    annotations = _lowercased_entity_values(tweet, "annotations", "normalized_text")
    entities.append([tweet_id, hashtags, mentions, annotations])

100%|██████████| 907653/907653 [00:08<00:00, 102607.16it/s]


In [16]:
# Tabulate the per-tweet rows collected in `entities`; the column order
# matches the [tweet_id, hashtags, mentions, annotations] lists appended there.
df_entities = pd.DataFrame(entities,columns=["id","hashtags","mentions","annotations"])
df_entities.head()

Unnamed: 0,id,hashtags,mentions,annotations
0,1388282011945635841,,[zachg932],
1,1388282010481926145,,"[dailysignal, heritage]",
2,1388281984066142211,,"[repbuddycarter, housegop]","[biden, harris]"
3,1388281972410126337,,"[random_factor, korpspropaganda]",
4,1388281961328922624,,"[tim_wheel, robreiner]",[usa]


In [17]:
# Attach the simplified hashtag / mention / annotation lists to every tweet
# row, matching on the tweet id and keeping all rows of `df`.
df_tweets_with_entities = pd.merge(df, df_entities, on="id", how="left")

In [19]:
# Remove the raw nested columns that the simplified hashtags / mentions /
# annotations columns replace, plus reply, geo and withheld details that are
# not kept in the combined dataset.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# lets this cell be re-run safely, and it no longer fails when a fetch did not
# produce one of the columns listed here.
df_tweets_with_entities = df_tweets_with_entities.drop(
    columns=['entities.mentions', 'entities.hashtags', 'entities.urls','entities.annotations',
             'in_reply_to_user_id', 'referenced_tweets', 'geo.place_id', 'geo.coordinates.type',
             'geo.coordinates.coordinates', 'withheld.copyright', 'withheld.country_codes', 'entities.cashtags'],
    errors="ignore",
)

In [20]:
# Strip the "public_metrics." prefix from the four engagement counters.
df_tweets_with_entities = df_tweets_with_entities.rename(
    {
        'public_metrics.retweet_count': 'retweet_count',
        'public_metrics.reply_count': 'reply_count',
        'public_metrics.like_count': 'like_count',
        'public_metrics.quote_count': 'quote_count',
    },
    axis="columns",
)

In [21]:
# Sanity check: the row count should be unchanged by the merge and clean-up.
df_tweets_with_entities.shape

(907653, 13)

In [22]:
# Preview the cleaned frame: tweet fields, engagement counts and entity lists.
df_tweets_with_entities.head()

Unnamed: 0,id,source,author_id,created_at,lang,text,retweet_count,reply_count,like_count,quote_count,hashtags,mentions,annotations
0,1388282011945635841,Twitter Web App,1100105913988452352,2021-04-30T23:59:59.000Z,en,@ZachG932 Was referring to separate data. \n\n...,0,1,0,0,,[zachg932],
1,1388282010481926145,Twitter for iPhone,133064856,2021-04-30T23:59:58.000Z,en,@DailySignal @Heritage Using immigrants to fur...,0,0,0,0,,"[dailysignal, heritage]",
2,1388281984066142211,Twitter for iPhone,30561192,2021-04-30T23:59:52.000Z,en,@RepBuddyCarter @HouseGOP He hasn’t in all thi...,0,0,0,0,,"[repbuddycarter, housegop]","[biden, harris]"
3,1388281972410126337,TweetDeck,745699970905341953,2021-04-30T23:59:49.000Z,en,@Random_Factor @KorpsPropaganda I... shit you'...,0,1,6,0,,"[random_factor, korpspropaganda]",
4,1388281961328922624,Twitter for iPad,17132840,2021-04-30T23:59:47.000Z,en,@tim_wheel @robreiner You aren’t a natural bor...,0,0,0,0,,"[tim_wheel, robreiner]",[usa]


In [23]:
def _parse_list_cell(cell):
    """Turn a stringified list cell such as "['a', 'b']" back into ['a', 'b'].

    An empty cell or "[]" yields an empty list. Splitting alone would give
    [''], i.e. a list holding one empty string that later counts as a value.
    """
    items = cell.strip("[]").replace("'", "").split(", ")
    return [item for item in items if item]


# Load the existing combined dataset and restore the three list-valued
# columns, which the CSV stores as their string representation.
df_complete = pd.read_csv(r"C:\Users\jawo19ad\Dropbox (CBS)\Master thesis data\df_tweets.csv",
                          converters={"hashtags": _parse_list_cell,
                                      "mentions": _parse_list_cell,
                                      "annotations": _parse_list_cell})

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
# Discard the old row index that came back from the CSV as "Unnamed: 0".
# Rebinding the name with errors="ignore" (instead of inplace=True) keeps the
# cell re-runnable: a second run no longer raises KeyError.
df_complete = df_complete.drop(columns="Unnamed: 0", errors="ignore")

In [25]:
# Sanity check: (rows, columns) of the existing combined dataset.
df_complete.shape

(11850723, 14)

In [26]:
# Preview the existing combined dataset.
df_complete.head()

Unnamed: 0,source,text,lang,id,created_at,author_id,retweet_count,reply_count,like_count,quote_count,withheld.scope,hashtags,mentions,annotations
0,Twitter for Android,There are too many reliable reports and first-...,en,1350390669043499013,2021-01-16T10:33:19.000Z,1327278886380515328,1,0,0,0,,"[eritrea, bidentakeaction, stopwarontigray, ti...",[joebiden],[]
1,Twitter Web App,"Despite repeated requests, @Refugees and other...",en,1350390657576300544,2021-01-16T10:33:16.000Z,1323903491044188161,0,0,0,0,,"[tigray, stopwarontigray, tigraygenocide, bide...","[refugees, un, joebiden]","[shimelba, hitsats]"
2,Twitter for Android,"“When the air bombing and the attacks began, I...",en,1350390643986599937,2021-01-16T10:33:13.000Z,1324130252008816640,0,0,1,0,,[bidentakeaction],"[joebiden, kamalaharris, yohannesabraham]",[]
3,Twitter Web App,"""ongoing insecurity &amp; allegations of grave...",en,1350390618695020546,2021-01-16T10:33:07.000Z,1112761003,0,0,0,0,,"[tigraygenocide, bidenactnow, protectrefugees]","[un, joebiden, eu_commission, refugees, josepb...",[eritrea]
4,Twitter for iPhone,"MOTHER \n\nMother, I do not cry who cries, \nm...",en,1350390607928295424,2021-01-16T10:33:04.000Z,1212824799107375105,0,1,0,0,,[],[],[]


In [27]:
# Stack the newly fetched tweets below the existing dataset. pd.concat
# replaces DataFrame.append, which is deprecated and no longer exists in
# pandas 2.0. The row labels of both inputs are kept as they are.
df_final = pd.concat([df_complete, df_tweets_with_entities])

In [28]:
# Earliest timestamp in the combined data. created_at appears to hold
# ISO-8601 strings, so this is a string comparison; confirm the dtype.
df_final["created_at"].min()

'2020-01-01T00:00:00.000Z'

In [29]:
# Latest timestamp in the combined data (same string-comparison caveat as
# for the minimum).
df_final["created_at"].max()

'2021-04-30T23:59:59.000Z'

In [30]:
# Overwrite the combined dataset file that was loaded into df_complete.
# The row index is written as well (to_csv default), which is why the file
# comes back with an "Unnamed: 0" column on the next read.
# NOTE(review): running the notebook again with the same fetch would append
# the same tweets a second time; confirm whether de-duplication happens elsewhere.
df_final.to_csv(r"C:\Users\jawo19ad\Dropbox (CBS)\Master thesis data\df_tweets.csv")