## Definition

In [1]:
import os
import pickle
import time

import couchdb3
import tweepy


def reform_tweet(raw_tweet):
    """
    Convert a tweet object into a plain dictionary.

    The result carries the keys '_id', 'author_id', 'text', 'created_at'
    and 'geo'. The id, creation time and geo values are passed through
    str(), so a tweet whose geo attribute is None ends up with the
    string 'None' under 'geo'.
    """
    return {
        '_id': str(raw_tweet.id),
        'author_id': raw_tweet.author_id,
        'text': raw_tweet.text,
        'created_at': str(raw_tweet.created_at),
        'geo': str(raw_tweet.geo),
    }


def preprocess(raw_tweet, id_set, tweet_list):
    """
    Add the unseen tweets from one batch to the running collection.

    A tweet is skipped when its id is already in id_set; otherwise the id
    is recorded and the tweet is converted with reform_tweet and appended
    to tweet_list. No filtering on geo information is performed here.

    Parameters:
        raw_tweet: iterable of tweet objects exposing an ``id`` attribute,
            or None when a search produced no results.
        id_set: set of tweet ids seen so far (updated in place).
        tweet_list: list of reformed tweet dictionaries (updated in place).

    Returns:
        The same (tweet_list, id_set) objects that were passed in.
    """
    # A search without hits hands back None instead of a list; treat that
    # as an empty batch rather than failing on iteration.
    if raw_tweet is None:
        return tweet_list, id_set

    for data in raw_tweet:
        if data.id not in id_set:  # remove duplicate
            id_set.add(data.id)
            tweet_list.append(reform_tweet(data))

    return tweet_list, id_set


def get_tweet_1(query, token, tweets_each_turn, next_page = None):
    """
    Fetch one page of recent tweets matching a query via the Twitter API v2.

    Parameters:
        query: search query string passed to the recent-search endpoint.
        token: bearer token used to authenticate the tweepy client.
        tweets_each_turn: page size, forwarded as ``max_results``.
        next_page: pagination token from a previous call, or None to start
            from the first page.

    Returns:
        (data, next_token): the ``data`` attribute of the response and the
        token for the following page. next_token is None when the response
        metadata has no 'next_token' entry, i.e. there is no further page.
    """
    client = tweepy.Client(bearer_token=token)
    # Ask for author id, creation time and geo in addition to the default
    # fields; these are the attributes reform_tweet reads.
    tweets = client.search_recent_tweets(
        query=query,
        tweet_fields=['author_id', 'created_at', 'geo'],
        max_results=tweets_each_turn,
        next_token=next_page,
    )

    print(tweets.meta)

    # The final page carries no 'next_token' key in its metadata; indexing
    # it directly would raise KeyError, so fall back to None instead.
    return tweets.data, tweets.meta.get('next_token')


def crawler(query, tokens, tweets_each_turn, turns, file_name):
    """
    Collect recent tweets for a query and pickle them to a file.

    In each of ``turns`` rounds one page of results is requested with every
    token in ``tokens``; the pagination token returned by one request is
    passed to the next. Tweets whose id was already seen are dropped, the
    rest are converted to dictionaries, and the accumulated list is written
    to ``file_name`` with pickle once collection ends. Collection ends
    early when a request reports that there is no further page.

    Tweets are not filtered on geo information.

    Parameters:
        query: search query string.
        tokens: non-empty sequence of bearer tokens, used in rotation.
        tweets_each_turn: number of tweets requested per call.
        turns: number of rounds over all tokens.
        file_name: path of the pickle file to write.

    Returns:
        None. The result is the pickle file on disk.
    """
    # Pause (in seconds) after each request. The original author's note gives
    # a per-token limit of 900 tweets / 15 min; that figure is not verified here.
    time_gap = int (tweets_each_turn / len(tokens)) + 1
    count = 0
    next_page = None
    tweet_list = []
    id_set = set()
    exhausted = False
    while count < turns and not exhausted:
        for token in tokens:
            data, next_page = get_tweet_1(query, token, tweets_each_turn, next_page)
            # A page without results has nothing to merge.
            if data:
                tweet_list, id_set = preprocess(data, id_set, tweet_list)
            if next_page is None:
                # No further page: another request would start again from
                # the first page and only return duplicates.
                exhausted = True
                break
            time.sleep(time_gap)

        count = count + 1

    # The context manager closes the file even if pickling fails.
    with open(file_name, 'wb') as output:
        pickle.dump(tweet_list, output)

In [2]:
def save_to_couchDB(client, tweet_data, db_name):
    """
    Store every document of tweet_data in the CouchDB database db_name.

    Prints a message and returns without saving anything when client.up()
    does not report True. The database is created first when its name is
    missing from client.all_dbs(). Each document is saved individually.
    """
    server_ok = client.up() == True
    if not server_ok:
        print("Unable to connect to CouchDB")
        return

    print("Connected to CouchDB")

    # Create the database on first use.
    known_dbs = client.all_dbs()
    if db_name not in known_dbs:
        client.create(db_name)

    target = client.get(db_name)
    for doc in tweet_data:
        target.save(doc)

    print("Data is successfully saved to CouchDB")

## Set up

In [3]:
# Credentials are read from the environment so that no secret is stored in
# the notebook. The bearer tokens and CouchDB password that used to be
# written out in this cell should be treated as leaked and revoked/rotated.
#   TWITTER_BEARER_TOKENS  comma-separated Twitter API bearer tokens (required)
#   COUCHDB_URL            server address (defaults to the address used so far)
#   COUCHDB_USER           user name (defaults to "admin")
#   COUCHDB_PASSWORD       password (required)
BEARER_TOKEN = [
    token.strip()
    for token in os.environ["TWITTER_BEARER_TOKENS"].split(",")
    if token.strip()
]
if not BEARER_TOKEN:
    raise ValueError("TWITTER_BEARER_TOKENS must contain at least one token")

# Search queries; each one is restricted to English-language tweets.
query1 = '#Melbourne lang:en'
query2 = 'Melbourne rape lang:en'
query3 = 'Melbourne family violence lang:en'

client = couchdb3.Server(
    os.environ.get("COUCHDB_URL", "http://172.26.132.196:5984"),
    user=os.environ.get("COUCHDB_USER", "admin"),
    password=os.environ["COUCHDB_PASSWORD"],
)
db_name = "renkai_tweets"

In [4]:
# Connectivity check: up() is the same call save_to_couchDB uses to decide
# whether the CouchDB server can be reached.
client.up()

True

## Test

In [9]:
# Handle to the target database. Assumes db_name already exists on the
# server (save_to_couchDB creates it when it is missing).
db = client.get(db_name)

In [15]:
# Look up one stored document by its _id (the tweet id as a string).
# Assumes a document with this id was saved to the database beforehand.
ss = db.get('1519924999435747328')

In [17]:
# Show the fetched document as a plain dictionary.
dict(ss)

{'_id': '1519924999435747328',
 '_rev': '1-b2770f726bbf58c857e0a2268b8c26a2',
 'author_id': 2928313861,
 'text': 'RT @SEComForum: Hey @ibacVic in terms of transparency, are you going to publish the full interview of @DanielAndrewsMP for the public to se…',
 'created_at': '2022-04-29 06:22:13+00:00'}

In [18]:
# str() of None is the text 'None'. reform_tweet applies str() to the geo
# attribute, which is why tweets without geo data store the string 'None'.
str(None)

'None'

In [5]:
# Trial run: one round over all tokens, 10 tweets per request, written to test.pkl.
crawler(query=query1, tokens=BEARER_TOKEN, tweets_each_turn=10, turns=1, file_name="test.pkl")

{'newest_id': '1519931529782525952', 'oldest_id': '1519928804961042434', 'result_count': 10, 'next_token': 'b26v89c19zqg8o3fpytotfyenn54e4yzbd3usbc592l4t'}
{'newest_id': '1519928624287264768', 'oldest_id': '1519926313905176576', 'result_count': 10, 'next_token': 'b26v89c19zqg8o3fpytotfyekktx5xnmrzwl786acp4e5'}


In [6]:
# Reload the tweets that crawler wrote to test.pkl. pickle.load can execute
# arbitrary code, so only open pickle files produced by this notebook.
with open('test.pkl', 'rb') as pkl_file:
    data1 = pickle.load(pkl_file)

In [7]:
# Inspect the first reformed tweet loaded from the pickle file.
data1[0]

{'_id': '1519931529782525952',
 'author_id': 351431656,
 'text': 'MULTILINGUAL NEWS SERVICE GREEK 29 April 22.\nAudio resources in your language to keep your community informed and safe during COVID-19.\n#news #covid19 #community #melbourne #Australia #multilingual #multicultural #radio #greek #vaccine\n https://t.co/azMCiR1vjH',
 'created_at': '2022-04-29 06:48:10+00:00',
 'geo': 'None'}

In [8]:
# Push the tweets loaded from the pickle file into the CouchDB database.
save_to_couchDB(client=client, tweet_data=data1, db_name=db_name)

Connected to CouchDB
Data is successfully saved to CouchDB
