## Definition

In [1]:
import os
import pickle
import time

import couchdb3
import tweepy


def reform_tweet(raw_tweet):
    """
    Flatten a tweet object into a plain dictionary.

    The result holds five keys: `_id` (the tweet id as a string),
    `author_id`, `text`, `created_at` (stringified) and `geo`.
    """
    return {
        '_id': str(raw_tweet.id),
        'author_id': raw_tweet.author_id,
        'text': raw_tweet.text,
        'created_at': str(raw_tweet.created_at),
        'geo': raw_tweet.geo,
    }


def preprocess(raw_tweet, tweet_list):
    """
    Convert one page of raw tweets to dictionaries and append them to tweet_list.

    raw_tweet: iterable of tweet objects accepted by reform_tweet, or None
        when the page carried no tweets.
    tweet_list: list that is extended in place.

    Returns the same tweet_list object that was passed in.

    Note: no geo filtering is done here; every tweet in raw_tweet is kept.
    """
    # A page with no matches can arrive as None instead of an empty list
    # (presumably how tweepy reports "no data" -- confirm against its docs).
    if raw_tweet is not None:
        tweet_list.extend(reform_tweet(data) for data in raw_tweet)

    return tweet_list


def get_tweet_1(query, token, tweets_each_turn, next_page = None):
    """
    Fetch one page of recent tweets through the Twitter API v2 (tweepy.Client).

    query: search query string.
    token: bearer token used to authenticate this request.
    tweets_each_turn: passed as max_results, i.e. the page size requested
        (the API presumably restricts this to 10-100 -- confirm in the API docs).
    next_page: pagination token from a previous call, or None for the first page.

    Requests the author_id, created_at and geo tweet fields.

    Returns a tuple (data, next_token). next_token is None when the response
    meta carries no 'next_token', i.e. there is no further page to request.
    """
    client = tweepy.Client(bearer_token= token)
    # Request a single page of up to tweets_each_turn tweets.
    tweets = client.search_recent_tweets(query=query, tweet_fields=['author_id', 'created_at', 'geo'], max_results=tweets_each_turn, next_token=next_page)

    print(tweets.meta)

    # The last page has no 'next_token' key; .get avoids a KeyError there.
    return tweets.data, tweets.meta.get('next_token')


def crawler(query, tokens, tweets_each_turn, turns, client, db_name):
    """
    Page through search results for `query` and store them in CouchDB.

    query: search query passed to get_tweet_1.
    tokens: non-empty sequence of bearer tokens; each turn issues one request
        per token, in order.
    tweets_each_turn: page size requested from the API on every call.
    turns: maximum number of passes over `tokens`.
    client: CouchDB server handle forwarded to save_to_couchDB.
    db_name: name of the target database.

    Every fetched tweet is converted with preprocess (no geo filtering) and the
    whole batch is written once, at the end, with save_to_couchDB.

    Crawling stops early when the API reports no further page, so the search
    is never restarted from the first page with an empty pagination token.

    Returns the pagination token for the next unread page, or None when the
    results were exhausted (or no request was made).

    Raises ValueError if `tokens` is empty.

    Original author's note on pacing: for one token, the speed limit is
    900 tweets/15min (1/sec).
    """
    if len(tokens) == 0:
        raise ValueError("crawler needs at least one bearer token")

    # Pause between requests, scaled by page size and shared across tokens.
    time_gap = int (tweets_each_turn / len(tokens)) + 1
    count = 0
    next_page = None
    tweet_list = []
    exhausted = False

    while count < turns and not exhausted:
        for token in tokens:
            data, next_page = get_tweet_1(query, token, tweets_each_turn, next_page)
            tweet_list = preprocess(data, tweet_list)
            if next_page is None:
                # No more pages: another request would start over from page one.
                exhausted = True
                break
            time.sleep(time_gap)

        count = count + 1

    save_to_couchDB(client, tweet_list, db_name)

    return next_page

In [2]:
def save_to_couchDB(client, tweet_data, db_name):
    """
    Write tweet dictionaries into the CouchDB database `db_name`.

    client: server handle providing up(), all_dbs(), create() and get().
    tweet_data: iterable of dictionaries, each with an '_id' key.
    db_name: target database; created first if the server does not list it.

    Documents whose '_id' is already in the database are skipped. Progress is
    reported with print. Returns None, including when the server is not up.
    """
    server_ok = client.up() == True
    if not server_ok:
        print("Unable to connect to CouchDB")
        return
    print("Connected to CouchDB")

    known_dbs = client.all_dbs()
    if db_name not in known_dbs:
        print("No database:" + db_name + ", create one first")
        client.create(db_name)

    db = client.get(db_name)

    saved = 0
    for doc in tweet_data:
        if doc['_id'] in db:
            # Already stored -- leave the existing document untouched.
            continue
        db.save(doc)
        saved += 1

    print(str(saved) + " tweets is successfully saved to CouchDB")

    return

## Set up

In [3]:
# Secrets are read from the environment so they are never stored in the notebook.
# Any token or password that was previously committed in this cell should be
# treated as leaked and revoked.
#
# TWITTER_BEARER_TOKENS: comma-separated list of Twitter API v2 bearer tokens.
BEARER_TOKEN = [
    token.strip()
    for token in os.environ.get("TWITTER_BEARER_TOKENS", "").split(",")
    if token.strip()
]
if not BEARER_TOKEN:
    raise RuntimeError(
        "Set TWITTER_BEARER_TOKENS to a comma-separated list of bearer tokens"
    )

# Search queries used by the crawler.
query1 = '#Melbourne lang:en'
query2 = 'Melbourne rape lang:en'
query3 = 'Melbourne family violence lang:en'

# CouchDB connection; URL and user fall back to the previous values, the
# password must be supplied through COUCHDB_PASSWORD.
client = couchdb3.Server(
    os.environ.get("COUCHDB_URL", "http://172.26.132.196:5984"),
    user=os.environ.get("COUCHDB_USER", "admin"),
    password=os.environ["COUCHDB_PASSWORD"],
)
db_name = "renkai_tweets"

In [4]:
# Check that the CouchDB server configured above responds before crawling.
client.up()

True

## Test

In [5]:
# Smoke test: 2 turns over every token, 10 tweets per request.
next_token = crawler(
    query=query1,
    tokens=BEARER_TOKEN,
    tweets_each_turn=10,
    turns=2,
    client=client,
    db_name=db_name,
)

{'newest_id': '1519948731113607169', 'oldest_id': '1519945830207131648', 'result_count': 10, 'next_token': 'b26v89c19zqg8o3fpytotfz03y0ypqypvkkmdlod5mgzh'}
{'newest_id': '1519945606919245825', 'oldest_id': '1519943613941432322', 'result_count': 10, 'next_token': 'b26v89c19zqg8o3fpytotfz00wcpudqe12hgnijug4j99'}
{'newest_id': '1519943609180692481', 'oldest_id': '1519941612016709632', 'result_count': 10, 'next_token': 'b26v89c19zqg8o3fpytotfyzxv3w63bokjm3gdfs1yc1p'}
{'newest_id': '1519941331770130435', 'oldest_id': '1519939049095680000', 'result_count': 10, 'next_token': 'b26v89c19zqg8o3fpytotfyphjfp5p14picl6jdi3wlml'}
Connected to CouchDB
24 tweets is successfully saved to CouchDB


In [5]:
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
# 'test.pkl' is not written by any cell visible here; presumably it is left
# over from an earlier run (TODO confirm where it comes from).
# The context manager closes the file, and the builtin `input` is no longer shadowed.
with open('test.pkl', 'rb') as pickle_file:
    data1 = pickle.load(pickle_file)

In [6]:
# Peek at the first record loaded from the pickle file.
data1[0]

{'_id': '1519942756415643648',
 'author_id': 1455843063440424969,
 'text': 'Winter rains #Melbourne #melbourneweather #iPhone13Pro https://t.co/ZeAp0h5aGO',
 'created_at': '2022-04-29 07:32:46+00:00',
 'geo': {'place_id': '13ecc33734165000'}}

In [7]:
# Store the pickled records; save_to_couchDB skips ids already present in the database.
save_to_couchDB(client, data1, db_name)

Connected to CouchDB
0 tweets is successfully saved to CouchDB


In [9]:
# Handle to the tweet database, used for ad-hoc inspection.
db = client.get(db_name)

In [17]:
# Id of the first row returned by all_docs().
db.all_docs()['rows'][0]['id']

'1519936848461983745'

In [18]:
# Membership test by document id -- the same kind of check save_to_couchDB uses to skip duplicates.
'1519936848461983745' in db

True

In [None]:
# Fetch a single document by its id (this cell has no recorded execution or output).
ss = db.get('1519924999435747328')