# Program for collecting Twitter user account properties for bot detection

- The auth keys obtained from Twitter are kept hidden.
- The results of this program can be reproduced by obtaining new auth keys and placing them in the `key` list
defined in a later cell.

In [None]:
import tweepy
import datetime as dt
import copy
import numpy as np

# Per-weekday tweet tally, keyed by datetime.weekday() index (0 through 6).
# update_dow_ratios() increments it and mine_data() later divides it by the
# number of tweets parsed.
dow_ratios = dict.fromkeys(range(7), 0)

def mine_data(screenname, api):
    """Collect profile and timeline features for one Twitter account.

    Args:
        screenname: Screen name passed to ``api.get_user``.
        api: Tweepy API client; ``get_user`` and ``user_timeline`` are used.

    Returns:
        An empty list when the account is protected or when 50 or fewer
        tweets could be read. Otherwise a list laid out as::

            [0] screen_name          [7]  hashtags per tweet
            [1] age in seconds       [8]  user mentions per tweet
            [2] followers / friends  [9]  location
            [3] favourites per day   [10] description
            [4] statuses per day     [11] verified
            [5] account reputation   [12] user id
            [6] average tweets per active day

    Side effects:
        Prints progress messages and, for unprotected accounts, overwrites
        the module-level ``dow_ratios`` with this account's per-weekday
        tweet fractions.
    """
    user = api.get_user(screenname)

    print("Screen name - ", screenname)
    description = user.description
    print("USER DESCRIPTION - ", description)

    # Account age in seconds. Calling .timestamp() on a naive datetime makes
    # Python interpret it as local time, which skews the age by the local UTC
    # offset. NOTE(review): naive values are assumed to be UTC (Twitter
    # reports created_at in UTC) -- confirm for the Tweepy version in use.
    created_at = user.created_at
    if created_at.tzinfo is None:
        created_at = created_at.replace(tzinfo=dt.timezone.utc)
    age = (dt.datetime.now(dt.timezone.utc) - created_at).total_seconds()

    followers = user.followers_count
    friends = user.friends_count

    # Falls back to 1 when the account follows nobody (avoids dividing by 0).
    in_out_ratio = followers / friends if friends != 0 else 1

    seconds_per_day = 86400
    favourites_ratio = seconds_per_day * user.favourites_count / age
    status_ratio = seconds_per_day * user.statuses_count / age

    # Share of followers among all connections; 0 when there are none.
    connections = followers + friends
    acct_rep = followers / connections if connections != 0 else 0

    if user.protected:
        print("Protected Account: {}".format(screenname))
        return []

    # Start every account from a clean weekday tally; otherwise counts would
    # be added on top of the previous account's already-normalised ratios.
    for day in dow_ratios:
        dow_ratios[day] = 0

    tweets_parsed = 0
    hashtags_recorded = 0
    user_mentions_recorded = 0
    tweets_per_day = []  # one counter per run of tweets sharing a date
    cur_date = None

    timeline = tweepy.Cursor(api.user_timeline, id=user.id, tweet_mode='extended')
    for tweet in timeline.items(1000):
        update_dow_ratios(tweet.created_at.weekday())

        hashtags_recorded += len(tweet.entities['hashtags'])
        user_mentions_recorded += len(tweet.entities['user_mentions'])

        # Compare full calendar dates: comparing only the day-of-month would
        # merge e.g. 5 January with 5 December into a single "day".
        tweet_date = tweet.created_at.date()
        if tweet_date != cur_date:
            cur_date = tweet_date
            tweets_per_day.append(0)
        tweets_per_day[-1] += 1

        tweets_parsed += 1

    if tweets_parsed == 0:
        print('No tweets parsed, skipping record...')
        return []

    if tweets_parsed <= 50:
        print('Not enough tweet data, skipping record...')
        return []

    # Turn raw weekday counts into fractions of the parsed tweets.
    for day in dow_ratios:
        dow_ratios[day] = dow_ratios[day] / tweets_parsed

    hashtags_ratio = hashtags_recorded / tweets_parsed
    user_mentions_ratio = user_mentions_recorded / tweets_parsed
    avg_tpd = np.average(tweets_per_day)

    return [
        user.screen_name,
        age,
        in_out_ratio,
        favourites_ratio,
        status_ratio,
        acct_rep,
        avg_tpd,
        hashtags_ratio,
        user_mentions_ratio,
        user.location,
        description,
        user.verified,
        user.id,
    ]


def update_dow_ratios(weekday):
    """Add one to the module-level ``dow_ratios`` tally for ``weekday``.

    Args:
        weekday: Key into ``dow_ratios``; callers pass the result of
            ``datetime.weekday()``.
    """
    dow_ratios[weekday] = dow_ratios[weekday] + 1

In [None]:
def get_data(screenname, api):
    """Run ``mine_data`` for one account, turning Tweepy errors into ``[]``.

    Args:
        screenname: Screen name forwarded to ``mine_data``.
        api: Tweepy API client forwarded to ``mine_data``.

    Returns:
        Whatever ``mine_data`` returns (also printed), or an empty list when
        a ``tweepy.TweepError`` is raised; the error itself is printed.
    """
    try:
        record = mine_data(screenname, api)
    except tweepy.TweepError as err:
        print(err)
        return []

    print(record)
    return record

In [None]:
import tweepy
from tweepy import OAuthHandler
import csv

def get_api(consumer_key, consumer_secret, access_token, access_secret):
    """Build a Tweepy API client from OAuth credentials.

    Args:
        consumer_key: Consumer key given to ``OAuthHandler``.
        consumer_secret: Consumer secret given to ``OAuthHandler``.
        access_token: Access token set on the handler.
        access_secret: Access token secret set on the handler.

    Returns:
        A ``tweepy.API`` instance created with the ``wait_on_rate_limit``,
        ``wait_on_rate_limit_notify`` and ``compression`` options enabled.
    """
    handler = OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_token, access_secret)

    client_options = {
        'wait_on_rate_limit': True,
        'wait_on_rate_limit_notify': True,
        'compression': True,
    }
    return tweepy.API(handler, **client_options)

In [None]:
# Placeholder credentials, in the positional order get_api() expects:
# consumer key, consumer secret, access token, access token secret.
key = [
    'xxxx',
    'xxxx',
    'xxxx-xxxx',
    'xxxxxxxxx',
]

# Shared client used by lookup().
api = get_api(*key)

In [None]:
def lookup(user_id):
    """Fetch the feature record for one account using the module-level ``api``.

    Args:
        user_id: Identifier forwarded to ``get_data`` as its ``screenname``
            argument.

    Returns:
        The value returned by ``get_data``.
    """
    return get_data(user_id, api)

In [None]:
# Read screen names from the input CSV, look each account up, and write one
# row per account: successful lookups go to the "ForBotTest" file, accounts
# that produced no data go to the "SuspendedUsers" file as placeholder rows.
#
# The input is opened read-only (it is never written), and every file is
# opened with newline='' as the csv module requires, so that line endings and
# embedded newlines are handled by the csv reader/writer itself.
with \
        open('DownsampledCoreData.csv',
             'r',
             encoding="utf-8",
             newline='') as inp, \
        open('DownsampledCoreDataForBotTest.csv',
             'w',
             encoding="utf-8",
             newline='') as out, \
        open('DownsampledCoreDataSuspendedUsers.csv',
             'w',
             encoding="utf-8",
             newline='') as out2:

    reader = csv.DictReader(inp)

    # Column order shared by both output files.
    my_fields = ['id',
                 'screen_name',
                 'location',
                 'age',
                 'in_out_ratio',
                 'favorites_ratio',
                 'status_ratio',
                 'account_rep',
                 'avg_tpd',
                 'hashtags_ratio',
                 'user_mentions_ratio',
                 'description',
                 'verified',
                 'bot']

    writer = csv.DictWriter(out, fieldnames=my_fields)
    writer.writeheader()

    writer2 = csv.DictWriter(out2, fieldnames=my_fields)
    writer2.writeheader()

    for row in reader:
        # Quotes are stripped for the lookup only; the name is written back
        # exactly as it appeared in the input.
        data = lookup(row['screenname'].replace('"', ''))

        if not data:
            print('Skipping record!')
            # Placeholder row: every column 0 except the name and location.
            placeholder = dict.fromkeys(my_fields, 0)
            placeholder['screen_name'] = row['screenname']
            placeholder['location'] = 'NaN'
            writer2.writerow(placeholder)
        else:
            # Indices follow the list layout returned by mine_data().
            writer.writerow({'id': data[12],
                             'screen_name': row['screenname'],
                             'location': data[9],
                             'age': data[1],
                             'in_out_ratio': data[2],
                             'favorites_ratio': data[3],
                             'status_ratio': data[4],
                             'account_rep': data[5],
                             'avg_tpd': data[6],
                             'hashtags_ratio': data[7],
                             'user_mentions_ratio': data[8],
                             'description': data[10],
                             'verified': data[11],
                             'bot': 0})