### Test the running time

For using our model on a website, we need to focus on if the whole predicted process can be done in a short time. So it needs to test the running time of each steps.

#### Step 1: crawl the user information from Twitter API and transfer it to the features we want to feed our classifiers

##### step 1.1: measure the running time of the features from Account:

Count of favorite tweets: directly from json favorite_count

Friends to follower ratio: json friends_count/followers_count

Total status count: json statuses_count

Default profile image: json default_profile_image -> transform into one hot vector 

Default profile: json default_profile -> transform into one hot vector (same as default profile image, just make them binary category vars)

Account ages: in real time = json created_at - the crawled date. In training set: json created_at - crawled_at

User name and screen_name = json:  count # of char and # of numbers. See this paper 1-s2.0-S016740481730250X-main “The assumption of this method is that the username pattern “letter + number” was highly correlated with spamming accounts”

Length of description: json len(description) 

Description text - TFIDF

Average tweet per day: In real time: statuses_count/(the crawled date - created_at).days() In training set: json statuses_count/(crawled_at - created_at).days() 

In [11]:
import twitter
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

api = twitter.Api(consumer_key=CONSUMER_KEY,
                     consumer_secret=CONSUMER_SECRET,
                     access_token_key=ACCESS_TOKEN_KEY,
                     access_token_secret=ACCESS_TOKEN_SECRET)


In [12]:
def get_user(user_id=None, screen_name=None):
    json = api.GetUser(user_id=user_id, screen_name=screen_name, include_entities=True, return_json=False)
    json_data = json._json
    
    # feature Count of favorite tweets
    Count_of_favorite_tweets = int(json_data['favourites_count'])
    # feature Friends to follower ratio
    Friends_to_follower_ratio = float(json_data['friends_count']) / json_data['followers_count']
    # feature Total status count
    Total_status_count = int(json_data['statuses_count'])
    
    # feature Default profile image & Default profile
    if json_data['default_profile_image'] == 'FALSE':
        Default_profile_image = 0
    else:
        Default_profile_image = 1

    if json_data['default_profile'] == 'FALSE':
        Default_profile = 0
    else:
        Default_profile = 1
    
    # feature Account ages
    created_at = json_data['created_at']
    Account_age = survival_time(created_at)

    # feature User name and screen_name
    User_name = json_data['name']
    Screen_name = json_data['screen_name']
    User_name_digit, User_name_char = counter(User_name)
    Screen_name_digit, Screen_name_char = counter(Screen_name)

    # feature Length of description and Description text
    description_pre = json_data['description']
    Description_length, Description_tfidf = preprocess_description(description_pre)
    
    # feature Average tweet per day
    Average_tweets_per_day = Total_status_count / float(Account_age)

    feature = list()
    feature.append(Count_of_favorite_tweets)
    feature.append(Friends_to_follower_ratio)
    feature.append(Total_status_count)
    feature.append(Default_profile_image)
    feature.append(Default_profile)
    feature.append(Account_age)
    feature.append(User_name_digit)
    feature.append(User_name_char)
    feature.append(Screen_name_digit)
    feature.append(Screen_name_char)
    feature.append(Description_length)
    feature.append(Description_tfidf)
    feature.append(Average_tweets_per_day)

    return feature

In [13]:
def survival_time(created_at):
    # get the account ages: crawl at time - created at time 
    current_time = time.localtime(time.time())
    current_year = current_time.tm_year
    current_month = current_time.tm_mon
    current_day = current_time.tm_mday

    meta = created_at.split(" ")
    created_month = meta[1]
    if created_month == 'Jan':
        created_month = int(1)
    elif created_month == 'Feb':
        created_month = int(2)
    elif created_month == 'Mar':
        created_month = int(3)
    elif created_month == 'Apr':
        created_month = int(4)
    elif created_month == 'May':
        created_month = int(5)
    elif created_month == 'Jun':
        created_month = int(6)
    elif created_month == 'Jul':
        created_month = int(7)
    elif created_month == 'Aug':
        created_month = int(8)
    elif created_month == 'Sep':
        created_month = int(9)
    elif created_month == 'Oct':
        created_month = int(10)
    elif created_month == 'Nov':
        created_month = int(11)
    elif created_month == 'Dec':
        created_month = int(12)
    created_day = int(meta[2])
    created_year = int(meta[5])

    Account_age = (current_year - created_year) * 365 + (current_month - created_month) * 30 + (current_day - created_day)

    return Account_age

In [14]:
def counter(name):
    # counter of char & counter of digit
    numbers = sum(c.isdigit() for c in name)
    words   = sum(c.isalpha() for c in name)

    return numbers, words

In [30]:
def preprocess_description(description):
    # preprocess the description text
    description_length = len(description)
    des_list = list()
    des_list.append(description)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(des_list)
    # X_train_counts.shape
    # tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    # X_train_tf = tf_transformer.transform(X_train_counts)
    # X_train_tf.shape
    tfidf_transformer = TfidfTransformer()
    des_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    
    return description_length, des_train_tfidf

In [31]:
def measure_running_time(user_id=None, screen_name=None):
    # measure the running time of getting the features from the input
    start = time.clock()
    get_user(user_id=user_id, screen_name=screen_name)
    end = time.clock()
    print('function took %0.5f ms' % ((end-start)*1000.0))

In [32]:
measure_running_time(screen_name='realDonaldTrump')

function took 31.81400 ms


In [33]:
feature = get_user(screen_name='realDonaldTrump')
print feature

[24, 8.95377203314376e-07, 37270, 1, 1, 3307, 0, 12, 0, 15, 50, <1x7 sparse matrix of type '<type 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>, 11.27003326277593]
