# Account Status
In this notebook we query the status of the users in the RT network with the Authors API.

- If the user is found, the API returns the information about the account
- If the user is suspended, the API returns an "account suspended" error
- If the user deleted their own account, the API returns a "not found" error

By distinguishing between the types of error, we can determine the status of every user.

We save dataframes with the columns `user` and `status` (the account status) in `folder_EU_AM`.

The API limit is 450 requests every 5 minutes, so when the limit is reached the program sleeps for 5 minutes before resuming.


In [10]:
import collections
import json
import os
import time
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tweepy
from pandas.core.common import flatten

In [6]:
#root folder with one sub-folder per period; the status dataframes are saved here too
folder_EU_AM = "/data/public/jlenti/multilang-vax/EuropeAmerica_RTCO"
#names of the period sub-folders: period1 ... period4
periods = ["period{}".format(number) for number in range(1, 5)]
#dataframe with all the selected (country, language) pairs, one pair per row
selected_pairs = pd.read_csv(
    "/home/jlenti/Files/country_langs_selected_2104.csv",
    index_col=0,
)


In [2]:
#keys and tokens for the Twitter REST API.
#They are read from environment variables so that no secret is stored in the
#notebook: export TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
#TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_TOKEN_SECRET before starting the kernel
#(a missing variable raises KeyError here, before any request is sent).
#NOTE: the credentials that used to be hardcoded in this cell must be considered
#leaked and should be revoked and regenerated.
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

#authenticated client used by users_status_list
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)


In [5]:
def users_status_list(usr, client=None, rate_limit_wait=60*5, max_retries=3, retry_wait=5):
    """Query the account status of every user in ``usr``.

    Each user is requested with ``client.get_user(user)`` and labelled according
    to the outcome:

    - ``"found"``: the request succeeded
    - ``"suspended"``: the error message contains "suspended"
    - ``"not found"``: the error message contains "not found"

    When the error message contains "Rate limit exceeded" the function prints the
    number of users processed so far, sleeps ``rate_limit_wait`` seconds and
    repeats the same request.

    Parameters
    ----------
    usr : iterable
        Users to query, passed one by one to ``get_user``.
    client : object, optional
        Object exposing ``get_user``; defaults to the notebook-level ``api``.
    rate_limit_wait : int or float, optional
        Seconds to sleep after a rate-limit error (default 5 minutes).
    max_retries : int, optional
        How many times a request failing with an unrecognised error is retried
        before the error is re-raised.
    retry_wait : int or float, optional
        Seconds to sleep between retries of an unrecognised error.

    Returns
    -------
    list of (user, status) tuples, in the iteration order of ``usr``.

    Raises
    ------
    tweepy.TweepError
        If a request keeps failing with an unrecognised error after
        ``max_retries`` retries.
    """
    if client is None:
        #default to the client built in the credentials cell
        client = api
    users_status = []
    for user in usr:
        #number of unrecognised errors seen for this user
        failures = 0
        #completed T/F is used to decide when to repeat the iteration
        completed = False
        while not completed:
            try:
                client.get_user(user)
            except tweepy.TweepError as e:
                message = str(e)
                if "suspended" in message:
                    users_status.append((user, "suspended"))
                    completed = True
                elif "not found" in message:
                    #the user does not exist and is not suspended (may be deleted)
                    users_status.append((user, "not found"))
                    completed = True
                elif "Rate limit exceeded" in message:
                    #wait for the rate limit to reset, then repeat the same user
                    print("Sleep ", len(users_status))
                    time.sleep(rate_limit_wait)
                else:
                    #any other error used to be retried forever without pausing;
                    #retry a bounded number of times, then surface the error
                    failures += 1
                    if failures > max_retries:
                        raise
                    time.sleep(retry_wait)
            else:
                #no error: the account exists
                users_status.append((user, "found"))
                completed = True
    return users_status
    

## Example

In [20]:
#country, language and period used in the example
country = "IT"
lang = "it"
period = "period1"

In [22]:
#keep only the first 40 users of the first matching RT file, so that the example runs quickly
users = (
    pd.read_csv(
        sorted(glob("{}/{}/{}*{}*RT*com*".format(folder_EU_AM, period, country, lang)))[0]
    )["user"]
    .tolist()[:40]
)

In [23]:
#request the status of the example users; the call sleeps 5 minutes each time the API rate limit is hit
status = users_status_list(users)

In [26]:
#turn the list of (user, status) tuples into a two-column dataframe and show its first rows
users_status = pd.DataFrame.from_records(status, columns=["user", "status"])
users_status.head()

Unnamed: 0,user,status
0,000Salvatore,suspended
1,CriticaScient,found
2,DavideFalchieri,not found
3,FmMosca,suspended
4,GavinoSanna1967,found


## All countries

In [None]:
for period in periods:
    for _, (country, lang) in selected_pairs.iterrows():
        #extract all users in RT network and CO network (first matching file of each network)
        users = set()
        for network in ("RT", "CO"):
            network_files = sorted(glob("/".join([folder_EU_AM, period,
                                                  "*".join([country, lang, network, "com", ""])])))
            users.update(pd.read_csv(network_files[0])["user"])
        #read the account status saved so far for the same country (any period),
        #to find the users whose status we already know
        status_files = sorted(glob("/".join([folder_EU_AM, "*", country + "*status*"])))
        #keep only the two data columns: the saved files also contain the index column
        known_status = [pd.read_csv(file)[["user", "status"]] for file in status_files]
        if known_status:
            previous_users_status = pd.concat(known_status, ignore_index=True)
            #the same user can appear in several files: keep one row per user of this network
            previous_users_status = (previous_users_status[previous_users_status["user"].isin(users)]
                                     .drop_duplicates(subset="user"))
        else:
            #no status file yet (first run): pd.concat would fail on an empty list
            previous_users_status = pd.DataFrame(columns=["user", "status"])
        #since the procedure is very time consuming, we request the status only of the users we don't know yet
        new_users = users - set(previous_users_status["user"])
        #request the status
        new_status = users_status_list(new_users)
        #build the pandas dataframe
        new_users_status = pd.DataFrame(new_status, columns=["user", "status"])
        #create a dataframe with the status of all the users of this period that we know
        users_status = pd.concat([previous_users_status, new_users_status], ignore_index=True)
        #save the dataframe
        users_status.to_csv("/".join([folder_EU_AM, period,
                                      "_".join([country, lang, period, "user", "status.csv"])]))
        