# Get user information from tweets

Based on a tweet dataset, we want to gather the Twitter user information of everyone involved and use it in our analysis.
The main goal of this Notebook is to get the user information of every user who either sent a tweet or is mentioned in a tweet.

In this Notebook, we will do the following:
1. [Get the user information of every user that sent a tweet in the dataset](#section-1)
2. [Get the user information of every mentioned user](#section-2)

In [3]:
# Import necessary packages
import pandas as pd

# Load the processed tweet dataset from its pickle file into a DataFrame
data = pd.read_pickle(
    "~/Documents/Github Repository/early-warning-twitter/Processed datasets/Tweets/01-06-2020-amsterdam-demonstration.pkl"
)

<a id="section-1"></a>
## 1. Get the user information of every user that sent a tweet in the dataset
For every user that tweeted in the dataset, we will get all the user information and store it in a DataFrame.

In [4]:
import tweepy 
  
# Twitter API credentials: fill in the four values before running this cell
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

# Build the OAuth handler from the consumer key/secret and attach the
# access token/secret to it
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Create the API client that is used for every user lookup below;
# both rate-limit options (wait and notify) are switched on
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

In [None]:
# Two dataframes for storing the results of the user lookups
# users holds the profile information of every account the API returned
# not_users holds the screen names whose lookup raised an API error
# (for example because the account no longer exists or is suspended)
users = pd.DataFrame(columns = ['screen_name', 'followers_count', 'friends_count', 'listed_count', 'created_at', 'favourites_count', 'verified'])
not_users = pd.DataFrame(columns = ['screen_name'])

# Counters for the progress message that is printed every 10.000 tweets
tweets = 0
tweet_benchmark = 10000

# Next free row label in users (o) and in not_users (l)
o = 0
l = 0

# Screen names that were already looked up, so every user costs at most one API call.
# A set is used because the membership test runs once per tweet.
# It is kept separate from the users dataframe, because the API sometimes converts
# 'minPres' to 'minpres' (and then the name is not found in the dataframe)
treated_users = set()

# For every tweet, get the information of the user that sent it.
# The Series is iterated directly: the index label is not needed, and
# Series.iteritems() no longer exists in pandas 2.
for screen_name in data["user_screen_name"]:
    tweets = tweets + 1

    # As soon as another 10.000 tweets have been analyzed, print out an alert
    if tweets >= tweet_benchmark:
        tweet_benchmark = tweet_benchmark + 10000
        print("We already analyzed "+ str(tweets) + " tweets!")

    # Skip tweets without a user (None or NaN) and users that were already treated
    if pd.isna(screen_name) or screen_name in treated_users:
        continue
    treated_users.add(screen_name)

    # Only the API call itself is guarded, so that an error raised while
    # filling the dataframe is not mistaken for a failed lookup
    try:
        # The name is passed by keyword so that it is always treated as a
        # screen name and never as a numeric user id
        user = api.get_user(screen_name=screen_name)
    except tweepy.TweepError as e:
        # NOTE(review): TweepError is the Tweepy 3.x exception class; Tweepy 4
        # renamed it to TweepyException - adjust when upgrading
        # Print screen_name and error
        print(screen_name)
        print(e)
        # Store screen_name in not_users dataframe
        not_users.loc[l,'screen_name'] = screen_name
        l = l+1
        continue

    # Store the user information in the next free row of the users dataframe
    users.loc[o,'screen_name'] = user.screen_name
    users.loc[o,'tweet_count'] = user.statuses_count
    users.loc[o,'followers_count'] = user.followers_count
    users.loc[o,'friends_count'] = user.friends_count
    users.loc[o,'listed_count'] = user.listed_count
    users.loc[o,'created_at'] = user.created_at
    users.loc[o,'favourites_count'] = user.favourites_count
    users.loc[o,'verified'] = user.verified
    users.loc[o,'description'] = user.description
    o = o+1

# Drop duplicates in the users and not_users dataframes
users.drop_duplicates(subset='screen_name', keep="first", inplace=True)
not_users.drop_duplicates(subset='screen_name', keep="first", inplace=True)

# Remove the enters from the description so that we can save the dataframe as a CSV.
# A double newline, a single newline and a carriage return are each replaced by one
# space; the carriage return is included because it breaks a CSV row just like a newline.
users['description'] = users['description'].replace(r'\n\n|\n|\r', ' ', regex=True)

def var_to_lower(row, variable):
    """Return the lowercase version of the text stored in row[variable].

    Args:
        row: Mapping-like record (used below with DataFrame.apply and axis=1).
        variable: Key of the value that has to be lowercased.

    Returns:
        The lowercased text, or None when the stored value is not a string
        (for example a missing description).
    """
    text = row[variable]
    # isinstance instead of an exact type comparison, so string subclasses
    # are lowercased as well
    if isinstance(text, str):
        return text.lower()
    return None

# Make the screen_name variable lowercase
users["screen_name"] = users.apply(var_to_lower, args=("screen_name",), axis=1)

# Make description_lower variable with the description in lower case
users["description_lower"] = users.apply(var_to_lower, args=("description",), axis=1)

# Reset the index of the users dataframe.
# reset_index(inplace=True) returns None, so assigning that result back would
# replace users by None; the non-inplace form returns the re-indexed dataframe.
users = users.reset_index(drop=True)

In [5]:
# Export the users that tweeted, once as CSV and once as pickle
users.to_csv(
    "~/Documents/Github Repository/early-warning-twitter/Processed datasets/Users/01-06-2020-amsterdam-demonstration-all-users-that-tweeted.csv"
)
users.to_pickle(
    "~/Documents/Github Repository/early-warning-twitter/Processed datasets/Users/01-06-2020-amsterdam-demonstration-all-users-that-tweeted.pkl"
)

<a id="section-2"></a>
## 2. Get the user information of every user that is mentioned in the dataset
For every user that is mentioned in the dataset, we will get all the user information and store it in a DataFrame.

In [None]:
# Two dataframes for storing the results of the lookups of mentioned users
# mentioned_users holds the profile information of every mentioned account that was found
# mentioned_not_users holds the mentioned names whose lookup raised an API error
# (for example because the account does not exist or is suspended)
mentioned_users = pd.DataFrame(columns = ['screen_name', 'followers_count', 'friends_count', 'listed_count', 'created_at', 'favourites_count', 'verified'])
mentioned_not_users = pd.DataFrame(columns = ['screen_name'])

# Counters for the progress message that is printed every 10.000 tweets
tweets = 0
tweet_benchmark = 10000

# Next free row label in mentioned_users (o) and in mentioned_not_users (l)
o = 0
l = 0

# Columns of mentioned_users in the order in which they are written, each
# paired with the attribute of the API user object that fills it
profile_fields = [
    ('screen_name', 'screen_name'),
    ('tweet_count', 'statuses_count'),
    ('followers_count', 'followers_count'),
    ('friends_count', 'friends_count'),
    ('listed_count', 'listed_count'),
    ('created_at', 'created_at'),
    ('favourites_count', 'favourites_count'),
    ('verified', 'verified'),
    ('description', 'description'),
]

# Row label in the users dataframe for every screen name we already have
# information on (first occurrence wins). The keys are lowercase, because the
# screen names in users were lowercased in section 1 while a mention keeps the
# casing that was used in the tweet.
known_users = {}
for row_label, known_name in users['screen_name'].items():
    if isinstance(known_name, str):
        known_users.setdefault(known_name.lower(), row_label)

# Lowercase names of the mentioned users that were already treated.
# Makes sure that if @minPres is already handled, we will not handle it (or
# @minpres) again. A set is used because the test runs once per mention.
treated_users = set()

# For every tweet, get the information of every user that is mentioned in it.
# The Series is iterated directly: the index label is not needed, and
# Series.iteritems() no longer exists in pandas 2.
for user_mentions in data["user_mentions"]:
    tweets = tweets + 1

    # As soon as another 10.000 tweets have been analyzed, print out an alert
    if tweets >= tweet_benchmark:
        tweet_benchmark = tweet_benchmark + 10000
        print("We already analyzed "+ str(tweets) + " tweets!")

    # Skip tweets without mentioned users
    if user_mentions is None:
        continue

    # For every user in user_mentions
    # (assumes every mention is a screen-name string - TODO confirm)
    for screen_name in user_mentions:
        lookup_name = screen_name.lower()

        # Skip the user if we already treated it
        if lookup_name in treated_users:
            continue
        treated_users.add(lookup_name)

        # Reuse the information from the users dataframe when we have it
        if lookup_name in known_users:
            index_row = known_users[lookup_name]
            for column, _ in profile_fields:
                mentioned_users.loc[o, column] = users.loc[index_row, column]
            o = o+1
            continue

        # Otherwise ask the API. Only the call itself is guarded, so that an
        # error raised while filling the dataframe is not mistaken for a failed lookup.
        try:
            # The name is passed by keyword so that it is always treated as a
            # screen name and never as a numeric user id
            user = api.get_user(screen_name=screen_name)
        except tweepy.TweepError as e:
            # NOTE(review): TweepError is the Tweepy 3.x exception class; Tweepy 4
            # renamed it to TweepyException - adjust when upgrading
            # Print screen_name and error
            print(screen_name)
            print(e)
            # Store screen_name in mentioned_not_users dataframe
            mentioned_not_users.loc[l,'screen_name'] = screen_name
            l = l+1
            continue

        # Store the user information in the next free row of mentioned_users
        for column, attribute in profile_fields:
            mentioned_users.loc[o, column] = getattr(user, attribute)
        o = o+1

# Drop duplicates in the mentioned_users and mentioned_not_users dataframes
mentioned_users.drop_duplicates(subset='screen_name', keep="first", inplace=True)
mentioned_not_users.drop_duplicates(subset='screen_name', keep="first", inplace=True)

# Remove the enters from the description so that we can save the dataframe as a CSV.
# A double newline, a single newline and a carriage return are each replaced by one space.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# mentioned_users['description'] modifies a selected column object, which recent pandas
# versions warn about and which does not update the dataframe under Copy-on-Write.
mentioned_users['description'] = mentioned_users['description'].replace(r'\n\n|\n|\r', ' ', regex=True)

def var_to_lower(row, variable):
    """Return the lowercase version of the text stored in row[variable].

    Args:
        row: Mapping-like record (used below with DataFrame.apply and axis=1).
        variable: Key of the value that has to be lowercased.

    Returns:
        The lowercased text, or None when the stored value is not a string
        (for example a missing description), instead of raising an
        AttributeError on such a value.
    """
    text = row[variable]
    if isinstance(text, str):
        return text.lower()
    return None

# Replace every screen name by its lowercase version
mentioned_users["screen_name"] = mentioned_users.apply(
    var_to_lower, axis=1, args=("screen_name",)
)

# Add description_lower: the description in lower case
mentioned_users["description_lower"] = mentioned_users.apply(
    var_to_lower, axis=1, args=("description",)
)

# Renumber the rows in place and discard the old index
mentioned_users.reset_index(drop=True, inplace=True)

# Write the mentioned users to disk, once as CSV and once as pickle
mentioned_users.to_csv(
    "~/Documents/Github Repository/early-warning-twitter/Processed datasets/Users/01-06-2020-amsterdam-demonstration-all-users-that-mentioned.csv"
)
mentioned_users.to_pickle(
    "~/Documents/Github Repository/early-warning-twitter/Processed datasets/Users/01-06-2020-amsterdam-demonstration-all-users-that-mentioned.pkl"
)