# Data Gathering From Social Media     

This notebook walks through the process of gathering tweets from a given Twitter account.

In [8]:
#!pip install inflect
#!pip install scispacy

In [4]:
"""
Useful Libraries
"""
import os
import re
import string, unicodedata
import sys
from datetime import datetime

import inflect
import joblib
import nltk
import numpy as np
import pandas as pd
import spacy
import tqdm
import tweepy
from dateutil import parser
from scipy.spatial.distance import jensenshannon
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
from tweepy import Cursor
from tweepy import OAuthHandler

In [None]:
"""
Twitter Authentication Credentials.
You must replace the following variables with their corresponding values 
from your twitter account.
"""
# Each credential is read from the environment variable named on its line, so
# real secrets do not have to be saved inside the notebook. When a variable is
# not set, the placeholder string is used and must be replaced by hand.
cons_key = os.environ.get('TWITTER_CONSUMER_KEY', 'xxxxxxxxxxxxxxxxxxxxxxxx')
cons_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'xxxxxxxxxxxxxxxxxxxxxxxx')
acc_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'xxxxxxxxxxxxxxxxxxxxxxxx')
acc_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'xxxxxxxxxxxxxxxxxxxxxxxx')

In [12]:
"""
Useful functions 
"""

def get_twitter_auth():
    """Build the tweepy OAuth handler from the notebook-level credentials.

    Reads the module-level names ``cons_key``, ``cons_secret``, ``acc_token``
    and ``acc_secret``.

    Returns:
        The ``tweepy.OAuthHandler`` created from the consumer key/secret, with
        the access token/secret set on it.

    Raises:
        SystemExit: with exit code 1 when any of the four credential names is
            not defined (for example, the credentials cell was never run).
    """
    try:
        consumer_key = cons_key
        consumer_secret = cons_secret
        access_token = acc_token
        access_secret = acc_secret

    except NameError:
        # Reading an undefined global raises NameError, never KeyError, so the
        # former `except KeyError` handler could not be reached.
        sys.stderr.write("Twitter Environment Variable not Set\n")
        sys.exit(1)

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)

    return auth


def get_twitter_client():
    """Return a tweepy API client authenticated through get_twitter_auth().

    The client is constructed with ``wait_on_rate_limit=True``.
    """
    return tweepy.API(get_twitter_auth(), wait_on_rate_limit=True)


"""
This function gets tweets from a username.
The total number of tweets the API can retrieve is limited to 3200.
The count is made so that count_tweet x page_limit = 3200
"""
def get_tweets_from_user(twitter_user_name, page_limit=16, count_tweet=200):
    """Collect the timeline tweets of one Twitter user.

    Args:
        twitter_user_name: screen name passed to ``client.user_timeline``.
        page_limit: maximum number of pages requested from the cursor.
        count_tweet: ``count`` value passed to ``client.user_timeline``.

    Returns:
        A list with one dict per tweet, holding the keys 'date', 'author',
        'twitter_name', 'text', 'number_of_likes' and 'number_of_retweets'.
    """
    timeline = Cursor(
        get_twitter_client().user_timeline,
        screen_name=twitter_user_name,
        count=count_tweet,
    )

    # Flatten the pages into a single list of plain dicts.
    return [
        {
            'date': status.created_at,
            'author': status.user.name,
            'twitter_name': status.user.screen_name,
            'text': status.text,
            'number_of_likes': status.favorite_count,
            'number_of_retweets': status.retweet_count,
        }
        for page in timeline.pages(page_limit)
        for status in page
    ]


"""
This function formats a tweet date into a day/month/year format
example: 
    - original date: "2020-08-26 15:00:02"
    - formatted date: '26/08/2020'
"""
def format_tweet_date(date):
    """Parse `date` with dateutil and return it as a 'dd/mm/YYYY' string."""
    parsed = parser.parse(date)
    return parsed.strftime("%d/%m/%Y")

In [13]:
"""
Useful functions for date formatting
"""
def get_day(date):
    """Return the day of the month of a tweet timestamp.

    Args:
        date: a string in '%Y-%m-%d %H:%M:%S' format, any other string accepted
            by ``datetime.fromisoformat`` (e.g. with a '+00:00' offset or
            fractional seconds), or a ``datetime`` instance.

    Returns:
        The day as an int.

    Raises:
        ValueError: if a string matches neither accepted form.
    """
    if isinstance(date, datetime):
        return date.day
    try:
        dt = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # str() of a timezone-aware or sub-second datetime carries a suffix the
        # fixed format rejects; fromisoformat parses those forms.
        dt = datetime.fromisoformat(date)
    return dt.day

def get_month(date):
    """Return the month number (1-12) of a tweet timestamp.

    Args:
        date: a string in '%Y-%m-%d %H:%M:%S' format, any other string accepted
            by ``datetime.fromisoformat`` (e.g. with a '+00:00' offset or
            fractional seconds), or a ``datetime`` instance.

    Returns:
        The month as an int.

    Raises:
        ValueError: if a string matches neither accepted form.
    """
    if isinstance(date, datetime):
        return date.month
    try:
        dt = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # str() of a timezone-aware or sub-second datetime carries a suffix the
        # fixed format rejects; fromisoformat parses those forms.
        dt = datetime.fromisoformat(date)
    return dt.month

def get_year(date):
    """Return the four-digit year of a tweet timestamp.

    Args:
        date: a string in '%Y-%m-%d %H:%M:%S' format, any other string accepted
            by ``datetime.fromisoformat`` (e.g. with a '+00:00' offset or
            fractional seconds), or a ``datetime`` instance.

    Returns:
        The year as an int.

    Raises:
        ValueError: if a string matches neither accepted form.
    """
    if isinstance(date, datetime):
        return date.year
    try:
        dt = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # str() of a timezone-aware or sub-second datetime carries a suffix the
        # fixed format rejects; fromisoformat parses those forms.
        dt = datetime.fromisoformat(date)
    return dt.year

In [14]:
#get_tweets_from_user_print("vboykis", 2)

In [15]:
"""
Function used to create a dataframe from the tweets
"""
def create_df_from_tweets(twitter_user_name, page_limit=16, count_tweet=200):
    """Fetch a user's tweets and return them as a de-duplicated DataFrame.

    Args:
        twitter_user_name: screen name forwarded to get_tweets_from_user.
        page_limit: page limit forwarded to get_tweets_from_user.
        count_tweet: per-page count forwarded to get_tweets_from_user.

    Returns:
        A DataFrame with one row per distinct tweet text (first occurrence
        kept), holding the fields returned by get_tweets_from_user plus integer
        'year', 'month' and 'day' columns derived from 'date'. When no tweets
        are returned, an empty DataFrame with those columns.
    """
    list_tweets = get_tweets_from_user(twitter_user_name, page_limit, count_tweet)

    if not list_tweets:
        # pd.DataFrame([]) has no columns, so the "text" / "date" lookups below
        # would raise KeyError; return an empty frame with the expected schema.
        return pd.DataFrame(columns=[
            "date", "author", "twitter_name", "text",
            "number_of_likes", "number_of_retweets",
            "year", "month", "day",
        ])

    df = pd.DataFrame(list_tweets)

    # Remove duplicates:
    df = df.drop_duplicates(subset="text", keep='first')

    # Split the date into parts with the vectorised .dt accessor; this avoids a
    # per-row str()/strptime round-trip and also handles timezone-aware dates.
    tweet_dates = pd.to_datetime(df["date"])
    # Cast to int64 so the columns keep the dtype the per-row apply produced.
    df["year"] = tweet_dates.dt.year.astype("int64")
    df["month"] = tweet_dates.dt.month.astype("int64")
    df["day"] = tweet_dates.dt.day.astype("int64")

    return df

In [16]:
"""
Here we want to retrieve all the tweets from the Facebook AI Twitter account
"""
df = create_df_from_tweets(twitter_user_name="facebookai")

In [17]:
# Number of distinct tweets collected.
df.shape[0]

498

In [18]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,date,author,twitter_name,text,number_of_likes,number_of_retweets,year,month,day
0,2020-09-03 21:54:44,Facebook AI,facebookai,RT @fb_engineering: Congratulations to the @Hy...,0,6,2020,9,3
1,2020-08-31 18:28:25,Facebook AI,facebookai,"We’re releasing fairmotion, a library to help ...",636,185,2020,8,31
2,2020-08-31 17:13:07,Facebook AI,facebookai,"RT @schrep: Today we launched Opacus, a high-s...",0,112,2020,8,31
3,2020-08-31 16:31:41,Facebook AI,facebookai,"Introducing Opacus, a new high-speed library f...",182,60,2020,8,31
4,2020-08-27 03:47:07,Facebook AI,facebookai,We've achieved SoTA results in navigation &amp...,229,66,2020,8,27
