### DSPT6 - Unit 3 Module 2 - Consuming Data from an API

The purpose of this notebook is to demonstrate:
- Connect to the Twitter API (and twitter_scraper) to query for tweets and user information by various parameters
- Convert tweet text using SpaCy into numerical embeddings that can be use in a predictive model

In [1]:
# Import a way to connect with the Twitter API:
import tweepy

In [3]:
# Add your credentials here:
TWITTER_KEY = ''
TWITTER_SECRET = ''
TWITTER_TOKEN = ''
TWITTER_TOKEN_SECRET = ''

In [4]:
# Access Twitter API:
TWITTER_AUTH = tweepy.OAuthHandler(TWITTER_KEY, TWITTER_SECRET)
TWITTER_AUTH.set_access_token(TWITTER_TOKEN, TWITTER_TOKEN_SECRET)
TWITTER = tweepy.API(TWITTER_AUTH)

In [5]:
# Look at what is available in TWITTER class object:
dir(TWITTER)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_list_members',
 '_lookup_friendships',
 '_pack_image',
 '_remove_list_members',
 '_send_direct_message',
 'add_list_member',
 'add_list_members',
 'api_root',
 'auth',
 'blocks',
 'blocks_ids',
 'cache',
 'compression',
 'configuration',
 'create_block',
 'create_favorite',
 'create_friendship',
 'create_list',
 'create_media_metadata',
 'create_mute',
 'create_saved_search',
 'destroy_block',
 'destroy_direct_message',
 'destroy_favorite',
 'destroy_friendship',
 'destroy_list',
 'destroy_mute',
 'destroy_saved_search',
 'destroy_status',
 'favorites',
 'followers',
 'followers_ids',
 'friends',
 'friends_ids',
 'f

In [6]:
# Get a Twitter User infomation:
twitter_user = TWITTER.get_user('elonmusk')
twitter_user

User(_api=<tweepy.api.API object at 0x00000167151B0EE0>, _json={'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': '', 'profile_location': None, 'description': '', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 39211093, 'friends_count': 97, 'listed_count': 56328, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 6813, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': True, 'statuses_count': 12528, 'lang': None, 'status': {'created_at': 'Wed Oct 07 21:23:11 +0000 2020', 'id': 1313953038378434561, 'id_str': '1313953038378434561', 'text': '@TeslaGong @TeslaTested Yes', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'TeslaGong', 'name': 'Tesla in the Gong', 'id': 1008296232261783552, 'id_str': '1008296232261783552', 'indices': [0, 10]}, {'screen_name': 'TeslaTested', 'name': 'Zach', 'id': 1153084332078243

In [7]:
# Get the Twitter user's id number:
twitter_user.id

44196397

In [8]:
# Look at Elon Musk's timeline:
elon_tweets = twitter_user.timeline()
elon_tweets

use_background_image=True, has_extended_profile=True, default_profile=False, default_profile_image=False, following=False, follow_request_sent=False, notifications=False, translator_type='none'), geo=None, coordinates=None, place=None, contributors=None, is_quote_status=False, retweet_count=120, favorite_count=1527, favorited=False, retweeted=False, lang='en'),
 Status(_api=<tweepy.api.API object at 0x00000167151B0EE0>, _json={'created_at': 'Wed Oct 07 15:31:48 +0000 2020', 'id': 1313864611997855744, 'id_str': '1313864611997855744', 'text': '@raytech247 @WholeMarsBlog Even GM &amp; Chrysler went bankrupt in 2009. Ford and Tesla are the only US car companies t… https://t.co/ieaOuIsJRi', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'raytech247', 'name': 'Adam Raymer-Brown', 'id': 250481596, 'id_str': '250481596', 'indices': [0, 11]}, {'screen_name': 'WholeMarsBlog', 'name': 'Whole Mars Catalog', 'id': 1263491240336769026, 'id_str': '126

In [9]:
# Look at the first tweet in the list:
elon_tweets[0].text

'@TeslaGong @TeslaTested Yes'

In [10]:
# Look at the length of Elon's tweets:
len(elon_tweets)

20

In [11]:
# Get just Elon's tweets, no replies or retweets:
# Using this code the replies and retweets are included in the 200 count but 
#   will not return them, so you might not get a full 200 tweets.
# To get the max ID# need to run `elon_tweets[19].id` in another cell 
#   (the [19] should be replaced with the len - 1 number).
elon_tweets = twitter_user.timeline(count=200,  # Num of tweets to retrieve, max 200/request
                            exclude_replies=True,  # Don't include replies when 'True'
                            include_rts=False,  # Don't include retweets when 'False'
                            # max_id='',  # Get results with ID# >= specified ID#
                            tweet_mode='extended')  # Gives you the full text of tweet

# Look at the length of Elon's tweets now:
len(elon_tweets)

18

In [12]:
# View Elon's 2nd tweet (in extended mode need to use `.full_text` vs `.text` in reg mode):
elon_tweets[1].full_text

'5 minutes from launch. Looks good so far. https://t.co/on2f1pY5jt'

### SpaCy Embeddings
Will turn the text strings into numeric value which can be used later in a machine learning model.

In [13]:
import spacy

In [14]:
# Load the med English language package,
#   disabling the tagger and parser will help to speed up the process some.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser'])

In [15]:
# Add some random words to see if they are in the pre-trained vector:
tokens = nlp('dog cat banana boy Halloween fall cianeua')

for token in tokens:
    print(f'{token.text} - Is in vector = {token.has_vector} - Is not in vector = {token.is_oov}')

dog - Is in vector = False - Is not in vector = True
cat - Is in vector = False - Is not in vector = True
banana - Is in vector = False - Is not in vector = True
boy - Is in vector = False - Is not in vector = True
Halloween - Is in vector = False - Is not in vector = True
fall - Is in vector = False - Is not in vector = True
cianeua - Is in vector = False - Is not in vector = True


In [16]:
# Look at the dog vector
dog = nlp('dog')
dog.vector

array([], dtype=float32)

In [17]:
# Pass in the full text of the 2nd tweet to get the weighted average in a 300D vector,
#  this will then be passed into the machine learning model.
tweet_embedding = nlp(elon_tweets[1].full_text)
tweet_embedding.vector

array([], dtype=float32)

### Bringing it all together

In [18]:
# Will get the 300D vector for each tweet:
def vectorize_tweet(nlp, tweet_text):
    return nlp(tweet_text).vector

In [19]:
# Function to add or update user info in Database:
def add_or_update_user(username, nlp):
    try:
        twitter_user = TWITTER.get_user(username)

        tweets = twitter_user.timeline(count=200,
                            exclude_replies=True,
                            include_rts=False,
                            tweet_mode='extended')
        
        embeddings = vectorize_tweet(nlp, tweets[0].full_text)

    except Exception as e:
        print(f'Error while processing {username}: {e}')
    
    return tweets, embeddings

#### Load and Save SpaCy Model

In [20]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()
nlp.to_disk('../spacy_sm_model/')