![header](../../img/logo.svg)

**NLP Demo**

In [1]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv

---

## Set ticker

In [2]:
# Set tickers
ticker = 'KO'

---

## NLTK sentiment analysis methods

In [3]:
# enter debugging mode
debug = True

In [4]:
# Get company name from ticker
# Source: https://stackoverflow.com/questions/38967533/retrieve-company-name-with-ticker-symbol-input-yahoo-or-google-api
def get_company_name(ticker):
    url = "http://d.yimg.com/autoc.finance.yahoo.com/autoc?query={}&region=1&lang=en".format(ticker)
    result = requests.get(url).json()
    for x in result['ResultSet']['Result']:
        if x['symbol'] == ticker:
            return x['name']

In [5]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Instantiate the lemmatizer
lemmatizer = WordNetLemmatizer()

# Create a list of stopwords
sw = set(stopwords.words('english'))

# Expand the default stopwords list if necessary
custom_sw = ['char','tec','inc','say','via','bos']

def parse_corpus(corpus, datatype = str):
    # Tokenize
    if datatype == 'news':
        tokens = [tokenizer(text) for text in corpus['content']]
    elif datatype == 'tweets':
        tokens = [tokenizer(text) for text in corpus]

    # Flatten the list
    flat_tokens = [item for sublist in tokens for item in sublist]
    
    if debug:
        print(f"Number of tokens : {len(flat_tokens)}")

    # String to tokens
    str_tokens = ' '.join(flat_tokens)
    
    return str_tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\illya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\illya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\illya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
from nltk.tokenize import word_tokenize
import re

# Tokenizez method
def tokenizer(text):
    """Tokenizes text."""
    
    # Remove the punctuation from text
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', text)
    
    # Create a tokenized list of the words
    words = word_tokenize(re_clean)
    
    # Lemmatize words into root words
    lem = [lemmatizer.lemmatize(word) for word in words]
   
    # Convert the words to lowercase
    lem_lower = [word.lower() for word in lem]
    
    # Remove the stop words
    tokens_sw = [word for word in lem_lower if word not in sw]
    
    # Remove custom stop words
    tokens_csw = [word for word in tokens_sw if word not in custom_sw]
    
    # Remove all words less than 3 chars long
    tokens = [word for word in tokens_csw if len(word) > 2]
    
    return tokens

## Google Cloud Sentiment Analysis Method

In [7]:
from google.cloud import language_v1
from google.oauth2.credentials import Credentials

# Google Sentiment Analysis requires Google API key
# Directions on how to set up the API key: https://cloud.google.com/docs/authentication/getting-started

def GetSentimentAnalysisGoogle(text_content):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '../resources/aic_nlp.json'
    client = language_v1.LanguageServiceClient()
    type_ = language_v1.Document.Type.PLAIN_TEXT
    document = {'content': text_content, 'type_': type_}
    encoding_type = language_v1.EncodingType.UTF8
    response = client.analyze_sentiment(request={'document': document, 'encoding_type': encoding_type})
    return {'score' : response.document_sentiment.score , 'magnitude' : response.document_sentiment.magnitude}

---

## News Sentiment

In [8]:
# NewsAPI requries an API key
# Directions on how to get NewsAPI API key: https://newsapi.org/

# load API keys
load_dotenv('../resources/api_keys.env')

# set NewsAPI API key
newsapi_api_key = os.getenv("NEWSAPI_API_KEY")


if debug:
    print(f"API Key Test:")
    print(f"NEWSAPI_API_KEY: {type(newsapi_api_key)}")

API Key Test:
NEWSAPI_API_KEY: <class 'str'>


In [9]:
# Load NewsAPI key
from newsapi import NewsApiClient
newsapi = NewsApiClient(newsapi_api_key)

In [10]:
# Set daterange
# Set market data date range 
from datetime import date, datetime, timedelta

end_date  = datetime.now()
start_date  = (end_date - timedelta(days=60)).strftime('%Y-%m-%d')
end_date = end_date.strftime('%Y-%m-%d')

print(f"Start date : {start_date}")
print(f"End date : {end_date}")

Start date : 2021-05-15
End date : 2021-07-14


In [11]:
# Fetch the Articles
def get_articles(ticker):
    company_name = get_company_name(ticker)

    article_n = 20 #100 maximum allowed by free NewsAPI account
    pages = int(article_n / 20)
    btc_news = []

    # Get 5 pages of articles (100 articles)
    for i in list(range(1,pages+1)):
        articles = newsapi.get_everything(q=company_name, language='en', page=i)['articles']
        btc_news.append(articles)

    corpus = [y for x in btc_news for y in x]

    if debug:
        print(f"Total articles about {company_name}: {len(corpus)}")

    return corpus

In [12]:
# Get all articles
corpus = pd.DataFrame(get_articles(ticker))

Total articles about The Coca-Cola Company: 20


In [13]:
# get token string
news_str_tokens = parse_corpus(corpus, datatype = 'news')

Number of tokens : 4659


In [14]:
# get news sentiment
news_sentiment_score = GetSentimentAnalysisGoogle(news_str_tokens)

In [15]:
print(f"News sentiment score for {get_company_name(ticker)} = {news_sentiment_score['score']}")

News sentiment score for The Coca-Cola Company = -0.20000000298023224


---

## Twitter Sentiment

In [16]:
# Twitter API requires API keys
# Directions to set up Twitter API keys: https://developer.twitter.com/en/docs/getting-started

# load API keys
load_dotenv('../resources/api_keys.env')

# set NewsAPI API key
twitter_api_key = os.getenv("TWITTER_API_KEY")
twitter_secret = os.getenv("TWITTER_API_SECRET_KEY")
twitter_access_token = os.getenv("TWITTER_ACCESS_TOKEN")
twitter_access_token_secret = os.getenv("TWITTER_ACCESS_TOKEN_SECRET")

if debug:
    print(f"API Key Test:")
    print(f"TWITTER_API_KEY: {type(twitter_api_key)}")
    print(f"TWITTER_API_SECRET_KEY: {type(twitter_secret)}")
    print(f"TWITTER_ACCESS_TOKEN: {type(twitter_access_token)}")
    print(f"TWITTER_ACCESS_TOKEN_SECRET: {type(twitter_access_token_secret)}")

API Key Test:
TWITTER_API_KEY: <class 'str'>
TWITTER_API_SECRET_KEY: <class 'str'>
TWITTER_ACCESS_TOKEN: <class 'str'>
TWITTER_ACCESS_TOKEN_SECRET: <class 'str'>


In [17]:
import tweepy

# Twitter tweepy method
def ReturnTwitterData(hashtag,number_tweets):
    auth = tweepy.OAuthHandler(twitter_api_key, twitter_secret)
    auth.set_access_token(twitter_access_token, twitter_access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)
    hashtag = hashtag + " -filter:retweets"
    number_tweets = number_tweets
    return tweepy.Cursor(api.search, q=hashtag).items(number_tweets)

In [18]:
# get tweets
hashtag = '$'+ticker
tweets = ReturnTwitterData(hashtag,200)

In [19]:
tweet_list = []
for tweet in tweets:
    if tweet.lang == "en":
        text = tweet.text
        text = text.split('https://',1)[0]
        text = text.split(',',1)[0]
        tweet_list.append(text)

In [20]:
# get token string
tweet_str_tokens = parse_corpus(tweet_list, datatype = 'tweets')

Number of tokens : 1125


In [21]:
# get news sentiment
tweet_sentiment_score = GetSentimentAnalysisGoogle(tweet_str_tokens)

In [22]:
print(f"Twitter sentiment score for {get_company_name(ticker)} = {tweet_sentiment_score['score']}")

Twitter sentiment score for The Coca-Cola Company = -0.4000000059604645
