In [79]:
import requests, time, re, random,hashlib
from bs4 import BeautifulSoup
import tweepy
from nltk import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

### Import the utilities here

In [80]:
import utilities

### We will create a class called Company to hold all the information we have about each company

In [81]:
class Company(object):

    def __init__(self, name):
        self.name = name
        self.articles = None
        self.tweets = None
        self.articles_sentiments = None
        self.tweets_sentiments = None
        self.investment = None
        
    def __str__(self):
        return str(self.__class__) + ": " + str(self.__dict__)


### Now let's get some tweets

In [82]:
def get_tweets(company_name, n=10):
    
    '''Takes a company name, returns a list of words in the tweets DO NOT MODIFY THIS FUNCTION '''
    
    api = create_twitter_api()
    tweets = api.search(q=company_name,lang='en', show_user=True, count=n)
    tweet_words = get_tweets_bag_of_words(tweets)
    return tweet_words

In [83]:
def create_twitter_api():
    
    '''
    Creates a twitter api client wrapper
    create_twitter_api takes no parameters but creates and returns a twitter API wrapper
    that you can use to interact with Twitter. You will copy and paste your credentials obtained in
    the first step here. 

    TODO: ERASE THEM WHEN YOU TURN IN THE ASSIGNMENT.
    '''
    
    # Variables that contains the user credentials to access Twitter API 
    ACCESS_TOKEN = ''
    ACCESS_SECRET = ''
    CONSUMER_KEY = ''
    CONSUMER_SECRET = ''
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    api = tweepy.API(auth)
    
    return api

In [130]:
def get_tweets_bag_of_words(tweets):
    
    '''
    Takes the result returned from twitter search, returns a list of words in the tweets

    get_tweet_bag_of_words takes the tweet search results and returns a list of words. You
    will need to parse the tweet search results in order to get the twitter status texts, get rid of the
    hashtag and other punctuations, and find a list of words. See what’s being returned:
    https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets

    Parameters
    ----------
    tweets : dict
        a dict that contains the result returned from the twitter api

    Returns
    -------
    words : list
        a list of words that are strings 

    '''
    
    words = []
    
    tmp = []
    for tweet in tweets:
        tmp += tweet._json['text'].split(" ")
    
    for word in tmp:
        if word.isalpha() and not 'http' in word:
            words.append(word)
    return words

### Now let's fetch some articles about this company

In [131]:
def get_article(company_name, n=3):
    
    '''
    DO NOT MODIFY THIS FUNCTION
    
    Take a company name, returns a list of words in articles associated with the company.'''
    
    results = get_search_results(company_name)
    articles = get_articles_from_search_results(results,n)
    articles_bag_of_words = get_articles_bag_of_words(articles)
    
    return articles_bag_of_words

In [132]:
def get_search_results(company_name):
    
    '''
    Return an html with search results for a given company.
    get_search_results takes the name of the company and returns the search result page of USA
    News Today of the company. Below are some search result links for different terms.
    
    In technical terms: observe the pattern, recreate your own search link, and use fetch
    function we imported to get the html content.
    https://www.usatoday.com/search/facebook/
    
    Parameters
    ----------
    company_name : string
        the company name in the form of a string

    Returns
    -------
    result : string
        string that represents the html result
    
    '''
    url = "https://www.usatoday.com/search/" + company_name + "/"
    result = utilities.fetch(url)

    return result

In [133]:
def get_articles_from_search_results(results_html, n):
    
    '''
    Return a list of n article htmls for given search results html.

    get_articles_from_search_results takes the search result page, get all the links to the news
    articles, and opens n links (default to 3 articles: for testing we suggest you use a small number
    because you will get your results faster and you are not aggressively scraping content and
    risking getting banned). Not all links returned from the page will be links to articles, so your
    function should ensure the links selected are in fact articles from the company search. In
    technical terms: find all the links to articles on the page (<a> using BeautifulSoup), and
    use fetch function we imported to get the html content of each link, store the article
    htmls, return a list of html.

    Parameters
    ----------
    results_html : string
        the html of a search result

    n : int
        the number of links we are going to extract from the results_html
        
    Returns
    -------
    articles : list
        a list of strings that represents the html result from links we've found 
        
    '''
    
    domain = 'http://www.usatoday.com'
    articles = []
    
    soup = BeautifulSoup(results_html)
    links = []
    for link in soup.find_all('a', href=True):
        if link['href'][0:7] == '/story/':
            links.append(link['href'])
    
    for idx, link in enumerate(links):
        if idx + 1 > n:
            break
        else:
            articles.append(utilities.fetch(domain+link))
    
    # excludes any video, audio results
    # search result links are relevant links, convert them to absolute links by adding the domain

    return articles

In [134]:
def get_articles_bag_of_words(articles):
    
    '''
    Return a list of words from a list of article htmls.

    get_articles_bag_of_words(articles) takes the html from the articles and return a list of words. In
    technical terms: for each article, find all the texts on the page (<p> using BeautifulSoup),
    extract words, clean words, return a list of words.
    
    Parameters
    ----------
    articles : list
        a list of strings that represents the html results
        
    Returns
    -------
    bag_of_words : list
        a list of strings that represents all of cleaned words we've found from the html results    
    
    '''
    
    bag_of_words = []
    for article in articles:
        # extract words from html and append to bag of words here
        soup = BeautifulSoup(article)
        for idx, para in enumerate(soup.find_all('p')):
            #             print(para.text)
            #             print(para.text.split(" "))
            for word in para.text.split(" "):
                if word[0:4] == 'http':
                    continue
                if word[-2:-1] == ':':
                    continue
                if word.isalpha():
                    if not 'http' in word:
                        bag_of_words.append(word)

    return bag_of_words

In [135]:
def get_sentiment(words):
    
    '''
    DO NOT MODIFY THIS FUNCTION
    
    Takes a list of words and returns a sentimental score. 
    
    '''
    
    analyzer = SentimentIntensityAnalyzer()
    total_compound = 0
    
    for w in words:
        score = analyzer.polarity_scores(w)
        total_compound += score['compound']
        
    return total_compound

In [136]:
def get_decision(tweets_sentiments, articles_sentiments):
    
    '''
    DO NOT MODIFY THIS FUNCTION
    
    Takes two sentimental scores from tweets and articles and return a final financial decision. 
    Invest if total sentiment is positive. 
    
    '''
    
    return (tweets_sentiments + articles_sentiments > 0)

### Results

In [137]:
company_name = 'amazon'
company = Company(company_name)

In [138]:
company.tweets = get_tweets(company_name, 200)

In [139]:
company.articles = get_article(company_name, 3)

Retrieving from cache: https://www.usatoday.com/search/amazon/
Retrieving from cache: http://www.usatoday.com/story/news/2013/01/09/corrections-clarifications/1821023/
Retrieving from cache: http://www.usatoday.com/story/news/2013/01/09/corrections-clarifications/1821023/
Retrieving from cache: http://www.usatoday.com/story/tech/talkingtech/2017/11/09/netflix-amazon-and-hulu-remain-top-3-subscription-streaming-services-hbo-moves-up/847440001/




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


In [140]:
company.tweets_sentiments = get_sentiment(company.tweets)

In [141]:
company.articles_sentiments = get_sentiment(company.articles)

In [142]:
company.investment = get_decision(company.tweets_sentiments, company.articles_sentiments)

In [143]:
print(company)

