In [None]:
# Dependencies
import tweepy
import json
import numpy as np
from config import consumer_key, consumer_secret, access_token, access_token_secret
from datetime import datetime
import pandas as pd
from nltk.tokenize import word_tokenize
import re

In [None]:
# Import and Initialize Sentiment Analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# One shared analyzer instance; the collection loop below calls
# analyzer.polarity_scores(...) on each tweet's text.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Twitter API Keys
# These four names are already bound by the `from config import ...` line in
# the imports cell; this cell simply re-binds each name to its own value.
consumer_key, consumer_secret, access_token, access_token_secret = (
    consumer_key,
    consumer_secret,
    access_token,
    access_token_secret,
)

In [None]:
# Setup Tweepy API Authentication
# Build the OAuth handler from the consumer credentials, then attach the
# user-level access token pair.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Use tweepy's JSON parser; later cells index the search result by key
# (e.g. public_tweets["statuses"]).
json_parser = tweepy.parsers.JSONParser()
api = tweepy.API(auth, parser=json_parser)

In [None]:
# Parallel accumulators: position i in every list describes the same tweet.

# VADER sentiment components
compound_list, positive_list, negative_list, neutral_list = [], [], [], []

# Tweet content, timing and identity
tweet_text, tweet_times, tweet_id, time_diffs = [], [], [], []

# Author details
tweet_user, tweet_handle, tweet_followers, tweet_language = [], [], [], []

In [None]:
# Read the stock list CSV and use its "Description" column as the Twitter
# search terms.
stocks_csv = "NYSE_20180622_top50.csv"
top_stocks = pd.read_csv(stocks_csv)
target_terms = top_stocks.loc[:, "Description"].tolist()


In [None]:
time_between_tweets = []
tweet_searchterm = []

# For every search term, request 100 English tweets and append each tweet's
# metadata and VADER scores to the parallel lists defined in an earlier cell.
# NOTE: the lists are appended to, never reset, so re-running this cell without
# re-running the list-initialisation cell duplicates every row.
for term in target_terms:

    public_tweets = api.search(term, count=100, lang="en")
    statuses = public_tweets["statuses"]

    # One progress line per term (instead of two printed lines per tweet)
    print("{}: {} tweets".format(term, len(statuses)))

    # Loop through all tweets
    for tweet in statuses:
        tweet_searchterm.append(term)
        tweet_text.append(tweet["text"])
        tweet_id.append(tweet["id"])
        tweet_user.append(tweet["user"]["name"])
        tweet_handle.append(tweet["user"]["screen_name"])
        tweet_followers.append(tweet["user"]["followers_count"])
        tweet_language.append(tweet["user"]["lang"])
        tweet_times.append(tweet["created_at"])

        # Run Vader Analysis once per tweet and reuse the result for all four
        # components (previously the same text was scored four times).
        scores = analyzer.polarity_scores(tweet["text"])
        compound_list.append(scores["compound"])
        positive_list.append(scores["pos"])
        neutral_list.append(scores["neu"])
        negative_list.append(scores["neg"])

# Parse every timestamp once, after collection has finished. Doing this inside
# the per-term loop re-parsed all earlier tweets on every term and left
# tweet_time_objects undefined when there were no search terms at all.
tweet_time_objects = [
    datetime.strptime(created_at, "%a %b %d %H:%M:%S %z %Y")
    for created_at in tweet_times
]

# Seconds from each tweet to the next one in the list (current minus next).
# total_seconds() is used because timedelta.seconds ignores the days component
# and yields misleading values for negative gaps, which can occur where one
# search term's results end and the next term's begin.
time_in_between = [
    int((current - following).total_seconds())
    for current, following in zip(tweet_time_objects, tweet_time_objects[1:])
]

# The last tweet has no successor; pad with 0 so the list lines up with the
# other columns. Skip the pad when nothing was collected, otherwise this list
# would be one element longer than every other (empty) list.
if tweet_time_objects:
    time_in_between.append(0)

  

In [None]:
# Sanity check before building the DataFrame: print the length of every
# collected list, one per line, in the order listed here.
for collected in (
    tweet_searchterm,
    tweet_user,
    tweet_handle,
    tweet_followers,
    tweet_language,
    tweet_id,
    tweet_text,
    tweet_time_objects,
    time_in_between,
    compound_list,
    positive_list,
    negative_list,
    neutral_list,
):
    print(len(collected))

In [None]:
#"Search Term": term_list,

# Assemble the collected lists into one table, one row per tweet.
# Keys are the output column names; insertion order is kept as written.
tweet_columns = {
    'Name': tweet_user,
    'Handle': tweet_handle,
    'Followers': tweet_followers,
    'Language': tweet_language,
    'Id': tweet_id,
    'Text': tweet_text,
    'Time Stamp': tweet_time_objects,
    'Time Delta (seconds)': time_in_between,
    'Compound': compound_list,
    'positive': positive_list,
    'negative': negative_list,
    'neutral': neutral_list,
}
tweet_data = pd.DataFrame(tweet_columns)


In [None]:
# Add a "Date" column holding the calendar date of each row's own timestamp.
# The previous iterrows() loop assigned a single scalar to the whole column on
# every pass, so every row ended up with the date of the last tweet.
# pd.to_datetime keeps this working when the frame is empty and the column has
# not been inferred as a datetime dtype.
tweet_data["Date"] = pd.to_datetime(tweet_data["Time Stamp"]).dt.date
    


In [None]:
# Average each VADER component over all collected tweets.
sentiment = {
    label: np.mean(scores)
    for label, scores in (
        ("Compound", compound_list),
        ("Positive", positive_list),
        ("Neutral", neutral_list),
        ("Negative", negative_list),
    )
}



In [None]:
# Regex-based tweet tokenizer. The patterns are compiled with re.VERBOSE, so
# whitespace and `#` comments inside the pattern strings are ignored.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth (the class used to list `]` twice and never `[`)
    )"""
 
# Alternatives are tried in order, so the more specific patterns come first.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs (`&amp;` was an HTML-escape artifact for `&`)
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 

def tokenize(s):
    """Split the string `s` into a list of token strings using `tokens_re`.

    Whitespace is dropped; every other character ends up in some token.
    """
    return tokens_re.findall(s)
 

def preprocess(s, lowercase=False):
    """Tokenize `s`, optionally lower-casing the tokens.

    Parameters
    ----------
    s : str
        Text to tokenize.
    lowercase : bool, default False
        When True, every token is lower-cased except tokens that are entirely
        an emoticon (so ``:D`` does not turn into ``:d``).

    Returns
    -------
    list of str
        The tokens in the order they appear in `s`.
    """
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

# Quick demonstration on a sample tweet
tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
print(preprocess(tweet))
# ['RT', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#NLP']


In [None]:
# Tokenize each tweet's text and keep the tweet Id alongside, in row order.
tweet_text_breakdown = [preprocess(raw_text) for raw_text in tweet_data["Text"]]
tweet_text_id = tweet_data["Id"].tolist()

# Two-column lookup table: tweet Id -> list of tokens
breakdown_columns = {
    'Id': tweet_text_id,
    'Text Breakdown': tweet_text_breakdown,
}
text_breakdown_df = pd.DataFrame(breakdown_columns)
   

In [None]:
# Attach the token list to every tweet row.
# Keep one breakdown per Id first: if the same tweet Id appears more than once
# (for example, returned for two different search terms), merging the raw
# table would pair every duplicate with every other and multiply the rows.
breakdown_by_id = text_breakdown_df[["Id", "Text Breakdown"]].drop_duplicates(subset="Id")
tweet_data_all = pd.merge(tweet_data, breakdown_by_id, on="Id", how="left")

# Save the merged frame; previously the un-merged tweet_data was written to
# this file, so the "Text Breakdown" column never reached disk.
tweet_data_all.to_csv("tweet_data_all.csv")

tweet_data_all.head(5)