In [1]:
# Dependencies
import tweepy
import json
import numpy as np
from config import consumer_key, consumer_secret, access_token, access_token_secret
from datetime import datetime
import pandas as pd
from nltk.tokenize import word_tokenize
import re

In [2]:
# Import and initialize the VADER sentiment analyzer.
# One module-level instance is created here; the tweet-collection cell calls
# analyzer.polarity_scores(...) on it for every accepted tweet.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [3]:
# Twitter API Keys
# NOTE(review): these four names are already bound by the
# `from config import ...` line in the imports cell. Each assignment below
# rebinds a name to itself, so this cell has no effect on program state.
consumer_key = consumer_key
consumer_secret = consumer_secret
access_token = access_token
access_token_secret = access_token_secret

In [4]:
# Setup Tweepy API Authentication
# The OAuth handler is built from the consumer credentials and then given the
# access-token pair.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# A JSONParser is passed as the response parser; later cells index the search
# result like a dict (public_tweets["statuses"]).
# wait_on_rate_limit=True asks tweepy to wait when the rate limit is reached
# instead of failing the request (see the tweepy API documentation).
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser(), wait_on_rate_limit=True)

In [5]:
# Per-tweet VADER scores; each list gets one entry per accepted tweet
compound_list, positive_list, negative_list, neutral_list = [], [], [], []

# Per-tweet metadata, appended to in step with the score lists above
tweet_text, tweet_times, unique_ids = [], [], []
tweet_user, tweet_handle, tweet_followers = [], [], []
tweet_language, tweet_search_keyword = [], []

# "Real Person" filters: bounds on account activity/size, required account
# language, and screen names that are always skipped
min_tweets, max_tweets = 5, 10000
max_followers, max_following = 2500, 2500
lang = "en"
ignore_user = [
    'melaniebower89a',
    'oliverpaige625',
    'victorblake392',
]


In [6]:
# Load the ticker table (presumably the S&P 100 constituents, going by the
# file name). The CSV must provide "Name" and "Symbol" columns.
top_stocks = pd.read_csv("SP_100_Tickers.csv")

# Company names and ticker symbols as plain Python lists.
# NOTE(review): stock_keywords_symbol is not used by any cell visible here.
stock_keywords_description = top_stocks["Name"].tolist()
stock_keywords_symbol = top_stocks["Symbol"].tolist()

# Search by company name. This binds the same list object, not a copy.
stock_keywords = stock_keywords_description


In [7]:
# Search Twitter once per keyword, keep the tweets that pass the "Real Person"
# filters, and record each accepted tweet's metadata and VADER scores in the
# module-level lists.

# Set mirror of unique_ids so the duplicate check is a hash lookup instead of a
# list scan. Seeding it from the list keeps a re-run of this cell from
# re-adding tweets that were collected earlier.
seen_ids = set(unique_ids)

for keyword in stock_keywords:

    # One search request per keyword, asking for count=100 results
    public_tweets = api.search(keyword, count=100)

    for tweet in public_tweets["statuses"]:
        user = tweet["user"]

        # Account-level and tweet-level filters; the conditions are evaluated
        # in this order and short-circuit on the first failure.
        # NOTE(review): the language test reads the *account's* lang field, not
        # the tweet's, so non-English tweets from "en" accounts still pass.
        is_real_person = (
            user["followers_count"] < max_followers and
            user["statuses_count"] > min_tweets and
            user["statuses_count"] < max_tweets and
            user["friends_count"] < max_following and
            user["lang"] == lang and not tweet['retweeted'] and
            'RT @' not in tweet['text'] and 'Binance' not in tweet['text'] and
            user["screen_name"] not in ignore_user)
        if not is_real_person:
            continue

        # A tweet can be returned for several keywords; only its first
        # occurrence is stored.
        tweet_id = tweet["id"]
        if tweet_id in seen_ids:
            continue
        seen_ids.add(tweet_id)
        unique_ids.append(tweet_id)

        tweet_search_keyword.append(keyword)
        tweet_text.append(tweet["text"])
        tweet_user.append(user["name"])
        tweet_handle.append(user["screen_name"])
        tweet_followers.append(user["followers_count"])
        tweet_language.append(user["lang"])
        tweet_times.append(tweet["created_at"])

        # Score the text once and read all four values from the same result
        scores = analyzer.polarity_scores(tweet["text"])
        compound_list.append(scores["compound"])
        positive_list.append(scores["pos"])
        neutral_list.append(scores["neu"])
        negative_list.append(scores["neg"])

# Parse the raw created_at strings once, after collection has finished. Doing
# it here (rather than inside the loop) avoids re-parsing every stored
# timestamp for each accepted tweet, and guarantees tweet_time_objects exists
# even when no tweet passed the filters.
tweet_time_objects = [
    datetime.strptime(stamp, "%a %b %d %H:%M:%S %z %Y")
    for stamp in tweet_times
]



In [8]:
# Check that all per-tweet lists have the same length before they are
# combined into a single DataFrame.
for collected in (tweet_search_keyword, tweet_user, tweet_handle,
                  tweet_followers, tweet_language, unique_ids):
    print(len(collected))
print("---------")
for collected in (tweet_text, tweet_time_objects, compound_list,
                  positive_list, negative_list, neutral_list):
    print(len(collected))

1416
1416
1416
1416
1416
1416
---------
1416
1416
1416
1416
1416
1416


In [9]:
# Assemble the collected lists into one table, one row per accepted tweet.
# Keys become the column names.
tweet_columns = {
    "Search Term": tweet_search_keyword,
    'Name': tweet_user,
    'Handle': tweet_handle,
    'Followers': tweet_followers,
    'Language': tweet_language,
    'Id': unique_ids,
    'Text': tweet_text,
    'Time Stamp': tweet_time_objects,
    'Compound': compound_list,
    'positive': positive_list,
    'negative': negative_list,
    'neutral': neutral_list,
}
tweet_data = pd.DataFrame(tweet_columns)


In [None]:
# Derive a calendar-date column from each tweet's own timestamp.
# The previous row loop assigned a single date to the whole "Date" column on
# every iteration, so all rows ended up with the last tweet's date. Computing
# the column in one step gives each row its own date.
# utc=True normalizes the timezone-aware timestamps before .dt.date is taken.
tweet_data["Date"] = pd.to_datetime(tweet_data["Time Stamp"], utc=True).dt.date


In [10]:
# Mean of each VADER score over all collected tweets, keyed by score name
sentiment = {
    label: np.mean(scores)
    for label, scores in (
        ("Compound", compound_list),
        ("Positive", positive_list),
        ("Neutral", neutral_list),
        ("Negative", negative_list),
    )
}



In [11]:
# Tweet-aware tokenizer. The patterns below are tried in list order at each
# position, so the more specific ones (emoticons, mentions, hashtags, URLs)
# win over the generic word / single-character fallbacks.

# Written for re.VERBOSE: whitespace and "#" comments inside the pattern are
# ignored by the regex engine.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth (the class listed "]" twice and never "[")
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    # URLs; the class previously held the HTML-escaped text "&amp;" instead of "&"
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# One alternation wrapped in a single capturing group, so findall() returns a
# flat list of matched strings.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Anchored variant used to test whether a whole token is an emoticon
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)


def tokenize(s):
    """Split a string into tokens using tokens_re.

    Args:
        s: Text to tokenize.

    Returns:
        List of token strings in order of appearance. Whitespace is dropped;
        every other character ends up in some token.
    """
    return tokens_re.findall(s)


def preprocess(s, lowercase=False):
    """Tokenize a string, optionally lower-casing the tokens.

    Args:
        s: Text to tokenize.
        lowercase: When True, every token except emoticons is lower-cased
            (emoticons keep their case so ":D" does not become ":d").

    Returns:
        List of token strings.
    """
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

# Smoke test of the tokenizer on a sample tweet.
# NOTE(review): this rebinds the notebook-level name `tweet`, which the
# tweet-collection loop also uses as its loop variable.
tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
print(preprocess(tweet))
# Expected output:
# ['RT', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#NLP']


['RT', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#NLP']


In [12]:
# Tokenize every tweet's text and pair the token list with the tweet's Id.
# Both lists are read straight from the columns, so they stay in row order and
# aligned with each other without a row-by-row iterrows() loop.
tweet_text_id = tweet_data["Id"].tolist()
tweet_text_breakdown = [preprocess(text) for text in tweet_data["Text"]]

text_breakdown_df = pd.DataFrame({'Id': tweet_text_id,
                                  'Text Breakdown': tweet_text_breakdown})
   

In [13]:
# Attach each tweet's token list to the tweet table, matching rows on "Id".
# how="left" keeps every row of tweet_data.
tweet_data_all = pd.merge(tweet_data, text_breakdown_df[["Id", "Text Breakdown"]], on="Id", how="left")

# Save the merged table. The file is named for tweet_data_all, but the
# un-merged tweet_data was being written, so the "Text Breakdown" column was
# missing from the CSV.
tweet_data_all.to_csv("tweet_data_all.csv")

tweet_data_all.head(5)

Unnamed: 0,Compound,Followers,Handle,Id,Language,Name,Search Term,Text,Time Stamp,negative,neutral,positive,Text Breakdown
0,0.0,507,imnotbegging,1011368146249363456,en,just aj:(,Apple Inc.,"@lizerdgurl PayPal: Mobile Cash by PayPal, Inc...",2018-06-25 21:58:51+00:00,0.0,1.0,0.0,"[@lizerdgurl, PayPal, :, Mobile, Cash, by, Pay..."
1,0.0,58,DonghanTurkey,1011366457563930627,en,Kim Donghan Turkey,Apple Inc.,₍🏆250618₎ ┊THE SHOW oylaması yarın yayınlanıyo...,2018-06-25 21:52:08+00:00,0.0,1.0,0.0,"[₍, 🏆, 250618, ₎, ┊, THE, SHOW, oylaması, yarı..."
2,0.0,80,AppStoreNews24,1011363668716486656,en,App Store News,Apple Inc.,eBay: Discover Summer Deals - eBay Inc. https:...,2018-06-25 21:41:03+00:00,0.0,1.0,0.0,"[eBay, :, Discover, Summer, Deals, -, eBay, In..."
3,0.5267,337,lb_SFO,1011362391613833216,en,DCAFlyer,Apple Inc.,"El CEO de Apple, Tim Cook, marcho ayer en el S...",2018-06-25 21:35:59+00:00,0.079,0.719,0.201,"[El, CEO, de, Apple, ,, Tim, Cook, ,, marcho, ..."
4,-0.2484,1843,CSB,1011359076058574848,en,Comic Strip Blogger,Apple Inc.,My new #cartoon :\n\nSiri and HomePod - the mo...,2018-06-25 21:22:48+00:00,0.158,0.727,0.115,"[My, new, #cartoon, :, Siri, and, HomePod, -, ..."
