In [3]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import datetime as dt
from sklearn.preprocessing import StandardScaler
import pandas_datareader.data as web
import math
import xlsxwriter
import os
import fnmatch

  from pandas.util.testing import assert_frame_equal


In [19]:
class TweetPreprocessing:
    PATH = "Tweets/"
    def __init__(self,s):
        self.stock = s
        
    def start(self):
        self.getTweetData()
        self.preprocessingTweet()
        self.getYahooData()
        self.preprocessingYahoo()
        self.processingBoth()
        self.df_full.to_csv('Clean_Data/${}_pred.csv'.format(self.stock))
    
    def getTweetData(self):
        xls = pd.ExcelFile(self.PATH+'{}.xlsx'.format(self.stock))
        
        df = pd.read_excel(xls, header = 0,encoding='latin-1', sheet_name = "Stream")
        Tweet = df['Tweet content']
        df_results = pd.DataFrame(self.sentimentScore(Tweet))
        
        # Combining the two dataframes
        df_tweets = pd.merge(df, df_results, left_index=True, right_index=True)
        
        # Choose the common range for the dataframes to be used for all tweet data
        df_tweets = df_tweets[(df_tweets['Date'] >= '2016-03-28') & (df_tweets['Date'] <= '2016-06-15')]
        
        # Adding a datetime column
        df_tweets['datetime'] = pd.to_datetime(df_tweets['Date']) # change of Date column to datetime columns        
        # Slimming down the stream into a dataframe with only relevant columns
        self.df_tweet = df_tweets[['Favs','RTs','Followers','Following', 'Is a RT','compound','neg','neu','pos','datetime']]
        
    def preprocessingTweet(self):
        # Remove tweets were compound is zero, i.e. sentiment is neutral
        self.df_tweet = self.df_tweet[(self.df_tweet[['compound']] != 0).all(axis=1)]
        
        # Create new column with the 'compound' multiplied by nr of followers of the account
        self.df_tweet['Compound_multiplied'] = self.df_tweet['compound'] * self.df_tweet['Followers']
        
        # Remove rows where 'Followers' is NaN
        self.df_tweet.dropna(subset=['Followers'],inplace=True)
        
        # Scale the column "Compound_multiplied"
        x_1 = self.df_tweet[['Compound_multiplied']].values.astype(float)

        scaler = StandardScaler().fit(x_1)

        scaled_data = scaler.transform(x_1)

        self.df_tweet['Compound_multiplied_scaled'] = scaled_data
        
        # Create a dataframe with daily MEANS of each column
        self.df_daily_mean=(self.df_tweet.groupby(self.df_tweet.datetime).mean())
        
        # Remove weekends from df_daily_mean
        self.df_daily_mean = self.df_daily_mean[self.df_daily_mean.index.dayofweek < 5]
        
    def getYahooData(self):
        #import pandas_datareader.data as web
        start = dt.datetime(2016, 3, 28)
        end =  dt.datetime(2016, 6, 14) #dt.datetime.now()

        self.df_stock = web.DataReader(self.stock, 'yahoo', start, end)
                
    def preprocessingYahoo(self):
        # measure of volatility
        self.df_stock['HiLo_vola_stock'] = (self.df_stock['High'] - self.df_stock['Low']) / self.df_stock['Adj Close'] * 100.0
        
        # daily percent change
        self.df_stock['Pct_change_stock'] = (self.df_stock['Close'] - self.df_stock['Open']) / self.df_stock['Open'] * 100.0
        
    def processingBoth(self):
        self.df_full = pd.concat([self.df_stock[['Volume','Adj Close','HiLo_vola_stock','Pct_change_stock']],
                             self.df_daily_mean], axis=1, sort=False)
        
        # Impute missing data with their means
        self.df_full[['Favs','RTs','Followers','Following','Is a RT','compound','neg','neu','pos','Compound_multiplied','Compound_multiplied_scaled']] \
        .fillna(value=self.df_full[['Favs','RTs','Followers','Following','Is a RT','compound','neg','neu','pos','Compound_multiplied','Compound_multiplied_scaled']].mean(),inplace=True)

        # Remove missing Pct_change_stock data
        self.df_full.dropna(subset=['Pct_change_stock'],inplace=True) 
        self.df_full.dropna(subset=['Compound_multiplied_scaled'],inplace=True) 
                
        buy_or_sell = []
        for i in self.df_full['Pct_change_stock']:
            if i >= 0:
                buy_or_sell.append(1)
            elif i < 0:
                buy_or_sell.append(-1)
            else:
                buy_or_sell.append(np.nan)

        #Adds -1 or +1 to the column based on if 'Predicted_change' is negative or positive
        self.df_full['Buy/Sell'] = buy_or_sell

        # The 'Buy/Sell' values need to be shifted up one row to match the 'Predicted_change' values
        self.df_full['Buy/Sell'] = self.df_full['Buy/Sell'].shift(-1)
        
        self.df_full.drop(columns=['Pct_change_stock','compound','Compound_multiplied'],inplace = True)
        
    def sentimentScore(self, Tweet):
        analyzer = SentimentIntensityAnalyzer()
        results = []
        for sentence in Tweet:
            vs = analyzer.polarity_scores(sentence)
#             print("Vader score: " + str(vs))
            results.append(vs)
        return results
    
    

In [12]:
txt = []
for root, dirs, files in os.walk('./Tweets'):
    for _file in files:
        if fnmatch.fnmatch(_file,'*.xlsx'):
            txt.append(_file.split('.')[0])
txt

['AAL',
 'ADP',
 'CERN',
 'CSCO',
 'EA',
 'EBAY',
 'EXPE',
 'FISV',
 'TXN',
 'WDC',
 'TMUS']

In [None]:
for ticker in txt:
    temp = TweetPreprocessing(ticker)
    temp.start()

For 2020 data

In [5]:
class TweetPreprocessing:
    PATH = "Tweets/"
    def __init__(self,s):
        self.stock = s
        
    def start(self):
        self.getTweetData()
        self.preprocessingTweet()
        self.getYahooData()
        self.preprocessingYahoo()
        self.processingBoth()
        self.df_full.to_csv('${}_backtest.csv'.format(self.stock))
    
    def getTweetData(self):
        df = pd.read_csv('2020AAL.csv',encoding='latin-1')
    
        Tweet = df['Tweet content']
        df_results = pd.DataFrame(self.sentimentScore(Tweet))
        
        # Combining the two dataframes
        df_tweets = pd.merge(df, df_results, left_index=True, right_index=True)
        
        # Choose the common range for the dataframes to be used for all tweet data
#         df_tweets = df_tweets[(df_tweets['Date'] >= '2016-03-28') & (df_tweets['Date'] <= '2016-06-15')]
        
        # Adding a datetime column
        df_tweets['datetime'] = pd.to_datetime(df_tweets['Date']) # change of Date column to datetime columns        
        # Slimming down the stream into a dataframe with only relevant columns
      
        self.df_tweet = df_tweets[['Favs','RTs','Followers','Following', 'Is a RT','compound','neg','neu','pos','datetime']]
        
    def preprocessingTweet(self):
        # Remove tweets were compound is zero, i.e. sentiment is neutral
        self.df_tweet = self.df_tweet[(self.df_tweet[['compound']] != 0).all(axis=1)]
        
        # Create new column with the 'compound' multiplied by nr of followers of the account
        self.df_tweet['Compound_multiplied'] = self.df_tweet['compound'] * self.df_tweet['Followers']
        
        # Remove rows where 'Followers' is NaN
        self.df_tweet.dropna(subset=['Followers'],inplace=True)
        
        # Scale the column "Compound_multiplied"
        x_1 = self.df_tweet[['Compound_multiplied']].values.astype(float)

        scaler = StandardScaler().fit(x_1)

        scaled_data = scaler.transform(x_1)

        self.df_tweet['Compound_multiplied_scaled'] = scaled_data
        
        # Create a dataframe with daily MEANS of each column
        self.df_daily_mean=(self.df_tweet.groupby(self.df_tweet.datetime).mean())
        
        # Remove weekends from df_daily_mean
        self.df_daily_mean = self.df_daily_mean[self.df_daily_mean.index.dayofweek < 5]
        
    def getYahooData(self):
        #import pandas_datareader.data as web
        start = dt.datetime(2020, 9, 28)
        end =  dt.datetime(2020, 10, 16) #dt.datetime.now()

        self.df_stock = web.DataReader(self.stock, 'yahoo', start, end)
                
    def preprocessingYahoo(self):
        # measure of volatility
        self.df_stock['HiLo_vola_stock'] = (self.df_stock['High'] - self.df_stock['Low']) / self.df_stock['Adj Close'] * 100.0
        
        # daily percent change
        self.df_stock['Pct_change_stock'] = (self.df_stock['Close'] - self.df_stock['Open']) / self.df_stock['Open'] * 100.0
        
        
    def processingBoth(self):
        self.df_full = pd.concat([self.df_stock[['Volume','Adj Close','HiLo_vola_stock','Pct_change_stock']],
                             self.df_daily_mean], axis=1, sort=False)
        
        # Impute missing data with their means
        self.df_full[['Favs','RTs','Followers','Following','Is a RT','compound','neg','neu','pos','Compound_multiplied','Compound_multiplied_scaled']] \
        .fillna(value=self.df_full[['Favs','RTs','Followers','Following','Is a RT','compound','neg','neu','pos','Compound_multiplied','Compound_multiplied_scaled']].mean(),inplace=True)

        # Remove missing Pct_change_stock data
        self.df_full.dropna(subset=['Pct_change_stock'],inplace=True) 
        self.df_full.dropna(subset=['Compound_multiplied_scaled'],inplace=True) 
                
        buy_or_sell = []
        for i in self.df_full['Pct_change_stock']:
            if i >= 0:
                buy_or_sell.append(1)
            elif i < 0:
                buy_or_sell.append(-1)
            else:
                buy_or_sell.append(np.nan)

        #Adds -1 or +1 to the column based on if 'Predicted_change' is negative or positive
        self.df_full['Buy/Sell'] = buy_or_sell

        # The 'Buy/Sell' values need to be shifted up one row to match the 'Predicted_change' values
        self.df_full['Buy/Sell'] = self.df_full['Buy/Sell'].shift(-1)
        
        self.df_full.drop(columns=['Pct_change_stock','compound','Compound_multiplied'],inplace = True)
        
    def sentimentScore(self, Tweet):
        analyzer = SentimentIntensityAnalyzer()
        results = []
        for sentence in Tweet:
            vs = analyzer.polarity_scores(sentence)
#             print("Vader score: " + str(vs))
            results.append(vs)
        return results
    
    

In [6]:
ticker = 'AAL'
temp = TweetPreprocessing(ticker)
temp.start()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
