In [1]:
# import the necessary modules.

import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import words, stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from sklearn.metrics import mean_absolute_error

# name the sets of words I will be using for NLP.

stop_words = set(stopwords.words("english"))
english_words = set(words.words())

# name the lemmatizing function.

lemmatizer = WordNetLemmatizer()

In [2]:
# Here we create a function, 'text_preprocessor', for which you input a tweet (or other piece of text)
# and it produces a list of words and their respective frequencies of occurance in the text.

def text_preprocessor(text):
    
    # tokenize the text into words and change all letters to lowercase.
    
    lowercase_text = text.lower()
    tokenized_text = word_tokenize(lowercase_text)

    # remove stopwords if members of the 'stopwords' corpus.
    
    filtered_text = []

    for w in tokenized_text:
        if w not in stop_words:
            filtered_text.append(w)
        
    # lemmatize each word.
    
    lemmatized_text = []

    for w in filtered_text:
        lemmatized_text.append(lemmatizer.lemmatize(w))
    
    # remove non-words if not members of the 'words' corpus.
    
    filtered_english_text = []

    for w in lemmatized_text:
        if w in english_words:
            filtered_english_text.append(w)
            
    # count the frequency of each word.
    
    counted_text = Counter(filtered_english_text)
    
    return counted_text

In [3]:
# My Naive Bayes model has six parameters. The following function, 'model_trainer' has the training data 
# as its input and outputs these six values. Firstly, I define the function 'score_counter' which I will use
# to count the number of positive and negative pieces of text in the training data.

def score_counter(score, vector):
    
    return len([r for r in vector if r == score])


# define the 'model_trainer' function. 
# This function uses both the predefined 'text_preprocessor' and 'score_counter' functions.


def model_trainer(training_data):
    
    # seperate the rows into positive and negative sentiment.
    
    positive_rows = training_data.loc[training_data['Sentiment'] == 1]
    negative_rows = training_data.loc[training_data['Sentiment'] == 0]
    
    # consider only the text columns and reindex both dataframes.
    
    negative_text = negative_rows['SentimentText']
    indexed_negative_text = negative_text.reset_index(drop=True)
    
    positive_text = positive_rows['SentimentText']
    indexed_positive_text = positive_text.reset_index(drop=True)
    
    # create a vector of all the sentiment polarities in the training data.
    
    data_polarities = training_data['Sentiment'].tolist()
    
    # using the 'score_counter' function count the number of positive and negative entries.

    positive_count = score_counter(1, data_polarities)
    negative_count = score_counter(0, data_polarities)

    # compile all positive and negative text into single strings for counting.
    
    complete_positive_text = ""
    complete_negative_text = ""

    for i in range(0, positive_count):
        complete_positive_text = complete_positive_text + " " + indexed_positive_text[i]

    for i in range(0, negative_count):
        complete_negative_text = complete_negative_text + " " + indexed_negative_text[i]
    
    # apply our text preprocessor to the aggregated pieces of text to give total word counts.
    
    positive_counts = text_preprocessor(complete_positive_text)
    negative_counts = text_preprocessor(complete_negative_text)
    
    # calculate the Naive probabilities of positive and negative sentiments respectively.
    
    positive_prob = positive_count / len(data_polarities)
    negative_prob = negative_count / len(data_polarities)
    
    # create a vector of parameters to output.
    
    parameters = [positive_counts, positive_prob, positive_count, negative_counts, negative_prob, negative_count]
    
    return parameters

In [4]:
# Here we define a function that inputs the text and three of the parameters (which all relate to either the
# positive or negative class). It ouputs a number proportional to the probability that the text lies in that class.
# This function uses the predefined 'text_preprocessor' function.

def class_prob_predictor(text, class_counts, class_prob, class_count):
    
    # set the initial prediciton to be 1.
    
    prediction = 1

    # preprocess the input text.

    text_counts = text_preprocessor(text)

    # we now correct our prediction using the Naive Bayes algorithm.
    
    # For each word in the text, we take the number of times that word occured in our training data for the 
    # given class, add 1 to 'smooth' the value, and then divide by the total number of words in that class 
    # (plus the class_count to smooth the denominator).
    
    # Smoothing is necessary to ensure that we don't multiply the prediction by 0 if the word doesn't exist 
    # in our training data. Smoothing the denominator counts ensures that we keep things even.
    
    for word in text_counts:
    
        prediction *=  text_counts.get(word) * ((class_counts.get(word, 0) + 1) / (sum(class_counts.values()) + class_count))
        
  # multiply by the probability of the class existing in the training_data.
    
    relative_prob_of_class = prediction * class_prob
    
    return relative_prob_of_class

In [5]:
# Define a function 'text_classifier' in which you input a piece of text and the parameters of the model
# and it outputs the sentiment polarity of that piece of text, based on the model.
# This function uses the predefined 'class_prob_predictor' function.

def text_classifier(text, parameters):
    
    # calculate relative probabilies of the class of the text.
    
    relative_prob_positive = class_prob_predictor(text, parameters[0], parameters[1], parameters[2])
    relative_prob_negative = class_prob_predictor(text, parameters[3], parameters[4], parameters[5])
    
    # calculate normalising constant as the sum of these 'relative' probabilities.
    
    normalising_constant = relative_prob_positive + relative_prob_negative
    
    # normalise the values to give actual probabilities.
    if normalising_constant > 0:
        
        normalised_prob_positive = relative_prob_positive / normalising_constant
        normalised_prob_negative = relative_prob_negative / normalising_constant
    
    # return the polarity, defined to be the probability that the text is positive.
    
        polarity = normalised_prob_positive
    
        return polarity
    
    else:
        return relative_prob_positive
    
# this 'if' statement was added to avoid dividing by zero which gave an error when using some of the tweet data.

In [6]:
# Define a function 'binary_text_classifier' in which you input a piece of text and the parameters of the model
# and it outputs a 0 for positive and a 1 for negative. This will be used to test the model's accuracy.
# This function uses the predefined 'text_classifier' function.

def binary_text_classifier(text, parameters):
    
    polarity = text_classifier(text, parameters)
    
    if polarity >= 0.5:
        return 1
    
    else:
        return 0

In [7]:
# import the data.

csv_file = 'train.csv'
import_data = pd.read_csv(csv_file, encoding='latin-1')
sentiment_data = import_data[['Sentiment', 'SentimentText']].copy()

# split the data into training and testing data.

training_data = sentiment_data[0:74999]
testing_data = sentiment_data[75000:75499].reset_index(drop=True)

In [8]:
# build the model with my model trainer function and the training data.
# The 'my_model' function uses the predefined 'text_classifier' function.

parameters = model_trainer(training_data)

def my_model(text):
    polarity = text_classifier(text, parameters)
    return polarity

In [9]:
# put an example sentence into the model to test it.

my_model("#bitcoin is looking GREAT right now - Buy! Buy! Buy!")

0.8721350768536257

In [10]:
# Define a function which tests the quality of the model. It takes as input the parameters and testing_data
# and returns the success rate of the model as a percentage.
# This function uses the predefined 'binary_text_classifier'.

def model_tester(parameters, test_data):
    
    pd.options.mode.chained_assignment = None
    
    # create a new column to tabulate the predictions.
    
    testing_data_copy = test_data.copy()
    testing_data_copy['Prediction'] = test_data['Sentiment'].copy()
    
    # for each row, calculate the polarity of the text based on my model.

    range_max = len(testing_data_copy.Prediction)

    for i in range(0,range_max):
        testing_data_copy['Prediction'][i] = binary_text_classifier(test_data['SentimentText'][i], parameters)
    
    # calculate the success rate of the model and print.
    
    success_rate = 100*(1 - mean_absolute_error(testing_data_copy['Sentiment'], testing_data_copy['Prediction']))
    print("success rate:", success_rate, "%")
    
    return success_rate

In [11]:
# testing the quality of the model.

model_tester(parameters, testing_data)

# change this so it is cross validated.

success rate: 74.74949899799599 %


74.74949899799599

In [12]:
# import our data file for the tweets.

csv_file = 'bitcoin.csv'

# read the csv file as a pandas dataframe.

twitter_data = pd.read_csv(csv_file)

# specify the columns to use.

twitter_data2 = twitter_data[['timestamp', 'text', 'retweets', 'favorites']].copy()

In [13]:
# import the textblob module for Natural Language Processing (to obtain the subjectivity scores).

from textblob import TextBlob

# create 'polarity' and 'subjectivity' columns using textblob to analyse the text.
# 'polarity' measures the positivity or negativity of the text, on a scale from -1 to 1.
# 'subjectivity' measures how factual the text appears to be, on a scale from 0 to 1.

twitter_data2['polarity']=twitter_data2['text'].apply(lambda tweet: my_model(tweet))
twitter_data2['subjectivity']=twitter_data2['text'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)

# this has now output both the polarity and subjectivity of each tweet into additional columns.

In [14]:
# drop the 'text' column as we have retrieved the sentiment from it.

twitter_data3 = twitter_data2.drop('text', axis=1)

# replace the column names with clearer titles.

twitter_data3.columns = ['Date', 'Retweets', 'Favourites', 'Polarity', 'Subjectivity']

In [15]:
# the data gives the dates and times in an unusable format.
# convert the timestamps into DateTime format.

twitter_data3['Date'] =  pd.to_datetime(twitter_data3['Date'], errors='coerce')

In [16]:
# remove any DateTimes which are null.

twitter_data4 = twitter_data3[twitter_data3.Date.notnull()] 

# use the 'resample' function to replace the DateTimes with Dates.
# this reduces the number of rows and replaces the values in each column with daily averages.

twitter_data5 = twitter_data4.resample('d', on='Date').mean().dropna(how='all')

# rename the columns as "Average Polarity" and "Average Subjectivity".

twitter_data5.columns = ['Av. Polarity', 'Av. Subjectivity']

In [17]:
# import and read our data file for the daily BTC price.
# this data includes Price, Open, Daily High, Daily Low, Volume and Percentage Change, but we will use
# the % change as our y variable for predictions.

csv_file2 = 'BTC_USD Bitfinex Historical Data-2.csv'
btc_data = pd.read_csv(csv_file2)

# rename variables, restricting to only necessary columns.

btc_data2 = btc_data[['Date', 'Price', 'Vol.', 'Change %']].copy()

# put the 'Date' column into DateTime format.

btc_data2['Date'] =  pd.to_datetime(btc_data2['Date'], errors='coerce')

In [18]:
# create our final DataFrame by merging the two DataFrames and matching based on Date.
# this is now possible as both use the DateTime format.
# We also drop rows with null values (as these arrise from errors in the way the data was recorded).

merged_data = pd.merge(twitter_data5, btc_data2, how='outer', on='Date')
merged_data2 = merged_data.dropna(how='any')

In [19]:
# before we can use SciKit Learn's data science tools, the data needs to be readable numerically
# by these models, meaning there cannot be commas, 'k's to represent thousands, or percentage signs.

final_data = merged_data2.copy()
final_data['Price'] = merged_data2['Price'].str.replace(',', '')
final_data['Vol.'] = merged_data2['Vol.'].str.replace('K', '')
final_data['Change %'] = merged_data2['Change %'].str.replace('%', '')

# 'final_data' now contains our fully processed data ready for training the model.
print(final_data)

         Date  Av. Polarity  Av. Subjectivity   Price    Vol. Change %
6  2018-03-13      0.530616          0.187391  9135.0   61.54     0.15
7  2018-03-14      0.537747          0.191272  8186.6   78.81   -10.38
8  2018-03-15      0.541601          0.222260  8252.9   82.58     0.81
9  2018-03-16      0.515665          0.180636  8251.0   56.34    -0.02
10 2018-03-17      0.502545          0.202575  7851.0   48.97    -4.85
11 2018-03-18      0.549517          0.203636  8200.2   88.92     4.45
12 2018-03-19      0.529867          0.217948  8600.1   73.28     4.88
13 2018-03-20      0.519701          0.216737  8899.7   54.28     3.48
14 2018-03-21      0.530689          0.199832  8900.1   42.86     0.00
15 2018-03-22      0.534268          0.222412  8706.4   54.42    -2.18
16 2018-03-23      0.511127          0.230296  8908.0   44.91     2.32
17 2018-03-24      0.526595          0.229707  8535.0   44.21    -4.19
18 2018-03-25      0.525538          0.216278  8445.1   30.69    -1.05
19 201

In [20]:
# create a graph to show sentiment over time alongside BTC Price over time

import plotly
plotly.tools.set_credentials_file(username='jpetersgill', api_key='8DmzeEcfwFaMHtUGHaSB')

import plotly.plotly as py
import plotly.graph_objs as go

x_variable1 = final_data['Date']
y_variable1 = final_data['Av. Polarity']

# Create the first trace
trace1 = go.Scatter(
    x = x_variable1,
    y = y_variable1,
    name = 'Twitter Sentiment'
)

# create the second trace

x_variable2 = final_data['Date']
y_variable2 = final_data['Price']


trace2 = go.Scatter(
    x = x_variable2,
    y = y_variable2,
    name = 'BTC Price',
    yaxis = 'y2'
)

data = [trace1, trace2]


layout = go.Layout(
    title='BTC Price against Daily Average Twitter Sentiment Polarity',
    yaxis=dict(
        title='Sentiment Polarity'
    ),
    yaxis2=dict(
        title='Value (USD)',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='multiple-axes-double')

In [21]:
# insert some correlation analysis here.

In [22]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

y = final_data['Change %']
X1 = final_data.drop('Change %', axis=1)
X2 = X1.drop('Date', axis=1)
X3 = X2.drop('Price', axis=1)
X = X3.drop('Vol.', axis=1)
train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 0)

In [23]:
btc_model = DecisionTreeRegressor()

# Fit model

btc_model.fit(train_X, train_y)

# get predicted prices on validation data and output the MAE

val_predictions = btc_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

3.332727272727273


In [24]:
# define a function 'get_mae' which inputs the number of leaf nodes
# and the training and test data, and outputs Mean Average Error in applying
# our model

def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(predictors_train, targ_train)
    preds_val = model.predict(predictors_val)
    mae = mean_absolute_error(targ_val, preds_val)
    return(mae)

# compare MAE with differing values of max_leaf_nodes

for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))
    
# import the Random Forest Regressor package

from sklearn.ensemble import RandomForestRegressor

# apply the random forest model and print the MAE

forest_model = RandomForestRegressor()
forest_model.fit(train_X, train_y)
btc_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, btc_preds))

# we see that our best result is obtained using the random forest
# regressor model

Max leaf nodes: 5  		 Mean Absolute Error:  2
Max leaf nodes: 50  		 Mean Absolute Error:  3
Max leaf nodes: 500  		 Mean Absolute Error:  3
Max leaf nodes: 5000  		 Mean Absolute Error:  3
2.2796363636363637


In [25]:
# add XGBoost model

In [26]:
# define the necessary keys for the Twitter API.

API_key = "1JqfUbxuD2NV9uUlID6Ohd1gg"
API_secret_key = "MeMGtg42Sdh1IS4Ai2PM404qDtbic2vYKJWQARgSstprkQCTj3"
access_token = "1225757852-QdwVGzucPhzGDJEwTiBlIIWK0BgTtcqKNYDXKuc"
access_token_secret = "xhAgF1dBPs1cz7TY8VlpWwc4PEE0qrhsHQ8q0Z22dR2ML"

In [27]:
# import tweepy module and authenticate with access token.

import tweepy
from tweepy import OAuthHandler
from tweepy import API
from tweepy import Cursor

auth = tweepy.OAuthHandler(API_key, API_secret_key)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [28]:
def tweets_to_data_frame(tweets):
    
    df = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Text'])
        
    df['Timestamp'] = np.array([tweet.created_at for tweet in tweets])
    df['No. Likes'] = np.array([tweet.favorite_count for tweet in tweets])
    df['No. Retweets'] = np.array([tweet.retweet_count for tweet in tweets])
    df['Sentiment Polarity'] = np.array([my_model(tweet.text) for tweet in tweets])
    df['Subjectivity'] = np.array([TextBlob(tweet.text).sentiment.subjectivity for tweet in tweets])
    
        
        
    return df


def tweet_fetcher(key_word, num_of_tweets):
    fetched_tweets = []
    for tweet in tweepy.Cursor(api.search, q=key_word+'-filter:retweets', lang='en', rpp=100).items(num_of_tweets):
        
        fetched_tweets.append(tweet)
        pass
    
    df = tweets_to_data_frame(fetched_tweets)
    return df

In [29]:
sample = tweet_fetcher('bitcoin', 100) #this number can be changed at any point!!!

In [30]:
condensed_sample1 = sample.drop('Text', axis=1)
condensed_sample2 = condensed_sample1.drop('Timestamp', axis=1)
condensed_sample3 = condensed_sample2.drop('No. Likes', axis=1)
condensed_sample = condensed_sample3.drop('No. Retweets', axis=1)
mean_pol = condensed_sample['Sentiment Polarity'].mean(axis=0)
mean_subj = condensed_sample['Subjectivity'].mean(axis=0)
d = {'Av. Polarity': [mean_pol], 'Av. Subjectivity': [mean_subj]}
input_data = pd.DataFrame(data=d)
print(input_data)

   Av. Polarity  Av. Subjectivity
0      0.431897          0.328242


In [31]:
my_btc_percentage_change_pred = forest_model.predict(input_data)
print(my_btc_percentage_change_pred)

[-4.791]
