# I - Data preprocessing

## Setup

First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed as well as Scikit-Learn (V â‰¥0.20).

In [215]:
# Python â‰¥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn â‰¥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import pandas as pd

# For text procesing
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

analyser = SentimentIntensityAnalyzer()

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
pd.options.mode.chained_assignment = None  # default='warn'

#display all columns
pd.set_option('display.max_columns', None)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Jules\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Get the data

## From GitHub repository

In [216]:
import pandas as pd
import requests
import io
    
# Downloading the csv file from GitHub account

url1 = "https://raw.githubusercontent.com/julesz12345/Cryptocurrency-Prediction/main/Data/Cardano%20Tweets(no%20retweets%20English%20and%20US%20region)%202020.csv"

download1 = requests.get(url1).content

# Reading the downloaded content and turning it into a pandas dataframe
df_2020 = pd.read_csv(io.StringIO(download1.decode('utf-8')))

# Printing out the first 5 rows of the dataframe
df_2020.head()

Unnamed: 0,text,lang,user-location,place-country,place-country_code,location-coordinates,user-screen_name,created_at,quote_count,reply_count,retweet_count,retweeted,is_retweet,is_quote_status,Unnamed: 14
0,Happy new year everyone! Thanks for being a p...,en,"Jackson, Wyoming",,,,TetonBlocks,Thu Dec 31 23:58:47 +0000 2020,0,0,0,False,0,False,
1,Former US Congressional Candidate States Why C...,en,"New York, NY",,,,davidgokhshtein,Thu Dec 31 23:49:21 +0000 2020,6,15,47,False,0,False,
2,I wish I could understand this whale. \n\nI wa...,en,New York,,,,RealSaidov,Thu Dec 31 23:43:21 +0000 2020,0,1,0,False,0,False,
3,Cardano Price Analysis: 31 December https://t....,en,"New York, NY",,,,BTCstJournal,Thu Dec 31 23:35:17 +0000 2020,0,0,1,False,0,False,
4,Our featured article for today! Please don't f...,en,United States,,,,fintechnewshub,Thu Dec 31 23:33:55 +0000 2020,0,0,1,False,0,False,


# Data preprocessing

## 1-Tweets

In [217]:
# Some more information about the dataset
display(df_2020.shape)
display(df_2020.isnull().sum())
display(df_2020.describe())

(7885, 15)

text                       0
lang                       0
user-location            278
place-country           7820
place-country_code      7820
location-coordinates    7820
user-screen_name           0
created_at                 0
quote_count                0
reply_count                0
retweet_count              0
retweeted                  0
is_retweet                 0
is_quote_status            0
Unnamed: 14             7885
dtype: int64

Unnamed: 0,quote_count,reply_count,retweet_count,is_retweet,Unnamed: 14
count,7885.0,7885.0,7885.0,7885.0,0.0
mean,0.565631,1.162207,2.62435,0.0,
std,3.57427,6.296048,12.065562,0.0,
min,0.0,0.0,0.0,0.0,
25%,0.0,0.0,0.0,0.0,
50%,0.0,0.0,0.0,0.0,
75%,0.0,0.0,1.0,0.0,
max,106.0,262.0,305.0,0.0,


## Initial Observations - 2020

- There are 7885 rows and 15 columns.
- `user-location` column have 278 missing values.
- `place-country ` column have 7820 missing values.
- `place-country_code` column have 7820 missing values.
- `location-coordinates` column have 7820 missing values.
- `Unnamed: 14 ` column have 7885 missing values.

In [218]:
# Droping non useful columns
df_2017.drop(columns=['place-country', 'place-country_code', 'location-coordinates', 'Unnamed: 14'])
df_2018.drop(columns=['place-country', 'place-country_code', 'location-coordinates', 'Unnamed: 14'])
df_2020.drop(columns=['place-country', 'place-country_code', 'location-coordinates', 'Unnamed: 14'])

Unnamed: 0,text,lang,user-location,user-screen_name,created_at,quote_count,reply_count,retweet_count,retweeted,is_retweet,is_quote_status
0,Happy new year everyone! Thanks for being a p...,en,"Jackson, Wyoming",TetonBlocks,Thu Dec 31 23:58:47 +0000 2020,0,0,0,False,0,False
1,Former US Congressional Candidate States Why C...,en,"New York, NY",davidgokhshtein,Thu Dec 31 23:49:21 +0000 2020,6,15,47,False,0,False
2,I wish I could understand this whale. \n\nI wa...,en,New York,RealSaidov,Thu Dec 31 23:43:21 +0000 2020,0,1,0,False,0,False
3,Cardano Price Analysis: 31 December https://t....,en,"New York, NY",BTCstJournal,Thu Dec 31 23:35:17 +0000 2020,0,0,1,False,0,False
4,Our featured article for today! Please don't f...,en,United States,fintechnewshub,Thu Dec 31 23:33:55 +0000 2020,0,0,1,False,0,False
...,...,...,...,...,...,...,...,...,...,...,...
7880,#BTC #XTZ #ATOM #ETH #TOP #ADA #LTC #XLM #XRP ...,en,USA,webnowcompany,Thu Oct 01 02:26:36 +0000 2020,0,0,0,False,0,False
7881,E220 Update: \n\nIt is a close race this epoch...,en,New York,RealSaidov,Thu Oct 01 01:54:35 +0000 2020,0,0,1,False,0,False
7882,#LTC #XVG #TPAY #ADA #ETH #BTC #BCH Cardano co...,en,USA,webnowcompany,Thu Oct 01 01:44:41 +0000 2020,0,0,0,False,0,False
7883,Well....no dates in September for #goguen. ðŸ˜•ðŸ˜•\...,en,Earth,TheCryptoviser,Thu Oct 01 01:01:24 +0000 2020,0,3,0,False,0,True


In [219]:
# Text preprocesing

xl_2020 = df_2020['text'].str.lower()

df_2020['text'] = xl_2020

In [220]:
# This steps takes time
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

#2020
xl_2020 = xl_2020.tolist()
for i in range(len(xl_2020)): 
    text_tk = word_tokenize(xl_2020[i])
    s = [word for word in text_tk if not word in stopwords.words()]
    xl_2020[i] = ' '.join([str(elem) for elem in s])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jules\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [221]:
# Creating datasets for processed tweets
df_2020 = df_2020.drop(['text'], axis=1)

df_2020.to_csv('tweet_2020.csv',index=False)

## 2-Cardano

In [222]:
# Downloading the csv file from GitHub account
url2 = "https://raw.githubusercontent.com/julesz12345/Cryptocurrency-Prediction/main/Data/ADA-USD.csv"

download2 = requests.get(url2).content

# Reading the downloaded content and turning it into a pandas dataframe
df_cardano = pd.read_csv(io.StringIO(download2.decode('utf-8')))

# Printing out the first 5 rows of the dataframe
df_cardano.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-10-01,0.101202,0.104311,0.096007,0.097878,0.097878,499813017.0
1,2020-10-02,0.097894,0.099826,0.090471,0.092913,0.092913,602540074.0
2,2020-10-03,0.092913,0.095158,0.092635,0.093684,0.093684,334059453.0
3,2020-10-04,0.093684,0.096791,0.092884,0.096301,0.096301,482409858.0
4,2020-10-05,0.096301,0.098844,0.096255,0.097687,0.097687,432817916.0


In [223]:
df_cardano.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       92 non-null     object 
 1   Open       89 non-null     float64
 2   High       89 non-null     float64
 3   Low        89 non-null     float64
 4   Close      89 non-null     float64
 5   Adj Close  89 non-null     float64
 6   Volume     89 non-null     float64
dtypes: float64(6), object(1)
memory usage: 5.2+ KB


## Initial Observations - Cardano

- There are 92 rows and 7 columns.
- `Open` column have 3 missing values.
- `High` column have 3 missing values.
- `Low` column have 3 missing values.
- `Close` column have 3 missing values.
- `Adj Close` column have 3 missing values.
- `Volume` column have 3 missing values.


# Extracting Processed Data

In [224]:
# Downloading the csv file from GitHub account
url3 = "https://raw.githubusercontent.com/julesz12345/Cryptocurrency-Prediction/main/Data/tweet_2020.csv"

download3 = requests.get(url3).content

# Reading the downloaded content and turning it into a pandas dataframe
tweet2020 = pd.read_csv(io.StringIO(download3.decode('utf-8')))

# Matching date columns
tweet2020.rename(columns={'created_at': 'Date'}, inplace=True)
tweet2020['Date'] = pd.to_datetime(tweet2020['Date'])
tweet2020['Date'] = tweet2020['Date'].dt.strftime('%Y-%m-%d')
tweet2020['Date'] = pd.to_datetime(tweet2020['Date'])
ada['Date'] = pd.to_datetime(ada['Date'])

In [225]:
# Seeing how much price changed in consecutive days (in percentage)
ada = ada.reset_index(drop=True)
ada['Price_Change']=0
for i in range(len(ada)-1):
    ada['Price_Change'][i]=((ada['Close'][i+1] - ada['Close'][i])/ada['Close'][i])*100

In [226]:
ada.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Price_Change,Change
0,2020-10-01,0.101202,0.104311,0.096007,0.097878,0.097878,499813017.0,-5,Decrease
1,2020-10-02,0.097894,0.099826,0.090471,0.092913,0.092913,602540074.0,0,Neutral
2,2020-10-03,0.092913,0.095158,0.092635,0.093684,0.093684,334059453.0,2,Neutral
3,2020-10-04,0.093684,0.096791,0.092884,0.096301,0.096301,482409858.0,1,Neutral
4,2020-10-05,0.096301,0.098844,0.096255,0.097687,0.097687,432817916.0,-4,Neutral


## Sentiment Analysis

In [227]:
# Scoring every tweet and appending it to dataset
i=0
compval1 = [ ]
while (i<len(tweet2020)):
    k = analyser.polarity_scores(tweet2020.iloc[i]['text'])
    compval1.append(k['compound'])  
    i = i+1
compval1 = np.array(compval1)
tweet2020['VADER_score'] = compval1

In [228]:
tweet2020.head()

Unnamed: 0.1,Unnamed: 0,lang,user-location,place-country,place-country_code,location-coordinates,user-screen_name,Date,quote_count,reply_count,retweet_count,retweeted,is_retweet,is_quote_status,Unnamed: 14,text,VADER_score
0,0,en,"Jackson, Wyoming",,,,TetonBlocks,2020-12-31,0,0,0,False,0,False,,happy new year everyone ! thanks part # cardan...,0.8908
1,1,en,"New York, NY",,,,davidgokhshtein,2020-12-31,6,15,47,False,0,False,,former us congressional candidate states carda...,0.8555
2,2,en,New York,,,,RealSaidov,2020-12-31,0,1,0,False,0,False,,wish could understand whale . cardano become r...,0.4019
3,3,en,"New York, NY",,,,BTCstJournal,2020-12-31,0,0,1,False,0,False,,cardano price analysis : 31 december https : /...,0.0
4,4,en,United States,,,,fintechnewshub,2020-12-31,0,0,1,False,0,False,,featured article today ! please n't forget sub...,0.5037


In [229]:
# Daily average aggregated
tweet2020_sentiment = tweet2020.groupby('Date')['VADER_score'].mean().to_frame()
tweet2020_sentiment.reset_index(inplace=True)

In [230]:
tweet2020_sentiment

Unnamed: 0,Date,VADER_score
0,2020-10-01,0.245195
1,2020-10-02,0.221654
2,2020-10-03,0.095984
3,2020-10-04,0.402757
4,2020-10-05,0.280624
...,...,...
87,2020-12-27,0.236708
88,2020-12-28,0.293765
89,2020-12-29,0.305050
90,2020-12-30,0.273837


In [231]:
# Seeing how much sentiment changed in consecutive days (in percentage)
tweet2020_sentiment = tweet2020_sentiment.reset_index(drop=True)
tweet2020_sentiment['Sentiment_Change']=0
for i in range(len(tweet2020_sentiment)-1):
    tweet2020_sentiment['Sentiment_Change'][i]=((tweet2020_sentiment['VADER_score'][i+1] - tweet2020_sentiment['VADER_score'][i])/tweet2020_sentiment['VADER_score'][i])*100

In [232]:
tweet2020_sentiment.head()

Unnamed: 0,Date,VADER_score,Sentiment_Change
0,2020-10-01,0.245195,-9
1,2020-10-02,0.221654,-56
2,2020-10-03,0.095984,319
3,2020-10-04,0.402757,-30
4,2020-10-05,0.280624,-49


In [233]:
# Merging both datasets
ada_tweets = pd.merge(tweet2020_sentiment,ada,on='Date',how='left')
ada_tweets.head(10)

Unnamed: 0,Date,VADER_score,Sentiment_Change,Open,High,Low,Close,Adj Close,Volume,Price_Change,Change
0,2020-10-01,0.245195,-9,0.101202,0.104311,0.096007,0.097878,0.097878,499813017.0,-5.0,Decrease
1,2020-10-02,0.221654,-56,0.097894,0.099826,0.090471,0.092913,0.092913,602540074.0,0.0,Neutral
2,2020-10-03,0.095984,319,0.092913,0.095158,0.092635,0.093684,0.093684,334059453.0,2.0,Neutral
3,2020-10-04,0.402757,-30,0.093684,0.096791,0.092884,0.096301,0.096301,482409858.0,1.0,Neutral
4,2020-10-05,0.280624,-49,0.096301,0.098844,0.096255,0.097687,0.097687,432817916.0,-4.0,Neutral
5,2020-10-06,0.141001,60,0.097634,0.098548,0.092168,0.092909,0.092909,783688998.0,1.0,Neutral
6,2020-10-07,0.225863,32,0.092909,0.094394,0.090105,0.093936,0.093936,420412930.0,2.0,Neutral
7,2020-10-08,0.29956,-31,0.093944,0.096776,0.090903,0.096242,0.096242,457557891.0,9.0,Increase
8,2020-10-09,0.204172,21,,,,,,,,
9,2020-10-10,0.248232,-35,0.101466,0.11054,0.101445,0.105025,0.105025,923715999.0,1.0,Neutral


## Plots

In [234]:
ada_tweets_copy=ada_tweets[ada_tweets['Date']>='2020-10-01']
ada_tweets_copy.iloc[91,9] = 0

In [235]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Bar(x=ada_tweets_copy['Date'], y=ada_tweets_copy['Price_Change'], name="Price Change"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=ada_tweets_copy['Date'], y=ada_tweets_copy['VADER_score'], name="Sentiment Score"),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Price_Change vs Sentiment Score for Twitter Data"
)

# Set x-axis title
fig.update_xaxes(title_text="Date")

# Set y-axes titles
fig.update_yaxes(title_text="<b>Price Change</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>Sentiment Score</b>", secondary_y=True)

fig.show()