# Análisis sobre el efecto de los tuits (y otros factores) en el precio del Bitcoin y otras criptomonedas.

In [372]:
# Importo pandas para trabajar con dataframes, y numpy para posibles operaciones numéricas con los datos.

import pandas as pd
import numpy as np

In [373]:
dataset = pd.read_csv("Data_To_Hourervals_no_filter.csv", sep=';')

url_dataset = "https://www.kaggle.com/jaimebadiola/bitcoin-tweets-and-price/version/1"

In [374]:
dataset.head()

Unnamed: 0,Date,Compound_Score,n,Count_Negatives,Count_Positives,Count_Neutrals,Sent_Negatives,Sent_Positives,Open,High,Low,Close,Volume (BTC),Volume (Currency)
0,01/08/2017 0:00,0.097156,1027.0,148.0,403.0,476.0,-0.504061,0.432704,2855.81,2863.06,2823.0,2825.92,184.02,52295100.0
1,01/08/2017 1:00,0.064507,778.0,143.0,239.0,396.0,-0.381007,0.437953,2823.01,2860.02,2821.01,2853.38,77.3,219605.16
2,01/08/2017 2:00,0.119218,836.0,118.0,333.0,385.0,-0.394999,0.439269,2846.27,2858.04,2837.31,2841.6,135.83,386739.15
3,01/08/2017 3:00,0.004163,984.0,262.0,279.0,443.0,-0.431913,0.420278,2841.84,2863.88,2837.73,2862.93,143.2,408360.03
4,01/08/2017 4:00,0.065608,751.0,133.0,237.0,381.0,-0.405835,0.435645,2862.92,2876.0,2848.11,2874.99,222.53,637045.88


In [375]:
# Muestro los nombres de las columnas de mi dataset, para ver si me puedo deshacer de alguna que no 
# tenga relevancia para mi análisis.

print(dataset.columns.values)

['Date' 'Compound_Score' 'n' 'Count_Negatives' 'Count_Positives'
 'Count_Neutrals' 'Sent_Negatives' 'Sent_Positives' 'Open' 'High' 'Low'
 'Close' 'Volume (BTC)' 'Volume (Currency)']


### Voy a comparar el número de tuits negativos sobre bitcoin con su precio de cierre del día siguiente.

In [376]:
# Me interesan Date, Count_Negatives, Close, y la columna que tenga menos duplicates para guardarla hasta el 
# final. # Miro a ver cuál es:

def less_duplicates(dataset):
    print("Dataset rows:", len(dataset.index))
    for a in dataset:
        values = []
        for v in dataset["{}".format(a)]:
            values.append(v)
        print("{}:".format(a), len(set(values)))

print(less_duplicates(dataset))

Dataset rows: 12936
Date: 12936
Compound_Score: 12934
n: 2913
Count_Negatives: 1152
Count_Positives: 1585
Count_Neutrals: 1755
Sent_Negatives: 12934
Sent_Positives: 12936
Open: 12676
High: 12322
Low: 12426
Close: 12674
Volume (BTC): 12876
Volume (Currency): 4591
None


In [377]:
# Me sirve Date. Por tanto, me deshago de todas las columnas que no necesito:

mis_columnas = ["Date", "Count_Negatives", "Close"]

for columna in dataset.columns.values:
    if columna not in mis_columnas:
        dataset = dataset.drop(columna, axis = 1)

In [378]:
dataset.head()

Unnamed: 0,Date,Count_Negatives,Close
0,01/08/2017 0:00,148.0,2825.92
1,01/08/2017 1:00,143.0,2853.38
2,01/08/2017 2:00,118.0,2841.6
3,01/08/2017 3:00,262.0,2862.93
4,01/08/2017 4:00,133.0,2874.99


In [379]:
# Compruebo a ver cuántos valores nulos tengo en cada columna:

null_cols = dataset.isnull().sum()

null_cols[null_cols > 0]

Count_Negatives    271
dtype: int64

In [380]:
# No me van a servir esos registros, así que me deshago de ellos:

dataset = dataset[dataset.Count_Negatives.notnull()]

In [381]:
null_cols = dataset.isnull().sum()

null_cols[null_cols > 0]

Series([], dtype: int64)

In [382]:
dataset.head()

Unnamed: 0,Date,Count_Negatives,Close
0,01/08/2017 0:00,148.0,2825.92
1,01/08/2017 1:00,143.0,2853.38
2,01/08/2017 2:00,118.0,2841.6
3,01/08/2017 3:00,262.0,2862.93
4,01/08/2017 4:00,133.0,2874.99


In [383]:
# Quiero los valores de los datos en datetime, para poder categorizarlos en franjas temporales:

dataset["Date"] = pd.to_datetime(dataset["Date"], dayfirst = True)

In [384]:
dataset["Date"] = dataset["Date"].dt.date

In [385]:
dataset.head()

Unnamed: 0,Date,Count_Negatives,Close
0,2017-08-01,148.0,2825.92
1,2017-08-01,143.0,2853.38
2,2017-08-01,118.0,2841.6
3,2017-08-01,262.0,2862.93
4,2017-08-01,133.0,2874.99


In [386]:
dataset = dataset.set_index("Date")

In [387]:
# Ahora agrupo por día:

dataset = dataset.groupby("Date").agg({"Count_Negatives": "sum", "Close": "mean"})

In [388]:
dataset.head()

Unnamed: 0_level_0,Count_Negatives,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-01,7100.0,2764.869167
2017-08-02,4397.0,2711.504737
2017-08-03,4888.0,2745.320417
2017-08-04,4195.0,2827.97375
2017-08-05,2897.0,3161.75875


In [389]:
# Ahora tengo que hacer binning. Voy a aplicarlo a mi columna count, posiblemente dividiéndola en 
# Few, Medium, Many.

# Voy a estudiar los valores:

print("Minimum value:", dataset.Count_Negatives.min())
print("Maximum value:", dataset.Count_Negatives.max())
print("Average:", dataset.Count_Negatives.mean())
print("Meadian:", dataset.Count_Negatives.median())
print("Quantiles:", dataset.Count_Negatives.quantile([0.33, 0.66]))

Minimum value: 3.0
Maximum value: 32918.0
Average: 5882.769944341373
Meadian: 4931.0
Quantiles: 0.33    4176.54
0.66    6010.16
Name: Count_Negatives, dtype: float64


In [390]:
# Para hacer binning, voy a utilizar los quantiles.

Count_labels = ["Few", "Medium", "Many"]

bins = pd.qcut(dataset["Count_Negatives"], 3, labels = Count_labels)
dataset["Count_Negatives"] = bins

In [391]:
dataset.head()

Unnamed: 0_level_0,Count_Negatives,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-01,Many,2764.869167
2017-08-02,Medium,2711.504737
2017-08-03,Medium,2745.320417
2017-08-04,Medium,2827.97375
2017-08-05,Few,3161.75875


In [392]:
# Ahora voy a empezar a usar APIs para encontrar datos interesantes(precio bitcoin diario, precio ethereum diario, 
# precio oro diario, cambio-dolar bitcoin diario)

url = "https://api.coindesk.com/v1/bpi/historical/close.json?start=2017-01-08&end=2019-01-21"


In [393]:
import json
import requests

In [394]:
response = requests.get(url)
results = response.json()

In [395]:
dataset2 = pd.DataFrame(results)

In [396]:
dataset2.head()

Unnamed: 0,bpi,disclaimer,time
2017-01-08,913.5238,This data was produced from the CoinDesk Bitco...,
2017-01-09,899.35,This data was produced from the CoinDesk Bitco...,
2017-01-10,904.7925,This data was produced from the CoinDesk Bitco...,
2017-01-11,775.9813,This data was produced from the CoinDesk Bitco...,
2017-01-12,802.8288,This data was produced from the CoinDesk Bitco...,


In [397]:
dataset2 = dataset2.reset_index()

In [398]:
dataset2.head()

Unnamed: 0,index,bpi,disclaimer,time
0,2017-01-08,913.5238,This data was produced from the CoinDesk Bitco...,
1,2017-01-09,899.35,This data was produced from the CoinDesk Bitco...,
2,2017-01-10,904.7925,This data was produced from the CoinDesk Bitco...,
3,2017-01-11,775.9813,This data was produced from the CoinDesk Bitco...,
4,2017-01-12,802.8288,This data was produced from the CoinDesk Bitco...,


In [399]:
dataset2 = dataset2.rename(columns={"index": "Date"})

In [400]:
dataset2.head()

Unnamed: 0,Date2,bpi,disclaimer,time
0,2017-01-08,913.5238,This data was produced from the CoinDesk Bitco...,
1,2017-01-09,899.35,This data was produced from the CoinDesk Bitco...,
2,2017-01-10,904.7925,This data was produced from the CoinDesk Bitco...,
3,2017-01-11,775.9813,This data was produced from the CoinDesk Bitco...,
4,2017-01-12,802.8288,This data was produced from the CoinDesk Bitco...,


In [401]:
dataset2 = dataset2.drop(["disclaimer", "time"], axis = 1)

In [409]:
dataset2.head()

Unnamed: 0,Date2,bpi
0,2017-01-08,913.5238
1,2017-01-09,899.35
2,2017-01-10,904.7925
3,2017-01-11,775.9813
4,2017-01-12,802.8288


In [408]:
dataset.head()

Unnamed: 0_level_0,Count_Negatives,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-01,Many,2764.869167
2017-08-02,Medium,2711.504737
2017-08-03,Medium,2745.320417
2017-08-04,Medium,2827.97375
2017-08-05,Few,3161.75875


In [406]:
dataset3 = dataset.merge(dataset2, left_on='Date', right_on='Date2')

In [407]:
dataset3.head()

Unnamed: 0,Count_Negatives,Close,Date2,bpi


In [None]:
# Ahora quiero una columna Price_Change que muestre la diferencia entre el precio de cierre del día siguiente y 
# el de ese día, para ver cómo es el efecto de los tuits negativos de un día, sobre el precio del día siguiente.



In [139]:
# Ahora quiero que en cada registro me aparezca el valor de Close del día siguiente.
# Creo una lista de los valores de Close, incluyendo 0 como primer elemento y borrando el último valor

lista = [0]

lista += list(dataset["Close"])

lista.pop()
print(len(lista))
print(len(dataset["Close"]))

539
539


In [140]:
dataset["Close_Next_Day"] = lista

In [141]:
dataset.head()

Unnamed: 0_level_0,Count_Negatives,Close,Close_Next_Day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-08-01,7100.0,2764.869167,0.0
2017-08-02,4397.0,2711.504737,2764.869167
2017-08-03,4888.0,2745.320417,2711.504737
2017-08-04,4195.0,2827.97375,2745.320417
2017-08-05,2897.0,3161.75875,2827.97375


In [142]:
dataset = dataset[dataset.Close_Next_Day != 0]

In [143]:
dataset.head()

Unnamed: 0_level_0,Count_Negatives,Close,Close_Next_Day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-08-02,4397.0,2711.504737,2764.869167
2017-08-03,4888.0,2745.320417,2711.504737
2017-08-04,4195.0,2827.97375,2745.320417
2017-08-05,2897.0,3161.75875,2827.97375
2017-08-06,2795.0,3232.088333,3161.75875
