# Sample Data

## Bitcoin

In [12]:
# Define period
import datetime

# Length of the sampling window, in days, counted back from "now".
LOOKBACK_DAYS = 20

# Take a single "now" snapshot so both ends of the window derive from the same
# instant: two separate now() calls differ by a few microseconds and can even
# land on different calendar days when the cell runs right at midnight.
enddate = datetime.datetime.now()
startdate = enddate - datetime.timedelta(days=LOOKBACK_DAYS)

# "YYYY-MM-DD" strings used as query parameters by the data-fetching cells.
starttime = startdate.strftime("%Y-%m-%d")
endtime = enddate.strftime("%Y-%m-%d")

In [None]:
# Extract bitcoin prices
from urllib.request import urlopen
import requests
import pandas as pd

# Query the CoinDesk historical close endpoint for the [starttime, endtime] window.
# `params` lets requests build/escape the query string, `timeout` keeps the cell
# from hanging forever on a stalled connection, and raise_for_status() turns an
# HTTP error into an immediate exception instead of a confusing failure when the
# payload is parsed below.
# NOTE: despite its name, `html` holds the HTTP response object (JSON payload).
html = requests.get(
    "https://api.coindesk.com/v1/bpi/historical/close.json",
    params={"start": starttime, "end": endtime},
    timeout=30,
)
html.raise_for_status()
bitcoin = html.json()

# Dictionary {Date : Bitcoin price}
d_bitcoin = bitcoin["bpi"]

# Convert to DataFrame: one row per (date, price) pair
df_bitcoin = pd.DataFrame(list(d_bitcoin.items()), columns=['date', 'bitcoin'])

## News

In [17]:
from datetime import date, timedelta

# Span of the sampling window, as a timedelta
delta = enddate - startdate
# One "YYYY-MM-DD" string for each day offset 0..delta.days from startdate
dates = [
    day.strftime("%Y-%m-%d")
    for day in (startdate + timedelta(days=offset) for offset in range(delta.days + 1))
]

In [18]:
import os
import statistics
from getpass import getpass
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# Never keep the NewsAPI key in the notebook: read it from the environment and
# fall back to an interactive prompt. The key that used to be hardcoded in this
# cell should be treated as leaked and rotated.
NEWSAPI_KEY = os.environ.get("NEWSAPI_KEY") or getpass("NewsAPI key: ")

mean_scores = []  # Mean sentiment score per day, aligned index-for-index with `dates`
for day in dates:  # `day` (not `date`) so the imported datetime.date is not shadowed
    # Only the English news about bitcoin published on this day
    response = requests.get(
        "https://newsapi.org/v2/everything",
        params={
            "q": "bitcoin",
            "from": day,
            "to": day,
            "sortBy": "publishedAt",
            "language": "en",
            "apiKey": NEWSAPI_KEY,
        },
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of hitting a KeyError on the payload below
    response.raise_for_status()
    news = response.json()

    # Compound sentiment score of each article title for this day;
    # articles without a title are skipped because there is nothing to score.
    compound = [
        sia.polarity_scores(article["title"])["compound"]
        for article in news.get("articles", [])
        if article.get("title")
    ]

    # statistics.mean() raises on empty data, so a day with no scorable article
    # is recorded as NaN. This keeps `mean_scores` the same length as `dates`;
    # drop or impute NaN rows before modelling.
    mean_scores.append(statistics.mean(compound) if compound else float("nan"))

In [23]:
# Pair each day with its mean headline sentiment in a two-column pandas frame
sentiment_columns = {'date': dates, 'sentiment': mean_scores}
sentiment_scores = pd.DataFrame(sentiment_columns)

# Spark

In [20]:
from pyspark.sql import SparkSession

# Get (or create) a local Spark session named "project", using master "local[1]"
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("project")
    .getOrCreate()
)

In [21]:
# Turn the pandas bitcoin price frame into a Spark DataFrame,
# then print its schema and its first 20 rows (the default for show()).
spark_bitcoin = spark.createDataFrame(df_bitcoin)
spark_bitcoin.printSchema()
spark_bitcoin.show(20)

root
 |-- date: string (nullable = true)
 |-- bitcoin: double (nullable = true)

+----------+----------+
|      date|   bitcoin|
+----------+----------+
|2021-10-02|47671.9367|
|2021-10-03|48233.0983|
|2021-10-04|  49240.26|
|2021-10-05|  51495.32|
|2021-10-06| 55340.015|
|2021-10-07| 53798.405|
|2021-10-08|53944.2767|
|2021-10-09|54960.8483|
|2021-10-10|54686.1333|
|2021-10-11|57490.3067|
|2021-10-12| 56001.745|
|2021-10-13|57375.7267|
|2021-10-14|57355.0283|
|2021-10-15|  61693.79|
|2021-10-16|60871.4467|
|2021-10-17|61518.8433|
|2021-10-18|62025.1667|
|2021-10-19|64286.5167|
|2021-10-20|66015.0967|
|2021-10-21|62201.9633|
+----------+----------+



In [24]:
# Turn the pandas sentiment frame into a Spark DataFrame,
# then print its schema and its first 20 rows (the default for show()).
spark_sent = spark.createDataFrame(sentiment_scores)
spark_sent.printSchema()
spark_sent.show(20)

root
 |-- date: string (nullable = true)
 |-- sentiment: double (nullable = true)

+----------+--------------------+
|      date|           sentiment|
+----------+--------------------+
|2021-10-02|             0.08999|
|2021-10-03|            0.025085|
|2021-10-04|0.012860000000000007|
|2021-10-05|0.008915000000000003|
|2021-10-06|             0.07488|
|2021-10-07|0.036399999999999995|
|2021-10-08|            0.161375|
|2021-10-09|             0.10668|
|2021-10-10|            0.093935|
|2021-10-11|-0.04367500000000...|
|2021-10-12|            0.101235|
|2021-10-13|-0.00575499999999...|
|2021-10-14|0.049144999999999994|
|2021-10-15|             0.09211|
|2021-10-16|0.026699999999999995|
|2021-10-17|            0.035655|
|2021-10-18|            -0.05428|
|2021-10-19|             0.03883|
|2021-10-20| 0.10020000000000001|
|2021-10-21|             0.07749|
+----------+--------------------+
only showing top 20 rows



In [28]:
# Join both on date column.
# Joining on the column *name* (instead of an equality expression between the
# two frames' columns) makes Spark keep a single `date` column in the result,
# rather than two identically named columns that are ambiguous to reference.
sparkDF = spark_sent.join(spark_bitcoin, on="date", how="inner")

In [29]:
# The join key is no longer needed: keep only the sentiment and bitcoin columns
# by removing every column named "date".
join_key = 'date'
sparkDF = sparkDF.drop(join_key)

In [30]:
# Preview the joined data (20 rows is the show() default)
sparkDF.show(20)

+--------------------+----------+
|           sentiment|   bitcoin|
+--------------------+----------+
|             0.03883|64286.5167|
|             0.10668|54960.8483|
|            0.093935|54686.1333|
|             0.09211|  61693.79|
|0.026699999999999995|60871.4467|
|0.012860000000000007|  49240.26|
|             0.07488| 55340.015|
|-0.00575499999999...|57375.7267|
|            0.025085|48233.0983|
|            -0.05428|62025.1667|
| 0.10020000000000001|66015.0967|
|            0.161375|53944.2767|
|            0.035655|61518.8433|
|0.008915000000000003|  51495.32|
|             0.07749|62201.9633|
|0.049144999999999994|57355.0283|
|             0.08999|47671.9367|
|-0.04367500000000...|57490.3067|
|0.036399999999999995| 53798.405|
|            0.101235| 56001.745|
+--------------------+----------+



# MLlib

In [31]:
from pyspark.ml.feature import VectorAssembler

# Define features: pack the 'sentiment' column into the 'features' vector
# column that the regression cell below reads.
assembler = VectorAssembler(inputCols=['sentiment'], outputCol='features')
DF = assembler.transform(sparkDF)

In [34]:
# Split data into train/test (weights 0.7 / 0.3).
# A fixed seed makes the random split repeatable; without it every run trains
# and evaluates on different rows.
SPLIT_SEED = 42
splits = DF.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train_df, test_df = splits

In [35]:
from pyspark.ml.regression import LinearRegression

# Configure the regression: 'features' is the input vector column and
# 'bitcoin' is the label column to predict.
lr = LinearRegression(
    featuresCol='features',
    labelCol='bitcoin',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
# Fit on the training split only
lr_model = lr.fit(train_df)
# Uncomment to inspect the fitted parameters:
#print("Coefficients: " + str(lr_model.coefficients))
#print("Intercept: " + str(lr_model.intercept))

In [36]:
# Score the held-out test split with the fitted model
lr_predictions = lr_model.transform(test_df)
# Show predicted vs. actual price next to the input feature vector
(
    lr_predictions
    .select("prediction", "bitcoin", "features")
    .show()
)

+------------------+----------+--------------------+
|        prediction|   bitcoin|            features|
+------------------+----------+--------------------+
| 56449.77608017796|54960.8483|           [0.10668]|
| 56779.21874821476|54686.1333|          [0.093935]|
| 56826.39276698032|  61693.79|           [0.09211]|
| 58517.16129709307|60871.4467|[0.02669999999999...|
| 58874.90837638921|  49240.26|[0.01286000000000...|
|   59356.083367798|57375.7267|[-0.0057549999999...|
|58558.907072603426|48233.0983|          [0.025085]|
|58976.881803255055|  51495.32|[0.00891500000000...|
| 57204.30189265298|62201.9633|           [0.07749]|
| 56590.52267315249| 56001.745|          [0.101235]|
+------------------+----------+--------------------+

