In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.types as t
import pyspark.sql.functions as f
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd

SEED_VALUE = 42

In [2]:
spark = (SparkSession.builder.config("spark.driver.memory","5g").config("spark.driver.maxResultSize", "5g").getOrCreate())
spark

In [None]:
%%time
from pyspark.sql.types import DoubleType


X_train = spark.read.csv("./Dataset/train.csv", header=True, inferSchema=True)
X_test = spark.read.csv("./Dataset/example_test.csv", header=True, inferSchema=True)

# Convert VWAP Feature to double
X_train = X_train.withColumn("VWAP", X_train.VWAP.cast(DoubleType()))

X_train.printSchema()

In [None]:
X_train.head(1)

In [None]:
%%time
# Check null values if any

X_train.select([f.count(f.when(f.isnull(c), c)).alias(c) for c in X_train.columns]).show()

In [None]:
# Drop missing values
X_train = X_train.na.drop()

In [None]:
%%time
# Check null values if any

X_train.select([f.count(f.when(f.isnull(c), c)).alias(c) for c in X_train.columns]).show()

In [None]:
col_features = ['Count', "Open", "High", "Low", "Close", "Volume"]

In [None]:
from pyspark.ml.feature import VectorAssembler

vecAssembler = VectorAssembler(inputCols=col_features, outputCol="features")
output = vecAssembler.transform(X_train).select('features', 'Target')

output.select('features').show()

In [None]:
# Base Model (Without Time series considerations)
from pyspark.ml.regression import LinearRegression
train_data, test_data = output.randomSplit([0.75, 0.25]) # Randomly split the data into training and testing

lr = LinearRegression(featuresCol="features", labelCol = "Target")

lrModel = lr.fit(train_data) # Fit the model

In [None]:
lrModel.coefficients

In [None]:
lrModel.intercept

In [15]:
%%time
# Predict/Evaluate on test data

pred_results = lrModel.evaluate(test_data)
pred_results.predictions.show()



+--------------------+--------------------+--------------------+
|            features|              Target|          prediction|
+--------------------+--------------------+--------------------+
|[1.0,0.0251,0.025...|7.785829789781484E-4|-3.23497948120739...|
|[1.0,0.030485,0.0...|-0.01314475873544...|-3.51733445792442...|
|[1.0,0.03055,0.03...|0.007245901639344243|-3.50072249715613...|
|[1.0,0.0309,0.030...|0.013009708737864223|-3.42106101244415...|
|[1.0,0.0310170000...|0.001958224543081144|-3.20289515509599...|
|[1.0,0.03102,0.03...|-0.02023690410870...|-3.46525996110049...|
|[1.0,0.03184,0.03...|0.026107694238734824|-3.50332191630705...|
|[1.0,0.03193,0.03...|-0.03343749999999979|-3.40981882241790...|
|[1.0,0.032,0.032,...|-0.03998124413879...|-3.50583064507680...|
|[1.0,0.032234,0.0...|0.010310555159922075|-3.47381560174487...|
|[1.0,0.0322470000...|-0.01086517865458...|-3.22566142574302...|
|[1.0,0.0323100000...|-0.00675925925925...|-3.49029660570717...|
|[1.0,0.03258,0.03...|-6.

In [16]:
%%time

X_train = pd.read_csv("./Dataset/train.csv")
X_train.head()

CPU times: user 24.5 s, sys: 3.4 s, total: 27.9 s
Wall time: 28.2 s


Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [17]:
X_train.dropna(inplace=True)
X_train.describe()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
count,23486460.0,23486460.0,23486460.0,23486460.0,23486460.0,23486460.0,23486460.0,23486460.0,23486460.0,23486460.0
mean,1577541000.0,6.224257,295.4028,1476.051,1479.876,1472.882,1476.05,294731.7,,7.121863e-06
std,33333670.0,4.099476,879.6244,6119.951,6129.966,6110.473,6119.956,2471421.0,,0.005679042
min,1514765000.0,0.0,1.0,0.0011704,0.001195,0.0002,0.0011714,-0.3662812,-inf,-0.5093509
25%,1549306000.0,3.0,22.0,0.280867,0.2819,0.28,0.2808736,149.3896,0.2808644,-0.001694353
50%,1578971000.0,6.0,69.0,14.86842,14.89,14.844,14.86866,1332.36,14.8672,-4.28982e-05
75%,1606979000.0,9.0,231.0,234.965,235.38,234.52,234.9686,29115.12,234.9653,0.001601519
max,1632181000.0,13.0,165016.0,64805.94,64900.0,64670.53,64808.54,759755400.0,inf,0.9641699


In [18]:
y_train = X_train['Target']
X_train = X_train.drop(["Target", "VWAP"], axis=1)

In [19]:
from sklearn.model_selection import train_test_split
X_train_1, X_test, y_train_1, y_test = train_test_split(X_train, y_train, random_state=SEED_VALUE, train_size=0.80)

In [1]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()

fitted_lr_model = lr_model.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [None]:
preds = fitted_lr_model.predict(X_test)
preds

In [None]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, preds)
mae