<a href="https://colab.research.google.com/github/huyhuy382003/Apple_stocks_predict/blob/main/Huy_Apple_Stock_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1) Install Java, Spark, Hadoop
!apt-get update -qq
!apt-get install -y openjdk-8-headless -qq
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar xf spark-3.3.2-bin-hadoop3.tgz

# 2) Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

# 3) Install Python packages
!pip install -q pyspark yfinance findspark

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
E: Unable to locate package openjdk-8-headless


In [3]:
# ─ Install Java and Spark ──────────────────────────────────────
!apt-get update -qq
!apt-get install -y openjdk-8-jdk-headless -qq

# Download and unpack Spark
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar xf spark-3.3.2-bin-hadoop3.tgz -C /usr/local
!mv /usr/local/spark-3.3.2-bin-hadoop3 /usr/local/spark

# ─ Set environment vars ────────────────────────────────────────
import os
os.environ["JAVA_HOME"]  = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/usr/local/spark"

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package libxtst6:amd64.
(Reading database ... 126102 files and directories currently installed.)
Preparing to unpack .../libxtst6_2%3a1.2.3-1build4_amd64.deb ...
Unpacking libxtst6:amd64 (2:1.2.3-1build4) ...
Selecting previously unselected package openjdk-8-jre-headless:amd64.
Preparing to unpack .../openjdk-8-jre-headless_8u452-ga~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-8-jre-headless:amd64 (8u452-ga~us1-0ubuntu1~22.04) ...
Selecting previously unselected package openjdk-8-jdk-headless:amd64.
Preparing to unpack .../openjdk-8-jdk-headless_8u452-ga~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-8-jdk-headless:amd64 (8u452-ga~us1-0ubuntu1~22.04) ...
Setting up libxtst6:amd64 (2:1.2.3-1build4) ...
Setting up openjdk-8-jre-headless:amd64 (8u452-ga~us1-0ubu

In [1]:
import os
# (re-export in case you restarted runtime)
os.environ["JAVA_HOME"]  = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/usr/local/spark"

import findspark
findspark.init()   # no args needed now that SPARK_HOME is set

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("AAPL_Prediction_Colab") \
    .getOrCreate()

print("✅ SparkSession started:", spark)

✅ SparkSession started: <pyspark.sql.session.SparkSession object at 0x7f1f239429d0>


In [5]:
# 1) Download Apple data in pandas
import yfinance as yf
pdf = yf.download("AAPL",
                  start="2018-01-01",
                  progress=False,
                  group_by="column",
                  auto_adjust=False).reset_index()
# Make sure columns are flat and only the ones we need:
import pandas as pd
if isinstance(pdf.columns, pd.MultiIndex):
    pdf.columns = pdf.columns.get_level_values(0)
pdf = pdf[["Date","Open","High","Low","Close","Volume"]]

# 2) Write it out to a local CSV
csv_path = "/content/aapl.csv"
pdf.to_csv(csv_path, index=False)

# 3) Let Spark read the CSV (no Python‐object pickling at all!)
from pyspark.sql.functions import to_date
df = (spark.read
           .option("header","true")
           .option("inferSchema","true")
           .csv(csv_path)
           .withColumn("Date", to_date("Date","yyyy-MM-dd"))
           .orderBy("Date"))

# 4) Now continue with your feature‐engineering + modeling
from pyspark.sql.functions import lag, avg
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

w = Window.orderBy("Date")
df = (df
      .withColumn("PrevClose", lag("Close",1).over(w))
      .withColumn("MA5",      avg("Close").over(w.rowsBetween(-5,-1)))
      .withColumn("MA10",     avg("Close").over(w.rowsBetween(-10,-1)))
      .na.drop())

assembler = VectorAssembler(
    inputCols=["Open","High","Low","Volume","PrevClose","MA5","MA10"],
    outputCol="features"
)
train_df, test_df = df.randomSplit([0.8,0.2], seed=42)

rf = RandomForestRegressor(
    labelCol="Close", featuresCol="features",
    numTrees=100, maxDepth=5, seed=42
)
model = Pipeline(stages=[assembler, rf]).fit(train_df)

preds = model.transform(test_df)
rmse = RegressionEvaluator(
    labelCol="Close", predictionCol="prediction",
    metricName="rmse"
).evaluate(preds)
print(f"AAPL Test RMSE: {rmse:.4f}")

preds.select("Date","Close","prediction") \
     .orderBy("Date") \
     .show(10,truncate=False)


AAPL Test RMSE: 3.1447
+----------+------------------+------------------+
|Date      |Close             |prediction        |
+----------+------------------+------------------+
|2018-01-05|43.75             |43.70016875504387 |
|2018-01-11|43.81999969482422 |43.731604847009194|
|2018-01-16|44.04750061035156 |43.987712890172396|
|2018-01-23|44.2599983215332  |43.987712890172396|
|2018-01-31|41.85749816894531 |40.10765680971549 |
|2018-02-06|40.75749969482422 |40.016806264201115|
|2018-02-14|41.842498779296875|40.016806264201115|
|2018-02-23|43.875            |43.9436841741815  |
|2018-03-09|44.994998931884766|43.987712890172396|
|2018-03-12|45.43000030517578 |43.987712890172396|
+----------+------------------+------------------+
only showing top 10 rows



In [6]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE (you already have)
rmse_evaluator = RegressionEvaluator(
    labelCol="Close", predictionCol="prediction", metricName="rmse"
)
rmse = rmse_evaluator.evaluate(preds)

# MAE
mae_evaluator = RegressionEvaluator(
    labelCol="Close", predictionCol="prediction", metricName="mae"
)
mae = mae_evaluator.evaluate(preds)

# R²
r2_evaluator = RegressionEvaluator(
    labelCol="Close", predictionCol="prediction", metricName="r2"
)
r2 = r2_evaluator.evaluate(preds)

print(f"RMSE = {rmse:.4f}, MAE = {mae:.4f}, R² = {r2:.4f}")


RMSE = 3.1447, MAE = 2.0571, R² = 0.9973
