<a href="https://colab.research.google.com/github/Chienlovecode/Apple_stocks_predict/blob/main/Apple_Predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Import libraries
import pandas as pd # Import pandas and give it alias pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf

from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col, avg, lag, when
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_absolute_error, r2_score

# 2. Create (or reuse) the SparkSession used by every later cell.
# Executor and driver memory are both requested as 4 GB.
spark = (
    SparkSession.builder
    .appName("StockLSTM_PySpark_AAPL")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)


# Download AAPL data via yfinance and load into Spark

In [None]:
# Step 1: Download AAPL price history with yfinance and load it into Spark
START = "2015-01-01"
TODAY = pd.to_datetime("today").strftime("%Y-%m-%d")

# Download into pandas; reset_index() exposes the index as a regular column
# (referenced as 'Date' below).
pdf = yf.download('AAPL', start=START, end=TODAY).reset_index()

# Fail fast if nothing came back (e.g. rate limiting or no network), instead of
# letting an empty frame propagate into Spark.
if pdf.empty:
    raise ValueError("The downloaded data is empty. Check your date range or internet connection.")

# Flatten MultiIndex columns (if present)
if isinstance(pdf.columns, pd.MultiIndex):
    # Keep level 0 (Open, High, Low, Close, Adj Close, Volume, Date)
    pdf.columns = pdf.columns.get_level_values(0)

# Build the Spark DataFrame from the pandas frame
df_spark = spark.createDataFrame(pdf)

# Convert 'Date' to a date column and sort chronologically.
# to_date() without a format follows Spark's normal cast-to-date rules, so the
# column is converted directly. The previous version cast to string and parsed
# it with "yyyy-MM-dd", which only worked after switching the whole session to
# spark.sql.legacy.timeParserPolicy=LEGACY; that session-wide setting is no
# longer needed and is not applied here.
df_spark = (
    df_spark
    .withColumn("Date", to_date(col("Date")))
    .orderBy("Date")
)

df_spark.printSchema()
df_spark.show(5)

# Feature Engineering in Spark

In [None]:
# All windows below are ordered by trading date and have no partitionBy,
# so every window is evaluated over the whole (single-ticker) series.
by_date = Window.orderBy("Date")

# Moving Average (MA20): mean of 'Close' over the current row and the 19 before it.
# The first 19 rows have fewer than 20 rows in their frame, so their MA20 is an
# average over a shorter history rather than null.
w = by_date.rowsBetween(-19, 0)
df_spark = df_spark.withColumn("MA20", avg("Close").over(w))

# RSI calculation
# delta is the day-over-day change in 'Close'; it is null on the first row
# (no previous row), which makes both gain and loss fall through to 0 there.
df_spark = df_spark.withColumn("delta", col("Close") - lag("Close", 1).over(by_date))
df_spark = df_spark.withColumn("gain", when(col("delta") > 0, col("delta")).otherwise(0))
df_spark = df_spark.withColumn("loss", when(col("delta") < 0, -col("delta")).otherwise(0))

# 14-row simple averages of gains and losses (current row + 13 preceding rows).
w14 = by_date.rowsBetween(-13, 0)
df_spark = (
    df_spark
    .withColumn("avg_gain", avg("gain").over(w14))
    .withColumn("avg_loss", avg("loss").over(w14))
)

# RS = avg_gain / avg_loss, RSI = 100 - 100 / (1 + RS).
# avg_loss is 0 whenever the window contains no down day, so the division is
# guarded explicitly:
#   * avg_loss > 0                 -> usual formula (same values as before)
#   * avg_loss == 0, avg_gain > 0  -> RSI = 100 (only gains in the window)
#   * both 0                       -> RS and RSI stay null (undefined)
# Unguarded, the zero divisor yields a null RSI (the row is then silently
# dropped by a later dropna) or, with ANSI SQL mode enabled, a runtime
# divide-by-zero error.
df_spark = (
    df_spark
    .withColumn("RS", when(col("avg_loss") > 0, col("avg_gain") / col("avg_loss")))
    .withColumn(
        "RSI",
        when(col("avg_loss") > 0, 100 - (100 / (1 + col("RS"))))
        .when(col("avg_gain") > 0, 100.0),
    )
)

df_spark.select("Date", "Close", "MA20", "RSI").show(5)

# Scaling features with Spark ML

In [None]:
# count and isnan are not part of the notebook's top-level imports
# (when and col already are), so they are imported here.
from pyspark.sql.functions import count, isnan

# Columns that are checked for missing values, assembled and scaled.
FEATURE_COLS = ["Open", "High", "Low", "Close", "Volume", "MA20", "RSI"]

# Count null/NaN values for every feature column in a single Spark job
# (one aggregation over the frame instead of one job per column).
null_counts = df_spark.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in FEATURE_COLS]
).first()
for column in FEATURE_COLS:
    null_count = null_counts[column]
    print(f"Number of nulls in column {column}: {null_count}")

# Drop rows with a missing value in any of the feature columns
df_spark = df_spark.dropna(subset=FEATURE_COLS)

# Pack the feature columns into a single vector column for Spark ML.
assembler = VectorAssembler(
    inputCols=FEATURE_COLS,
    outputCol="features_raw"
)
df_vec = assembler.transform(df_spark)

# Fit a min-max scaler on the assembled vectors and apply it.
# NOTE(review): the scaler is fit on every row of df_vec. If a train/test split
# is made later in the notebook, the min/max should come from the training rows
# only to avoid leaking test-set statistics — confirm against the later cells.
scaler = MinMaxScaler(inputCol="features_raw", outputCol="features_scaled")
scaler_model = scaler.fit(df_vec)
df_scaled = scaler_model.transform(df_vec)

df_scaled.select("Date", "features_scaled").show(5)


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Lỗi tải dữ liệu: Empty dataset received.. Thử lại sau 30 giây... (1/3)


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Lỗi tải dữ liệu: Empty dataset received.. Thử lại sau 30 giây... (2/3)


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Lỗi tải dữ liệu: Empty dataset received.. Thử lại sau 30 giây... (3/3)
Thất bại sau nhiều lần thử. Vui lòng thử lại sau.


In [None]:
# Remove the 'Date' and 'Adj Close' columns from the working frame
df_spark = df_spark.drop('Date', 'Adj Close')

# Bring only the 'Close' column to the driver as a pandas frame for plotting
df_plot = df_spark.select("Close").toPandas()

# Draw the close-price line with matplotlib (x axis is the row index)
fig, ax = plt.subplots()
ax.plot(df_plot['Close'])
ax.set_title("Close Price Visualization")
ax.set_xlabel("Time (Index)")
ax.set_ylabel("Close Price")
ax.grid(True)
plt.show()


NameError: name 'df_spark' is not defined