# ***Monte Carlo***

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=c947b67459267051d00624fce83299e9c0c92162c6b482252e19b419095e241a
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


Monte Carlo simulation is a computational technique used to understand the impact of risk and uncertainty in forecasting models. It involves using random sampling and probability distributions to model different possible outcomes of a process.*bold text*

In [2]:
import pyspark
import os
import sys
from pyspark import SparkContext
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
import numpy as np
import pyspark.sql.functions as fun
spark = SparkSession.builder.config("spark.driver.memory", "16g").appName('chapter_8').getOrCreate()

In [8]:
stocks = spark.read.option("header", "true").option("inferSchema", "true").csv("/content/AAME.csv")

In [16]:
stocks.show(2)

+----------+----+-----------------+------------------+------------------+------------------+------+
|      Date|Open|             High|               Low|             Close|         Adj Close|Volume|
+----------+----+-----------------+------------------+------------------+------------------+------+
|1980-03-17| 0.0|4.050000190734863|3.8499999046325684|3.8499999046325684|3.2131667137145996| 15000|
|1980-03-18| 0.0|              4.0| 3.799999952316284| 3.799999952316284|3.1714370250701904| 10200|
+----------+----+-----------------+------------------+------------------+------------------+------+
only showing top 2 rows



*we generate mean and standard deviation from every numerical column in the dataset.*

In [11]:
parameters = {}
for column in stocks.columns:
    if column != "Date" and column != "Symbol":  # Exclude non-numeric columns
        mean_value = stocks.select(fun.mean(column)).collect()[0][0]
        std_dev_value = stocks.select(fun.stddev(column)).collect()[0][0]
        parameters[column] = {"mean": mean_value, "std_dev": std_dev_value}

# Print the calculated parameters
for column, values in parameters.items():
    print(f"Column: {column}, Mean: {values['mean']}, Standard Deviation: {values['std_dev']}")

Column: Open, Mean: 2.479037434833667, Standard Deviation: 1.792742719789236
Column: High, Mean: 3.6387547097464177, Standard Deviation: 2.3921509846515936
Column: Low, Mean: 3.512713408899274, Standard Deviation: 2.3491839152505065
Column: Close, Mean: 3.5762375610480097, Standard Deviation: 2.3617920735737727
Column: Adj Close, Mean: 3.245731631079554, Standard Deviation: 2.0282785173545963
Column: Volume, Mean: 7977.757971875619, Standard Deviation: 16239.129044271978


In [20]:
# Define the number of simulations
num_simulations = 1000

# Define input parameters with probability distributions
mean_open = 2.479037434833667
std_dev_open = 1.792742719789236
mean_close = 3.5762375610480097
std_dev_close = 2.3617920735737727
mean_high = 3.6387547097464177
std_dev_high = 2.3921509846515936
mean_low = 3.512713408899274
std_dev_low = 102.3491839152505065
mean_vol=7977.757971875619
std_dev_vol=16239.129044271978

# Define UDFs to generate random numbers from normal distributions
@udf(FloatType())
def generate_random_open():
    return np.random.normal(mean_open, std_dev_open)

@udf(FloatType())
def generate_random_close():
    return np.random.normal(mean_close, std_dev_close)

@udf(FloatType())
def generate_random_high():
    return np.random.normal(mean_high, std_dev_high)

@udf(FloatType())
def generate_random_low():
    return np.random.normal(mean_low, std_dev_low)

@udf(FloatType())
def generate_random_vol():
  return np.random.normal(mean_vol,std_dev_vol)

# Generate random numbers in parallel using Spark
simulated_prices_df = stocks.select(
    "*",
    generate_random_open().alias("simulated_open"),
    generate_random_close().alias("simulated_close"),
    generate_random_high().alias("simulated_high"),
    generate_random_low().alias("simulated_low"),
    generate_random_vol().alias("simulated_vol")
)

# Show the results
simulated_prices_df.show()



+----------+----+------------------+------------------+------------------+------------------+------+--------------+---------------+--------------+-------------+-------------+
|      Date|Open|              High|               Low|             Close|         Adj Close|Volume|simulated_open|simulated_close|simulated_high|simulated_low|simulated_vol|
+----------+----+------------------+------------------+------------------+------------------+------+--------------+---------------+--------------+-------------+-------------+
|1980-03-17| 0.0| 4.050000190734863|3.8499999046325684|3.8499999046325684|3.2131667137145996| 15000|     4.9653726|      5.6987395|      4.797388|    -92.38986|     24821.18|
|1980-03-18| 0.0|               4.0| 3.799999952316284| 3.799999952316284|3.1714370250701904| 10200|   -0.23643057|      2.8225355|       4.77557|    13.010745|    -7475.026|
|1980-03-19| 0.0| 4.050000190734863|3.8499999046325684|3.8499999046325684|3.2131667137145996| 33500|     1.2348555|      5.70