In [None]:
import os
from pyspark.sql import SparkSession

# obtener ip del driver
IP = !hostname -I | cut -d' ' -f1
IP = IP[0]

# crear la sesión spark
APP_NAME = f"Ejercicio10_SumSquares_{os.environ['JUPYTERHUB_USER']}"
SPARK_URL = "spark://spark-master.spark.svc.cluster.local:7077"

spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(SPARK_URL)
    .config("spark.driver.host", IP)
    .config("spark.executor.memory", "512M")
    .config("spark.cores.max", 2)
    .config("spark.executor.instances", 2)
    .getOrCreate()
)

sc = spark.sparkContext

print("Conectado correctamente a Spark.")
print("Master:", SPARK_URL)
print("App:", APP_NAME)
print("Driver IP:", IP)
print("Paralelismo por defecto:", sc.defaultParallelism)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/12 22:14:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Conectado correctamente a Spark.
Master: spark://spark-master.spark.svc.cluster.local:7077
App: Ejercicio10_SumSquares_cap1462_09
Driver IP: 10.42.10.151
Paralelismo por defecto: 2


In [2]:
import time

# Problem size: number of elements in the input RDD.
N = 1_000_000

# Input values: each index i is passed through an affine formula
# (i * 1103515245 + 12345), reduced modulo 1_000_000 and scaled into [0, 1).
rdd = sc.range(0, N, numSlices=sc.defaultParallelism).map(
    lambda i: ((i * 1103515245 + 12345) % 1_000_000) / 1_000_000.0
)

# Time the sum-of-squares action with a monotonic, high-resolution clock.
# time.time() follows the wall clock and can jump if the system clock is
# adjusted, which makes it a poor choice for measuring durations.
# NOTE(review): if this is the first job of the session, the figure may also
# include one-off start-up cost -- confirm before comparing with later runs.
t0 = time.perf_counter()
resultado = rdd.map(lambda x: x * x).sum()
elapsed = time.perf_counter() - t0

print(f"N={N:,}")
print(f"Resultado = {resultado:.6f}")
print(f"Tiempo total = {elapsed:.3f} s")




N=1,000,000
Resultado = 333330.833337
Tiempo total = 1.863 s


                                                                                

In [3]:
import time

def bench_sizes(N_list, num_slices=None):
    """Time the sum-of-squares job for several problem sizes.

    For each ``N`` an RDD of ``N`` values in ``[0, 1)`` is derived from the
    indices ``0..N-1``, ``map(x -> x*x).sum()`` is run on it, and one summary
    line is printed.

    Args:
        N_list: Iterable of problem sizes (number of elements per run).
        num_slices: Number of partitions for the input RDD. ``None`` (the
            default) uses ``sc.defaultParallelism``, read at each run, which
            matches the behaviour before this parameter existed.

    Returns:
        A list of ``(N, sum_of_squares, elapsed_seconds)`` tuples, one per
        entry of ``N_list`` and in the same order.
    """
    results = []
    for N in N_list:
        slices = sc.defaultParallelism if num_slices is None else num_slices
        rdd = sc.range(0, N, numSlices=slices).map(
            lambda i: ((i * 1103515245 + 12345) % 1_000_000) / 1_000_000.0
        )
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted.
        t0 = time.perf_counter()
        res = rdd.map(lambda x: x * x).sum()
        dt = time.perf_counter() - t0
        results.append((N, res, dt))
        print(f"N={N:,}  sumsq={res:.6f}  elapsed={dt:.3f}s")
    return results

# Run the benchmark at 10^5, 10^6 and 10^7 elements.
results_spark_sizes = bench_sizes([10 ** exponent for exponent in (5, 6, 7)])


N=100,000  sumsq=33333.702524  elapsed=0.195s
N=1,000,000  sumsq=333330.833337  elapsed=0.471s




N=10,000,000  sumsq=3333308.333375  elapsed=3.185s


                                                                                

In [4]:
# Shut down the current Spark session; the next cell builds a new one
# limited to a single core.
spark.stop()


In [5]:
# re-crear con 1 core
import os
from pyspark.sql import SparkSession

IP = !hostname -I | cut -d' ' -f1
IP = IP[0]

spark = (SparkSession.builder
         .appName(f"Ej10_c=1_{os.environ['JUPYTERHUB_USER']}")
         .master("spark://spark-master.spark.svc.cluster.local:7077")
         .config("spark.driver.host", IP)
         .config("spark.executor.memory", "512M")
         .config("spark.cores.max", 1)   # ← c = 1
         .config("spark.executor.instances", 1)
         .getOrCreate())
sc = spark.sparkContext
print("cores max =", spark.sparkContext.getConf().get('spark.cores.max'))

cores max = 1


In [6]:
import time
# Problem size shared by the c=1 and c=2 speedup runs.
N = 10_000_000

# Input values in [0, 1), derived from the indices 0..N-1.
rdd = sc.range(0, N, numSlices=sc.defaultParallelism).map(
    lambda i: ((i * 1103515245 + 12345) % 1_000_000) / 1_000_000.0
)

# Time the sum-of-squares action with a monotonic, high-resolution clock
# (time.time() can jump when the system clock is adjusted).
# NOTE(review): this appears to be the first job after the session was
# re-created, so the figure may include one-off start-up cost. Consider
# running an identical warm-up job before timing in both the c=1 and c=2
# cells so the speedup compares like with like.
t0 = time.perf_counter()
res1 = rdd.map(lambda x: x * x).sum()
t1 = time.perf_counter()
t_c1 = t1 - t0
print(f"c=1 → elapsed={t_c1:.3f}s  sumsq={res1:.6f}")




c=1 → elapsed=7.527s  sumsq=3333308.333375


                                                                                

In [7]:
# Shut down the single-core session; the next cell builds a new one with
# two cores.
spark.stop()


In [8]:

# Imports first, so the cell does not rely on `os` or `time` having been
# imported by an earlier cell.
import os
import time

from pyspark.sql import SparkSession

# NOTE: this cell still reuses IP and N from the earlier c=1 cells, so it
# measures the same problem size; run those cells first.
spark = (SparkSession.builder
         .appName(f"Ej10_c=2_{os.environ['JUPYTERHUB_USER']}")
         .master("spark://spark-master.spark.svc.cluster.local:7077")
         .config("spark.driver.host", IP)
         .config("spark.executor.memory", "512M")
         .config("spark.cores.max", 2)   # c = 2
         .config("spark.executor.instances", 2)
         .getOrCreate())
sc = spark.sparkContext

# Input values in [0, 1), derived from the indices 0..N-1.
rdd = sc.range(0, N, numSlices=sc.defaultParallelism).map(
    lambda i: ((i * 1103515245 + 12345) % 1_000_000) / 1_000_000.0
)

# Time the sum-of-squares action with a monotonic, high-resolution clock
# (time.time() can jump when the system clock is adjusted).
# NOTE(review): this is the first job on a freshly created session, so the
# figure may include one-off start-up cost. Consider running an identical
# warm-up job before timing in both the c=1 and c=2 cells.
t0 = time.perf_counter()
res2 = rdd.map(lambda x: x * x).sum()
t1 = time.perf_counter()
t_c2 = t1 - t0
print(f"c=2 → elapsed={t_c2:.3f}s  sumsq={res2:.6f}")




c=2 → elapsed=6.846s  sumsq=3333308.333375


                                                                                

In [9]:
# Number of cores used in the parallel (c=2) run.
CORES_PARALLEL = 2

# Speedup: single-core elapsed time divided by two-core elapsed time.
S = t_c1 / t_c2
# Efficiency: speedup divided by the number of cores.
E = S / CORES_PARALLEL
print(f"Speedup S(2)={S:.2f}  |  Eficiencia E(2)={E:.2f}")


Speedup S(2)=1.10  |  Eficiencia E(2)=0.55
