# Ejercicio (pruebas con PySpark)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import explode, split, col, regexp_replace, size, length, lower
import requests

In [2]:
spark = SparkSession.builder.appName("Analaislibro").getOrCreate()
url = "https://www.gutenberg.org/cache/epub/28885/pg28885.txt"

In [3]:
response =requests.get(url)
libro_texto = response.text

In [4]:
with open("libro.txt", "w", encoding="utf-8") as file:
    file.write(libro_texto)

In [5]:
rdd = spark.sparkContext.textFile("libro.txt")
rdd = rdd.filter(lambda line: line.strip() !="")

In [6]:
capitulos = rdd.filter(lambda line: line.strip().startswith("CHAPTER")).collect()
parrafos = rdd.flatMap(lambda line: line.split("\n\n"))
df_parrafos = parrafos.map(lambda p: (p,)).toDF(["parrafo"])
df_parrafos.show()

+--------------------+
|             parrafo|
+--------------------+
|The Project Guten...|
|This ebook is for...|
|most other parts ...|
|whatsoever. You m...|
|of the Project Gu...|
|at www.gutenberg....|
|you will have to ...|
|before using this...|
|Title: Alice's Ad...|
|Author: Lewis Car...|
|Contributor: Aust...|
|Illustrator: Arth...|
|Release date: May...|
|                M...|
|   Language: English|
|Credits: Produced...|
|        Proofread...|
|        produced ...|
|        Universit...|
|*** START OF THE ...|
+--------------------+
only showing top 20 rows



In [7]:
df_parrafos = df_parrafos.withColumn("parrafo_limpio", regexp_replace(col("parrafo"), "[^a-zA-Z\s]", "").alias("parrafo_limpio"))
df_parrafos = df_parrafos.withColumn("parrafo_limpio", regexp_replace(col("parrafo_limpio"), "\s+", " ").alias("parrafo_limpio"))

In [8]:
df_parrafos = df_parrafos.withColumn("oraciones", split(col("parrafo_limpio"), "\."))
df_parrafos = df_parrafos.withColumn("num_oraciones", size(col("oraciones")))
df_parrafos = df_parrafos.withColumn("palabras", split(col("parrafo_limpio"), " "))
df_parrafos = df_parrafos.withColumn("num_palabras", size(col("palabras")))

In [9]:
df_parrafos.show()

+--------------------+--------------------+--------------------+-------------+--------------------+------------+
|             parrafo|      parrafo_limpio|           oraciones|num_oraciones|            palabras|num_palabras|
+--------------------+--------------------+--------------------+-------------+--------------------+------------+
|The Project Guten...|The Project Guten...|[The Project Gute...|            1|[The, Project, Gu...|           9|
|This ebook is for...|This ebook is for...|[This ebook is fo...|            1|[This, ebook, is,...|          14|
|most other parts ...|most other parts ...|[most other parts...|            1|[most, other, par...|          14|
|whatsoever. You m...|whatsoever You ma...|[whatsoever You m...|            1|[whatsoever, You,...|          14|
|of the Project Gu...|of the Project Gu...|[of the Project G...|            1|[of, the, Project...|          11|
|at www.gutenberg....|at wwwgutenbergor...|[at wwwgutenbergo...|            1|[at, wwwgutenberg.

# Ejercicio (probar con nuevo libro)

## Descarga del texto del libro desde Project Gutenberg

In [10]:
url = "https://www.gutenberg.org/cache/epub/4280/pg4280.txt"
response = requests.get(url)
libro_texto = response.text

In [11]:
# Guarda el contenido del libro en un archivo local
with open("libro.txt", "w", encoding="utf-8") as file:
    file.write(libro_texto)

In [12]:
# Lectura del archivo con Spark
rdd = spark.sparkContext.textFile("libro.txt")

In [13]:
# Filtra líneas vacías
rdd = rdd.filter(lambda line: line.strip() != "")

In [14]:
# Detectar secciones en mayúsculas o con encabezados "CHAPTER"
capitulos = rdd.filter(lambda line: line.strip().upper().startswith("CHAPTER") or line.isupper()).collect()

In [15]:
# Dividir el texto en párrafos
parrafos = rdd.flatMap(lambda line: line.split("\n\n"))

In [16]:
# Crear DataFrame de párrafos
parrafos_rdd = parrafos.map(lambda p: Row(parrafo=p))
df_parrafos = spark.createDataFrame(parrafos_rdd)
df_parrafos.show()

+--------------------+
|             parrafo|
+--------------------+
|The Project Guten...|
|This ebook is for...|
|most other parts ...|
|whatsoever. You m...|
|of the Project Gu...|
|at www.gutenberg....|
|you will have to ...|
|before using this...|
|Title: The Critiq...|
|Author: Immanuel ...|
|Translator: J. M....|
|Release date: Jul...|
|                M...|
|   Language: English|
|Credits: Charles ...|
|*** START OF THE ...|
|      [Illustration]|
|The Critique of P...|
|    By Immanuel Kant|
|Translated by J. ...|
+--------------------+
only showing top 20 rows



In [17]:
# Eliminar caracteres no alfabéticos
df_parrafos = df_parrafos.withColumn(
    "parrafo_limpio", regexp_replace(col("parrafo"), "[^a-zA-Z\s]", "")
)

In [18]:
# Normalizar espacios en blanco
df_parrafos = df_parrafos.withColumn(
    "parrafo_limpio", regexp_replace(col("parrafo_limpio"), "\s+", " ")
)

In [19]:
# Dividir en oraciones y palabras, contar su número
df_parrafos = df_parrafos.withColumn("oraciones", split(col("parrafo_limpio"), "\\."))
df_parrafos = df_parrafos.withColumn("num_oraciones", size(col("oraciones")))
df_parrafos = df_parrafos.withColumn("palabras", split(col("parrafo_limpio"), " "))
df_parrafos = df_parrafos.withColumn("num_palabras", size(col("palabras")))

In [20]:
# Agregar longitud de caracteres
df_parrafos = df_parrafos.withColumn("longitud_caracteres", length(col("parrafo_limpio")))

In [21]:
# Mostrar los resultados finales
df_parrafos.show()

+--------------------+--------------------+--------------------+-------------+--------------------+------------+-------------------+
|             parrafo|      parrafo_limpio|           oraciones|num_oraciones|            palabras|num_palabras|longitud_caracteres|
+--------------------+--------------------+--------------------+-------------+--------------------+------------+-------------------+
|The Project Guten...|The Project Guten...|[The Project Gute...|            1|[The, Project, Gu...|          10|                 58|
|This ebook is for...|This ebook is for...|[This ebook is fo...|            1|[This, ebook, is,...|          14|                 69|
|most other parts ...|most other parts ...|[most other parts...|            1|[most, other, par...|          14|                 72|
|whatsoever. You m...|whatsoever You ma...|[whatsoever You m...|            1|[whatsoever, You,...|          14|                 67|
|of the Project Gu...|of the Project Gu...|[of the Project G...|     

In [22]:
# Contar las palabras más frecuentes en el texto
df_palabras = df_parrafos.withColumn("palabra", explode(split(lower(col("parrafo_limpio")), " ")))
df_palabras = df_palabras.groupBy("palabra").count().orderBy("count", ascending=False)

In [23]:
# Mostrar las 20 palabras más comunes
df_palabras.show(20)

+-------+-----+
|palabra|count|
+-------+-----+
|    the|16131|
|     of|13353|
|     to| 6279|
|     in| 5977|
|     is| 4852|
|    and| 4836|
|      a| 4405|
|  which| 3365|
|   that| 3039|
|     it| 2914|
|     as| 2839|
|     be| 2380|
|   this| 2261|
|    not| 1931|
|     we| 1915|
|    but| 1759|
|    for| 1679|
|    all| 1424|
|     by| 1390|
|     an| 1381|
+-------+-----+
only showing top 20 rows

