In [1]:
%%capture
# Install PySpark into the notebook runtime; %%capture hides pip's console output.
!pip install pyspark

In [2]:
import pyspark.sql.functions as f

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.utils import AnalysisException

from google.colab import drive
# Mount Google Drive at /content/gdrive; the book file used below is read from
# the MyDrive folder under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


#### Точка входа в сеанс Spark
1) `builder` — реализация шаблона «строитель» (builder) для создания сеанса Spark: методы настройки объединяются в цепочку.  
2) Лучше всегда указывать имя приложения с помощью `appName()`.  
3) Метод `getOrCreate()` возвращает уже существующий сеанс Spark, если он есть, и создаёт новый только при его отсутствии.

In [3]:
# Entry point of the notebook: configure the session through the builder chain,
# then reuse a running session or start a new one.
spark = SparkSession.builder.appName(
    "Analyzing the vocabulary of Pride and Prejudice."
).getOrCreate()

Вывод информации о sparkContext и о ф-ии

In [4]:
# Bare last expression: the notebook displays the SparkContext attached to the session.
spark.sparkContext

In [5]:
# `??spark.read` (IPython help syntax) would show the reader's source; it is kept
# commented out so the cell stays plain Python.
# List the attributes of the object returned by spark.read, then print the
# docstring of the session object.
print(dir(spark.read))
print(spark.__doc__)

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_df', '_jreader', '_set_opts', '_spark', 'csv', 'format', 'jdbc', 'json', 'load', 'option', 'options', 'orc', 'parquet', 'schema', 'table', 'text']
The entry point to programming Spark with the Dataset and DataFrame API.

    A SparkSession can be used to create :class:`DataFrame`, register :class:`DataFrame` as
    tables, execute SQL over tables, cache tables, and read parquet files.
    To create a :class:`SparkSession`, use the following builder pattern:

    .. versionchanged:: 3.4.0
        Supports Spark Connect.

    .. autoattribute:: builder
       :annotation:

    Examples
    --------
    Create a Spark session.

    >>> spark = (
  

Уровень логирования

In [6]:
# Set the log level of the underlying SparkContext to "WARN".
spark.sparkContext.setLogLevel("WARN")

#### Чтение данных  
Для чтения данных используется spark.read, можно указать разные форматы  
`spark.read.csv()` эквивалентно `spark.read.format('csv').load()`; лучше использовать первый вариант.

In [7]:
# Load the book from the mounted Drive as plain text. The resulting DataFrame
# has a single string column named `value` (see the schema printed below).
text = spark.read.text('/content/gdrive/MyDrive/1342-0.txt')

In [8]:
# printSchema() writes the schema tree to stdout itself and returns None, so it
# must not be wrapped in print(): that only adds a stray "None" line.
text.printSchema()
# dtypes is a plain list of (column name, type name) tuples, so print() is fine here.
print(text.dtypes)

root
 |-- value: string (nullable = true)

None
[('value', 'string')]


In [9]:
# Preview the first five records, one field per line, cutting values at 500 characters.
text.show(5, truncate=500, vertical=True)

-RECORD 0---------------------------------------------------------------------
 value | The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen   
-RECORD 1---------------------------------------------------------------------
 value |                                                                      
-RECORD 2---------------------------------------------------------------------
 value | This eBook is for the use of anyone anywhere at no cost and with     
-RECORD 3---------------------------------------------------------------------
 value | almost no restrictions whatsoever.  You may copy it, give it away or 
-RECORD 4---------------------------------------------------------------------
 value | re-use it under the terms of the Project Gutenberg License included  
only showing top 5 rows



#### SQL func
alias() - метод для переименования столбцов  
select() - для выбора данных  
split() - метод для преобразования строки в массив слов  

In [11]:
# Split each text line on single spaces; the resulting array column is exposed as "line".
lines = text.select(f.split(f.col('value'), ' ').alias("line"))
lines.show(5, truncate=10)

+----------+
|      line|
+----------+
|[The, P...|
|        []|
|[This, ...|
|[almost...|
|[re-use...|
+----------+
only showing top 5 rows



4 способа выбора колонки с помощью select  
второй способ решает проблему странных названий столбцов  
в третьем способе не указываем, что столбец берется из фрейма данных text, что может быть полезно в сложных случаях  
четвертый короткий но не самый предпочитаемый

In [12]:
# Four equivalent ways to select the `value` column. Only the last expression
# of the cell is displayed as its output.
# 1) Attribute access on the DataFrame.
text.select(text.value)
# 2) Item access on the DataFrame; also works for column names that are not
#    valid Python attribute names.
text.select(text["value"])
# 3) A column expression that does not name the DataFrame it comes from.
text.select(f.col("value"))
# 4) The column name as a plain string (shortest form).
text.select("value")

DataFrame[value: string]

если после select не использовать alias, то spark автоматически присвоит имя, которое можно изменить с помощью withColumnRenamed

In [13]:
# Without alias(), Spark generates the column name from the expression.
lines = text.select(f.split(f.col('value'),' '))
lines.show()
# Rename the auto-generated column. The old name must match the generated name
# exactly, as printed in the header of the first show() above.
lines = lines.withColumnRenamed('split(value,  , -1)','line')
lines.show()

+--------------------+
| split(value,  , -1)|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title:, Pride, a...|
|                  []|
|[Author:, Jane, A...|
|                  []|
|[Posting, Date:, ...|
|[Release, Date:, ...|
|[Last, Updated:, ...|
|                  []|
|[Language:, English]|
|                  []|
|[Character, set, ...|
|                  []|
+--------------------+
only showing top 20 rows

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title:, Pride, a...|
|                  []|
|[Author:, Jane, A...|
|                  []|
|[Posting, Date:, ...|
|[Release, Date:, ...|
|[Last, Updated:, ...|
|       

#### функция explode()  
При применении к столбцу, содержащему контейнерную структуру данных (например, массив), она будет принимать каждый элемент и присваивать ему свою собственную строку.

In [14]:
# Flatten the arrays: every element of `line` becomes its own row in column "word".
words = lines.select(f.explode(f.col('line')).alias('word'))
words.show()

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     EBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



#### функция lower  
меняет большие буквы на маленькие

In [17]:
# Normalise case so that "The" and "the" count as the same word.
words_lower = words.select(f.lower(f.col('word')).alias('word_lower'))
words_lower.show()

+----------+
|word_lower|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
|prejudice,|
|        by|
|      jane|
|    austen|
|          |
|      this|
|     ebook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



#### ф-ия regexp_extract  
убираем знаки препинания с помощью ф-ии regexp_extract и регулярного выражения

In [31]:
# Keep only the first run of lowercase ASCII letters in each token, which drops
# surrounding punctuation (e.g. "prejudice," -> "prejudice"). Tokens with no
# letters become empty strings and are filtered out in a later step.
# A single alias is enough; the original applied the same alias twice.
words_clean = words_lower.select(
    f.regexp_extract(f.col("word_lower"), r'[a-z]+', 0).alias("word")
)
words_clean.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|         |
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
+---------+
only showing top 20 rows



#### ф-ия filter  
берем только те строки, которые соответствуют условию

In [32]:
# Drop the rows whose word is the empty string.
words_nonull = (
    words_clean
    .filter(f.col('word') != '')
)
words_nonull.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
+---------+
only showing top 20 rows



#### Задание  
Посчитать, сколько столбцов имеют тип, отличный от строкового (string).

In [34]:
exo2_2_df = spark.createDataFrame(
    [["test", "more test", 10_000_000_000]], ["one", "two", "three"]
)
exo2_2_df.printSchema()

# Explicit-loop solution. DataFrame.dtypes already lists (column name, type name)
# pairs, so there is no need to build a one-column DataFrame per column just to
# read its type.
no_str = 0
for _column_name, type_name in exo2_2_df.dtypes:
    if type_name != 'string':
        no_str += 1
print(no_str)

# Concise solution: the same count as a single comprehension.
print(len([x for x, y in exo2_2_df.dtypes if y != "string"]))

root
 |-- one: string (nullable = true)
 |-- two: string (nullable = true)
 |-- three: long (nullable = true)

1
1


Задание: переписать код так, чтобы не использовать withColumnRenamed (вместо него — alias).

In [None]:
# Baseline version of the exercise: compute the length of every line, then
# rename the auto-generated "length(value)" column in a separate step.
exo2_3_df = (
  spark.read.text('/content/gdrive/MyDrive/1342-0.txt')
  .select(f.length(f.col("value")))
  .withColumnRenamed("length(value)", "number_of_char")
)

In [None]:
# Solution: name the computed column with alias() inside select(), so no
# separate renaming step is needed.
exo2_3_df = spark.read.text('/content/gdrive/MyDrive/1342-0.txt').select(
    f.length(f.col("value")).alias('number_of_char')
)

исправить ошибку

In [53]:
# Exercise as given (intentionally failing) so the error message can be inspected.
exo2_4_df = spark.createDataFrame([["key", 10_000, 20_000]], ["key", "value1", "value2"])
exo2_4_df.printSchema()

# greatest() yields, per row, the largest value among the given columns
# (not a column name).
# The first select() keeps only `maximum_value`, so the second select() cannot
# resolve `key` (nor `max_value`, which was never defined under that name).
try:
    exo2_4_mod = (
        exo2_4_df
        .select(
            f.greatest(f.col("value1"), f.col("value2"))
            .alias("maximum_value"),
        )
        .select("key", "max_value")
    )
except AnalysisException as err:
  # Print the analysis error instead of aborting the notebook.
  print(err)

root
 |-- key: string (nullable = true)
 |-- value1: long (nullable = true)
 |-- value2: long (nullable = true)

[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `key` cannot be resolved. Did you mean one of the following? [`maximum_value`].;
'Project ['key, 'max_value]
+- Project [greatest(value1#237L, value2#238L) AS maximum_value#242L]
   +- LogicalRDD [key#236, value1#237L, value2#238L], false



In [None]:
# Fix: request the computed maximum and the key column in the same select().
# The first column of this frame is named "key_col", so that is the name to
# select. The original selected "key", which does not exist in this frame and
# raised the same kind of AnalysisException again.
exo2_4_df = spark.createDataFrame([["key", 10_000, 20_000]], ["key_col", "value1", "value2"])
exo2_4_df.printSchema()

# greatest() yields, per row, the largest value among the given columns.
# No try/except here: the corrected query is expected to succeed, and catching
# the error would only hide a regression.
exo2_4_mod = exo2_4_df.select(
    f.greatest(f.col("value1"), f.col("value2")).alias("maximum_value"),
    f.col("key_col"),
)
exo2_4_mod.show()

Удалить слова «is», затем оставить только слова длиннее 3 символов.

In [59]:
# Rebuild the whole cleaning pipeline from the raw file so that this cell does
# not depend on the state left by earlier cells.
book = spark.read.text('/content/gdrive/MyDrive/1342-0.txt')
lines = book.select(
    f.split(book.value, " ").alias("line")
)
words = lines.select(
    f.explode(f.col("line")).alias("word")
)
words_lower = words.select(
    f.lower(f.col("word")).alias("word_lower")
)
words_clean = words_lower.select(
    f.regexp_extract(f.col("word_lower"), r"[a-z]+", 0).alias("word")
)
words_nonull = words_clean.filter(f.col("word") != "")
words_nonull.show()

# Exercise: drop the word "is", then keep only the words longer than three characters.
words_nonull = (
    words_nonull
    .filter(f.col('word') != 'is')
    .filter(f.length(f.col('word')) > 3)
)
words_nonull.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
+---------+
only showing top 20 rows

+------------+
|        word|
+------------+
|     project|
|   gutenberg|
|       ebook|
|       pride|
|   prejudice|
|        jane|
|      austen|
|        this|
|       ebook|
|      anyone|
|    anywhere|
|        cost|
|        with|
|      almost|
|restrictions|
|  whatsoever|
|        copy|
|        give|
|        away|
|       under|
+------------+
only showing top 20 rows



удалить  is, not, the, if

In [61]:
# Keep the rows whose word is NOT one of the listed words; `~` negates the isin() test.
words_nonull.filter(~f.col('word').isin('is', 'not', 'the', 'if')).show()

+------------+
|        word|
+------------+
|     project|
|   gutenberg|
|       ebook|
|       pride|
|   prejudice|
|        jane|
|      austen|
|        this|
|       ebook|
|      anyone|
|    anywhere|
|        cost|
|        with|
|      almost|
|restrictions|
|  whatsoever|
|        copy|
|        give|
|        away|
|       under|
+------------+
only showing top 20 rows



пофиксить баг

In [64]:
# Exercise as given (intentionally buggy): printSchema() only prints the schema
# and returns None, so `book` is rebound to None and `book.select` fails.
try:
    book = spark.read.text('/content/gdrive/MyDrive/1342-0.txt')
    book = book.printSchema()
    lines = book.select(f.split(book.value, " ").alias("line"))
    words = lines.select(f.explode(f.col("line")).alias("word"))
except (AnalysisException, AttributeError) as err:
    # The failure is an AttributeError on None, not an AnalysisException.
    # Catching both lets the cell show the message instead of aborting a
    # "Run All" with an unhandled traceback.
    print(err)

root
 |-- value: string (nullable = true)



In [None]:
# Fix: printSchema() prints the schema and returns None, so its result must not
# be assigned back to `book`. Call it for its side effect and keep using the
# DataFrame itself.
# No try/except here: the corrected code is not expected to fail, matching the
# other cells that read this file.
book = spark.read.text('/content/gdrive/MyDrive/1342-0.txt')
book.printSchema()
lines = book.select(f.split(book.value, " ").alias("line"))
words = lines.select(f.explode(f.col("line")).alias("word"))

root
 |-- value: string (nullable = true)

