In [3]:
%%capture
!pip install pyspark

Принято импортировать функции как f

In [21]:
import numpy as np
import pyspark.sql.functions as f
import pyspark.sql.types as t

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.utils import AnalysisException
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException

# Colab-only: mount Google Drive at /content/gdrive; the data files read by
# later cells live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
# Obtain the Spark session for this notebook (created on first run) under
# the application name "app".
spark = SparkSession.builder.appName("app").getOrCreate()

# Keep the driver log quiet: only WARN and above.
spark.sparkContext.setLogLevel("WARN")

#### чтение
в параметрах указываем, что у таблицы есть шапка (header), а разделитель имеет вид |
inferSchema позволяет Spark определить типы данных
timestampFormat сообщает, в каком формате записаны столбцы с датами

In [16]:
# Load the pipe-delimited broadcast log sample: the first row is the header,
# column types are inferred, and 'yyyy-MM-dd' is given as the timestamp pattern.
logs = (
    spark.read
    .options(
        sep='|',
        header=True,
        inferSchema=True,
        timestampFormat='yyyy-MM-dd',
    )
    .csv('/content/gdrive/MyDrive/BroadcastLogs_2018_Q3_M8_sample.CSV')
)
logs.printSchema()

root
 |-- BroadcastLogID: integer (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- LogDate: date (nullable = true)
 |-- SequenceNO: integer (nullable = true)
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ProductionSourceID: integer (nullable = true)
 |-- ProgramClassID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: date (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string (nullable 

#### Выбор нескольких столбцов

In [19]:
# Preview the first five rows of three columns without truncating cell values.
(
    logs
    .select("BroadcastLogID", "LogServiceID", "LogDate")
    .show(n=5, truncate=False)
)

+--------------+------------+----------+
|BroadcastLogID|LogServiceID|LogDate   |
+--------------+------------+----------+
|1196192316    |3157        |2018-08-01|
|1196192317    |3157        |2018-08-01|
|1196192318    |3157        |2018-08-01|
|1196192319    |3157        |2018-08-01|
|1196192320    |3157        |2018-08-01|
+--------------+------------+----------+
only showing top 5 rows



несколько способов

In [20]:
# Four equivalent ways to pass columns to select(); only the last expression's
# repr is shown as the cell output.
# NOTE(review): "BroadCastLogID" differs in case from the schema's
# "BroadcastLogID"; it still resolves here, presumably because column name
# resolution is case-insensitive by default — confirm spark.sql.caseSensitive.
# 1. Column names as separate string arguments.
logs.select("BroadCastLogID", "LogServiceID", "LogDate")
# 2. A list of names unpacked with *.
logs.select(*["BroadCastLogID", "LogServiceID", "LogDate"])
# 3. Column objects built with f.col.
logs.select(
    f.col("BroadCastLogID"), f.col("LogServiceID"), f.col("LogDate")
)
# 4. A list of Column objects unpacked with *.
logs.select(
    *[f.col("BroadCastLogID"), f.col("LogServiceID"), f.col("LogDate")]
)

DataFrame[BroadCastLogID: int, LogServiceID: int, LogDate: date]

Можно разбить массив с именами колонок на группы по 3, чтобы вывести, например, вторые 3 колонки
Использование * для распаковки итерируемых объектов в список/кортеж

In [22]:
# Split the column names into consecutive groups of three. Splitting at the
# explicit indices 3, 6, 9, ... keeps every group at size three (only the last
# one may be shorter) even when the column count is not a multiple of three,
# and avoids requesting zero sections when there are fewer than three columns.
column_names = np.array(logs.columns)
column_split = np.array_split(
    column_names, np.arange(3, len(column_names), 3)
)

for x in column_split:
    print(x)

# Unpack the second group so select() receives the names as separate
# arguments (a, b, c) rather than as one array ([a, b, c]).
logs.select(*column_split[1]).show(5, False)

['BroadcastLogID' 'LogServiceID' 'LogDate']
['SequenceNO' 'AudienceTargetAgeID' 'AudienceTargetEthnicID']
['CategoryID' 'ClosedCaptionID' 'CountryOfOriginID']
['DubDramaCreditID' 'EthnicProgramID' 'ProductionSourceID']
['ProgramClassID' 'FilmClassificationID' 'ExhibitionID']
['Duration' 'EndTime' 'LogEntryDate']
['ProductionNO' 'ProgramTitle' 'StartTime']
['Subtitle' 'NetworkAffiliationID' 'SpecialAttentionID']
['BroadcastOriginPointID' 'CompositionID' 'Producer1']
['Producer2' 'Language1' 'Language2']
+----------+-------------------+----------------------+
|SequenceNO|AudienceTargetAgeID|AudienceTargetEthnicID|
+----------+-------------------+----------------------+
|1         |4                  |NULL                  |
|2         |NULL               |NULL                  |
|3         |NULL               |NULL                  |
|4         |NULL               |NULL                  |
|5         |NULL               |NULL                  |
+----------+-------------------+------------

#### Удаление
можно также использовать * для удаления списка колонок

In [23]:
# Remove two columns by unpacking a list of names into drop().
logs = logs.drop(*["BroadcastLogID", "SequenceNO"])

In [25]:
# Confirm both dropped columns are gone: one membership result per line.
print(
    "BroadcastLogID" in logs.columns,
    "SequenceNO" in logs.columns,
    sep="\n",
)

False
False


In [26]:
# Alternative: keep every column except the ones to remove.
logs = logs.select(
    *filter(
        lambda name: name not in ("BroadcastLogID", "SequenceNO"),
        logs.columns,
    )
)

#### Работа с датой
длительность (столбец Duration) в нашем df хранится как строка
из вывода видно, что в ней записаны часы, минуты, секунды и т.д.

In [27]:
# show() prints the rows itself and returns None, so it must not be wrapped
# in display() — that only adds a stray "None" to the cell output.
logs.select(f.col('Duration')).show(5)
# dtypes is a list of (column name, type name) pairs, so display() fits here.
display(logs.select(f.col('Duration')).dtypes)

+----------------+
|        Duration|
+----------------+
|02:00:00.0000000|
|00:00:30.0000000|
|00:00:15.0000000|
|00:00:15.0000000|
|00:00:15.0000000|
+----------------+
only showing top 5 rows



None

[('Duration', 'string')]

извлекаем информацию из даты.
методом substr берем символы, где расположены часы, то есть substr(1, 2), минуты — substr(4, 2), секунды — substr(7, 2)
метод cast преобразует строку в число и вообще меняет тип данных
distinct удалили дубликаты для удобного просмотра

In [30]:
# Slice the "HH:MM:SS..." string by position (start, length 2), cast each
# slice to int, and show a few distinct rows next to the original value.
logs.select(
    f.col("Duration"),
    *[
        f.col("Duration").substr(start, 2).cast("int").alias(part_name)
        for part_name, start in (
            ("dur_hours", 1),
            ("dur_minutes", 4),
            ("dur_seconds", 7),
        )
    ],
).distinct().show(5)

+----------------+---------+-----------+-----------+
|        Duration|dur_hours|dur_minutes|dur_seconds|
+----------------+---------+-----------+-----------+
|00:04:52.0000000|        0|          4|         52|
|00:10:06.0000000|        0|         10|          6|
|00:26:41.0000000|        0|         26|         41|
|00:05:29.0000000|        0|          5|         29|
|00:08:18.0000000|        0|          8|         18|
+----------------+---------+-----------+-----------+
only showing top 5 rows



В Spark доступны арифметические операции со столбцами.
Считаем, сколько секунд всего

In [31]:
# Extract the integer hour / minute / second parts of the duration string,
# then combine them into a single total number of seconds.
dur_hours = f.col('Duration').substr(1, 2).cast('int')
dur_minutes = f.col('Duration').substr(4, 2).cast('int')
dur_seconds = f.col('Duration').substr(7, 2).cast('int')

logs.select(
    f.col('Duration'),
    (dur_hours * 60 * 60 + dur_minutes * 60 + dur_seconds).alias('total_sec'),
).show(5)

+----------------+---------+
|        Duration|total_sec|
+----------------+---------+
|02:00:00.0000000|     7200|
|00:00:30.0000000|       30|
|00:00:15.0000000|       15|
|00:00:15.0000000|       15|
|00:00:15.0000000|       15|
+----------------+---------+
only showing top 5 rows



Более изящный вариант добавления колонки
withColumn('имя', колонка), добавит колонку в конец df
если имя уже существует в df, то pyspark перезапишет столбец

В целом select используем, когда создаем несколько колонок сразу, а withColumn, когда нужно перезаписать либо добавить в конец одну

In [34]:
# Append the total duration in seconds as a new column. The duration string
# is sliced by position: chars 1-2 are hours, 4-5 minutes, 7-8 seconds.
logs = logs.withColumn(
    'Total_sec',
    f.col('Duration').substr(1,2).cast('int')*60*60 +
    f.col('Duration').substr(4,2).cast('int')*60 +
    f.col('Duration').substr(7,2).cast('int')
)
# printSchema's optional argument is a nesting depth, not a limit on how much
# is printed, and it is only accepted by newer PySpark releases; call it
# without arguments, as the other cells in this notebook do.
logs.printSchema()

root
 |-- LogServiceID: integer (nullable = true)
 |-- LogDate: date (nullable = true)
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ProductionSourceID: integer (nullable = true)
 |-- ProgramClassID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: date (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string (nullable = true)
 |-- NetworkAffiliationID: integer (nullable = true)
 |-- SpecialAttentionID: inte

#### Переименование столбцов

In [35]:
# Normalise the new column's name to lower case, then re-check the schema.
logs = logs.withColumnRenamed(existing="Total_sec", new="total_sec")
logs.printSchema()

root
 |-- LogServiceID: integer (nullable = true)
 |-- LogDate: date (nullable = true)
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ProductionSourceID: integer (nullable = true)
 |-- ProgramClassID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: date (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string (nullable = true)
 |-- NetworkAffiliationID: integer (nullable = true)
 |-- SpecialAttentionID: inte

переименовать все столбцы в нижний регистр
toDF принимает список новых имен, который также надо распаковать звездой

In [37]:
# Lower-case every column name; toDF takes the new names as separate arguments.
logs.toDF(*map(str.lower, logs.columns)).printSchema()

root
 |-- logserviceid: integer (nullable = true)
 |-- logdate: date (nullable = true)
 |-- audiencetargetageid: integer (nullable = true)
 |-- audiencetargetethnicid: integer (nullable = true)
 |-- categoryid: integer (nullable = true)
 |-- closedcaptionid: integer (nullable = true)
 |-- countryoforiginid: integer (nullable = true)
 |-- dubdramacreditid: integer (nullable = true)
 |-- ethnicprogramid: integer (nullable = true)
 |-- productionsourceid: integer (nullable = true)
 |-- programclassid: integer (nullable = true)
 |-- filmclassificationid: integer (nullable = true)
 |-- exhibitionid: integer (nullable = true)
 |-- duration: string (nullable = true)
 |-- endtime: string (nullable = true)
 |-- logentrydate: date (nullable = true)
 |-- productionno: string (nullable = true)
 |-- programtitle: string (nullable = true)
 |-- starttime: string (nullable = true)
 |-- subtitle: string (nullable = true)
 |-- networkaffiliationid: integer (nullable = true)
 |-- specialattentionid: inte

#### Порядок
изменение порядка колонок, чтобы было по алфавиту

In [38]:
# Reorder the columns alphabetically by name and show the resulting schema.
logs.select(*sorted(logs.columns)).printSchema()

root
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- BroadcastOriginPointID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CompositionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- Language1: integer (nullable = true)
 |-- Language2: integer (nullable = true)
 |-- LogDate: date (nullable = true)
 |-- LogEntryDate: date (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- NetworkAffiliationID: integer (nullable = true)
 |-- Producer1: string (nullable = true)
 |-- Producer2: string (nullable = true)
 |-- ProductionNO: string (nu

#### describe
метод пришёл из pandas.
Неподходящие типы он игнорирует
count показывает количество непустых (не NULL) значений в столбце

In [40]:
# Describe each of the first three columns separately: one table per column.
for column in logs.columns[:3]:
    logs.describe([column]).show()

+-------+------------------+
|summary|      LogServiceID|
+-------+------------------+
|  count|            238945|
|   mean| 3450.890284375065|
| stddev|199.50673962555592|
|    min|              3157|
|    max|              3925|
+-------+------------------+

+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

+-------+-------------------+
|summary|AudienceTargetAgeID|
+-------+-------------------+
|  count|              16112|
|   mean| 3.4929245283018866|
| stddev| 1.0415963394745125|
|    min|                  1|
|    max|                  4|
+-------+-------------------+



In [41]:
# Describe the first three columns together in a single table.
logs.describe(logs.columns[:3]).show()

+-------+------------------+-------------------+
|summary|      LogServiceID|AudienceTargetAgeID|
+-------+------------------+-------------------+
|  count|            238945|              16112|
|   mean| 3450.890284375065| 3.4929245283018866|
| stddev|199.50673962555592| 1.0415963394745125|
|    min|              3157|                  1|
|    max|              3925|                  4|
+-------+------------------+-------------------+



summary - расширенный метод с квантилями
параметры summary можно регулировать

In [42]:
# show() already prints the table and returns None; wrapping it in print()
# only appended a stray "None" after each table.
# Default summary statistics for the first three columns.
logs.select(logs.columns[:3]).summary().show()
# Only the requested statistics: min, 10th and 90th percentiles, max.
logs.select(logs.columns[:3]).summary("min", "10%", "90%", "max").show()

+-------+------------------+-------------------+
|summary|      LogServiceID|AudienceTargetAgeID|
+-------+------------------+-------------------+
|  count|            238945|              16112|
|   mean| 3450.890284375065| 3.4929245283018866|
| stddev|199.50673962555592| 1.0415963394745125|
|    min|              3157|                  1|
|    25%|              3287|                  4|
|    50%|              3379|                  4|
|    75%|              3627|                  4|
|    max|              3925|                  4|
+-------+------------------+-------------------+

None
+-------+------------+-------------------+
|summary|LogServiceID|AudienceTargetAgeID|
+-------+------------+-------------------+
|    min|        3157|                  1|
|    10%|        3236|                  1|
|    90%|        3709|                  4|
|    max|        3925|                  4|
+-------+------------+-------------------+

None


#### Задание  
прочитать файл  
Чтобы узнать, какие аргументы за что отвечают в методе spark.read.csv, требуется найти этот метод в документации, потом зайти в исходный код, и там будет написана фраза:  

Other Parameters  
Extra options
For the extra options, refer to
`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_
for the version you use.

.. # noqa


In [48]:
# Read the sample file, treating its first row as the header, and print it.
(
    spark.read
    .option("header", True)
    .csv('/content/gdrive/MyDrive/sample.csv')
    .show()
)

+----------+
|old_column|
+----------+
|         1|
|         4|
|         4|
|         5|
|         7|
|         7|
|         7|
|        10|
|        14|
|         1|
|         4|
|         8|
+----------+



Выберите только те колонки, которые не заканчиваются на ID

In [54]:
# Keep only the columns whose names do not end in "ID".
logs.select(*[name for name in logs.columns if not name.endswith('ID')])

DataFrame[LogDate: date, Duration: string, EndTime: string, LogEntryDate: date, ProductionNO: string, ProgramTitle: string, StartTime: string, Subtitle: string, Producer1: string, Producer2: string, Language1: int, Language2: int, total_sec: int]