# TRABALHO PRÁTICO - Spark

In [1]:
# Primeiro: necessário executar o SPARK via CMD (Anaconda - IGTI) - usar o comando "pyspark"

In [2]:
# Import findspark and call init() so the Spark installation (run via Conda) can be located
import findspark
findspark.init()

In [3]:
# Importa a biblioteca para criar a sessão spark
from pyspark.sql import SparkSession

In [4]:
# Obtain the Spark session used for all data work below
# (getOrCreate reuses an already-running session when there is one).
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [5]:
# Print the session object to confirm it was created
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000024A95190C50>


# INICIANDO O TRABALHO PRÁTICO

In [6]:
# Imports for DataFrame manipulation (column functions, window specs, SQL types)
import pyspark.sql.functions as f
from pyspark.sql import Window
from pyspark.sql.types import *

In [7]:
# Load the movie-titles table from its tab-separated file; the first row holds the column names.
# No schema/inferSchema is passed, so columns are read as strings.
# NOTE(review): the file's '\N' placeholders are kept as literal strings (see the
# endYear column and the '\N' genre in the outputs below) - confirm this is intended.
df_titles = spark.read.csv(
    'title_basics.tsv',
    sep='\t',
    header=True,
)

In [8]:
# Load the movie-ratings table from its tab-separated file; the first row holds the column names.
# No schema/inferSchema is passed, so columns are read as strings.
df_ratings = spark.read.csv(
    'title_ratings.tsv',
    sep='\t',
    header=True,
)

In [9]:
# Preview the first 10 rows of the titles table
df_titles.show(n=10)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [10]:
# Preview the first 10 rows of the ratings table
df_ratings.show(n=10)

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1809|
|tt0000002|          6.0|     233|
|tt0000003|          6.5|    1560|
|tt0000004|          6.1|     152|
|tt0000005|          6.2|    2383|
|tt0000006|          5.1|     157|
|tt0000007|          5.4|     746|
|tt0000008|          5.5|    1965|
|tt0000009|          5.8|     189|
|tt0000010|          6.9|    6530|
+---------+-------------+--------+
only showing top 10 rows



In [11]:
# QUESTION 5
# Count titles with startYear '2015' whose titleType is 'movie' or 'tvMovie'.
(
    df_titles
    .where(
        (f.col('startYear') == '2015')
        & ((f.col('titleType') == 'movie') | (f.col('titleType') == 'tvMovie'))
    )
    .count()
)

19987

In [12]:
# QUESTION 5 - an alternative way to get the same count
# Same predicate as above, expressed with isin() and two chained filters.
(
    df_titles
    .where(f.col('startYear') == '2015')
    .where(f.col('titleType').isin(['movie', 'tvMovie']))
    .count()
)

19987

In [84]:
# QUESTION 6
# Number of 'movie' titles per genre, most frequent first.
# The comma-separated 'genres' string is split and exploded so that a title
# with several genres contributes one row to each of them.
# NOTE(review): the '\N' placeholder is counted as if it were a genre (see output).
df_titles_subset = (
    df_titles
    .where(f.col('titleType') == 'movie')
    .select(f.explode(f.split(f.col('genres'), ',')).alias('genres_explode'))
    .groupBy('genres_explode')
    .count()
    .orderBy(f.desc('count'))
    .toPandas()
)
df_titles_subset

Unnamed: 0,genres_explode,count
0,Drama,210839
1,Documentary,107661
2,Comedy,100023
3,\N,70612
4,Action,50221
5,Romance,43871
6,Thriller,39222
7,Crime,33810
8,Horror,31968
9,Adventure,26528


In [88]:
# QUESTION 11
# For titles whose startYear is 2018: how many titles carry each genre, and what
# fraction of all 2018 titles that represents ('percentual').
# n_titles is counted BEFORE the genres are exploded, so it is the number of
# titles (rows), not of (title, genre) pairs; the fractions can therefore sum
# to more than 1 when titles have several genres.
(
    df_titles
    # Filter first: the window is partitioned by startYear, so n_titles for 2018
    # is unchanged, but the window no longer has to be evaluated for every other year.
    # String comparison keeps this consistent with the Question 5 cells and avoids
    # an implicit string-to-integer cast of startYear (which also holds '\N').
    .filter(f.col('startYear') == '2018')
    .withColumn('n_titles', f.count(f.lit(1)).over(Window.partitionBy('startYear')))
    .withColumn('genres', f.explode(f.split('genres', ',')))
    .groupBy('startYear', 'genres', 'n_titles')
    .count()
    .withColumn('percentual', f.col('count') / f.col('n_titles'))
    .toPandas()
)

Unnamed: 0,startYear,genres,n_titles,count,percentual
0,2018,Drama,402244,89367,0.222171
1,2018,Comedy,402244,78809,0.195923
2,2018,\N,402244,41740,0.103768
3,2018,Documentary,402244,37840,0.094072
4,2018,Short,402244,66482,0.165278
5,2018,Action,402244,15575,0.03872
6,2018,Family,402244,16997,0.042255
7,2018,Sci-Fi,402244,5644,0.014031
8,2018,Thriller,402244,8154,0.020271
9,2018,Romance,402244,25002,0.062156
