In [0]:
from pyspark.sql.functions import to_date, to_timestamp, desc, when, col, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType, DateType
import urllib.request
import tempfile

In [0]:
def read_parquet(pUrl):
    url = pUrl
    response = urllib.request.urlopen(url)
    data = response.read()
    temp_file_path = '/tmp/temp_OriginaisNetflix.parquet'
    with open(temp_file_path, 'wb') as temp_file:
        temp_file.write(data)
    df = spark.read.parquet("file://" + temp_file_path)
    return df

In [0]:
df_OriginaisNetflix = read_parquet('https://drive.google.com/u/0/uc?id=1mldklSPbqL093nHjlJsvI7d1-7Ffn6pb&export=download')

In [0]:
premiere_format = "d-MMM-yy"
dt_inclusao_format = "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"

df_OriginaisNetflix = (df_OriginaisNetflix.withColumn("Premiere", to_date(df_OriginaisNetflix["Premiere"], premiere_format))
                                          .withColumn("dt_inclusao", to_timestamp(df_OriginaisNetflix["dt_inclusao"],dt_inclusao_format)))

In [0]:
df_OriginaisNetflix = df_OriginaisNetflix.orderBy(desc("Active"), desc("Genre"))

In [0]:
df_OriginaisNetflix = df_OriginaisNetflix.dropDuplicates()
df_OriginaisNetflix = df_OriginaisNetflix.withColumn("Seasons", when(col("Seasons") == "TBA", "a ser anunciado").otherwise(col("Seasons")))

In [0]:
df_OriginaisNetflix = df_OriginaisNetflix.withColumn("Data de Alteração", current_timestamp())

In [0]:
colunas = {
    "Title": "Título",
    "Genre": "Gênero",
    "GenreLabels": "RótulosDeGênero",
    "Premiere": "PréEstreia",
    "Seasons": "Estações",
    "SeasonsParsed": "TemporadasAnalisadas",
    "EpisodesParsed": "EpisódiosAnalisados",
    "Length": "Comprimento",
    "MinLength": "ComprimentoMínimo",
    "MaxLength": "ComprimentoMáximo",
    "Status": "Status",
    "Active": "Ativo",
    "Table": "Tabela",
    "Language": "Linguagem"
}

df_OriginaisNetflix = df_OriginaisNetflix.toDF(*[colunas.get(c, c) for c in df_OriginaisNetflix.columns])

In [0]:
df_OriginaisNetflix_Csv = df_OriginaisNetflix.select('Título', 'Gênero', 'Estações', 'PréEstreia', 'Linguagem', 'Ativo', 'Status', 'dt_inclusao', 'Data de Alteração')

In [0]:
df_OriginaisNetflix_Csv.write.option("delimiter", ";").mode("overwrite").csv('/FileStore/Confitec.csv')

In [0]:
schema = StructType([
    StructField('Título', StringType(), nullable=False),
    StructField('Gênero', StringType(), nullable=False),
    StructField('Estações', StringType(), nullable=True),
    StructField('PréEstreia', DateType(), nullable=True),
    StructField('Linguagem', StringType(), nullable=True),
    StructField('Ativo', LongType(), nullable=True),
    StructField('Status', StringType(), nullable=True),
    StructField('dt_inclusao', TimestampType(), nullable=True),
    StructField('Data de Alteração', TimestampType(), nullable=True)
])

In [0]:
df = spark.read.csv('/FileStore/Confitec.csv', header=True, schema=schema)
df_OriginaisNetflix_Csv.write.option("delimiter", ";").mode("overwrite").csv('/FileStore/Confitec.csv')
df_validacao_csv = spark.read.option("delimiter", ";").csv('/FileStore/Confitec.csv', header=True, schema=schema)

In [0]:
display(df_validacao_csv)

Título,Gênero,Estações,PréEstreia,Linguagem,Ativo,Status,dt_inclusao,Data de Alteração
Kulipari: Dream Walker,childrens-animation,"1 season, 10 episodes",2018-11-20,English,1,Pending,2021-03-17T00:20:24.167+0000,2023-07-01T17:08:14.823+0000
Jimmy: The True Story of a True Idiot,Comedy,"1 season, 9 episodes",2018-07-20,Japanese,1,Pending,2021-03-17T00:20:24.167+0000,2023-07-01T17:08:14.823+0000
Cable Girls,Period drama,"3 seasons, 24 episodes",2017-04-28,Spanish,1,Renewed,2021-03-17T00:20:24.167+0000,2023-07-01T17:08:14.823+0000
The Get Down,Musical drama,"2 parts, 11 episodes",2016-08-12,English,0,Ended,2021-03-17T00:20:24.167+0000,2023-07-01T17:08:14.823+0000
Nailed It! Mexico,Reality,"1 season, 6 episodes",2019-02-08,English,1,Pending,2021-03-17T00:20:24.167+0000,2023-07-01T17:08:14.823+0000
Fuller House,Sitcom,"4 seasons, 57 episodes",2016-02-26,English,1,Renewed,2021-03-17T00:20:24.167+0000,2023-07-01T17:08:14.823+0000
Friends from College,Comedy,"2 seasons, 16 episodes",2017-07-14,English,0,Ended,2021-03-17T00:20:24.167+0000,2023-07-01T17:08:14.823+0000
Trailer Park Boys Out of the Park: Europe,Mockumentary,"1 season, 8 episodes",2016-10-28,English,0,Ended,2021-03-17T00:20:24.167+0000,2023-07-01T17:08:14.823+0000
Brainchild,Educational,"1 season, 13 episodes",2018-11-02,English,1,Pending,2021-03-17T00:20:24.167+0000,2023-07-01T17:08:14.823+0000
Selling Sunset,Reality,"1 season, 8 episodes",2019-03-22,English,1,Pending,2021-03-17T00:20:24.167+0000,2023-07-01T17:08:14.823+0000


## Processo para gravar o CSV no Bucket s3

In [0]:
"""
import boto3
import botocore.exceptions

bucket_name = 'nome-do-bucket'
file_name = 'nome-do-arquivo.csv'
s3_path = f's3a://{bucket_name}/{file_name}'

df.write.option("header", "true").mode("overwrite").csv(s3_path)
"""

Out[160]: '\nimport boto3\nimport botocore.exceptions\n\nbucket_name = \'nome-do-bucket\'\nfile_name = \'nome-do-arquivo.csv\'\ns3_path = f\'s3a://{bucket_name}/{file_name}\'\n\ndf.write.option("header", "true").mode("overwrite").csv(s3_path)\n'