# Chargement de Spark

In [1]:
from pyspark.sql import SparkSession
# Reuse the active SparkSession if one already exists; otherwise build a new
# one with the default configuration.
spark = SparkSession.builder.getOrCreate()

## Lecture des données

In [2]:
# Load the JSON dataset (path is relative to the notebook's working
# directory). No schema is supplied, so Spark infers one from the data.
df = spark.read.json('../data.json')
# Preview a handful of rows only: collect() would pull every record to the
# driver and dump all of them into the cell output.
df.take(5)

[Row(author='CBS Minnesota', category=['regional', 'detroit'], description="&&&&&&&&&&(CBS)-\xa0Super Bowl Greatest Commercials returns for it's landmark 20th anniversary tonight at 8:00PM ET/PT on CBS and streaming on CBS All Access.\nCBS' Matt Weiss spoke with Executive Producer Bob Horowitz to discuss the special, this year's commercials and his all-time favorite spot.\nMW: H...", id='e6bfa6e5-7ed0-4114-bbff-0544e0223457', image='https://minnesota.cbslocal.com/wp-content/uploads/sites/15909630/2021/02/SBG.jpg?w=1449', language='en', published='2021-02-03 21:00:31 +0000', title="Bob Horowitz On What Makes The Perfect Super Bowl Commercial: 'Tugs On The Heartstrings'", url='https://minnesota.cbslocal.com/2021/02/03/bob-horowitz-super-bowl-commercial-special/'),
 Row(author='CBS Minnesota', category=['regional', 'detroit'], description="&&&&&&&&&&CHICAGO (CBS) — A grieving mother reached out to WBBM-TV in Chicago a year after her daughter unexpectedly passed away in North Carolina.\xa0

In [3]:
# Print the schema of the loaded DataFrame (column names, types, nullability).
df.printSchema()

root
 |-- author: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: string (nullable = true)
 |-- id: string (nullable = true)
 |-- image: string (nullable = true)
 |-- language: string (nullable = true)
 |-- published: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [4]:
from pyspark.sql.functions import udf
from pyspark.sql.types import TimestampType, StringType

@udf(returnType=TimestampType())
def transform_timestamp_in_date(timestamp):
    """Parse a ``published`` string into a timezone-aware ``datetime``.

    Args:
        timestamp: Text in the form ``"2021-02-03 21:00:31 +0000"``
            (``%Y-%m-%d %H:%M:%S %z``), or ``None``.

    Returns:
        The parsed ``datetime``, or ``None`` when the input is ``None``.

    Raises:
        ValueError: Propagated from ``strptime`` when a non-null value does
            not match the expected format.
    """
    # Local import keeps the UDF body self-contained.
    from datetime import datetime

    # The column is nullable; strptime(None, ...) would raise TypeError and
    # abort the whole job, so pass nulls through unchanged.
    if timestamp is None:
        return None
    return datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S %z")

# Overwrite the string "published" column with its parsed timestamp, then
# display the records one field per line.
published_df = df.withColumn(
    "published",
    transform_timestamp_in_date("published"),
)
published_df.show(vertical=True)

-RECORD 0---------------------------
 author      | CBS Minnesota        
 category    | [regional, detroit]  
 description | &&&&&&&&&&(CBS)- ... 
 id          | e6bfa6e5-7ed0-411... 
 image       | https://minnesota... 
 language    | en                   
 published   | 2021-02-03 21:00:31  
 title       | Bob Horowitz On W... 
 url         | https://minnesota... 
-RECORD 1---------------------------
 author      | CBS Minnesota        
 category    | [regional, detroit]  
 description | &&&&&&&&&&CHICAGO... 
 id          | 8ff0d113-10d3-400... 
 image       | https://minnesota... 
 language    | en                   
 published   | 2021-02-03 21:52:26  
 title       | A Year After Daug... 
 url         | https://minnesota... 
-RECORD 2---------------------------
 author      | WCCO-TV              
 category    | [regional, detroit]  
 description | MINNEAPOLIS (WCCO... 
 id          | 3d85abe3-f54f-4f2... 
 image       | https://minnesota... 
 language    | en                   
 

In [5]:
import re

@udf(returnType=StringType())
def url_to_origin(url):
    """Extract the host name (origin) from an absolute URL.

    Args:
        url: An absolute URL such as
            ``"https://minnesota.cbslocal.com/2021/02/03/..."``, or ``None``.

    Returns:
        The host part of the URL (e.g. ``"minnesota.cbslocal.com"``),
        lower-cased and without port or credentials. ``None`` is returned
        when ``url`` is null/empty, has no host component, or cannot be
        parsed.
    """
    # Local import keeps the UDF body self-contained.
    from urllib.parse import urlsplit

    # Null or empty values have no origin; avoid raising inside the UDF.
    if not url:
        return None
    try:
        # A real URL parser replaces the previous "second regex token"
        # heuristic, which raised IndexError on short inputs and returned a
        # path segment for URLs without a scheme.
        return urlsplit(url).hostname
    except ValueError:
        # urlsplit rejects some malformed inputs (e.g. unbalanced IPv6
        # brackets); treat those rows as having no origin.
        return None

# Add an "origin" column derived from each row's URL.
origin_df = df.withColumn("origin", url_to_origin("url"))

# createOrReplaceTempView (re)registers the view in a single call, so the
# cell stays re-runnable without a separate dropTempView step.
origin_df.createOrReplaceTempView("origin_table")
# The set of distinct origins is small, so collecting it is safe.
spark.sql("SELECT DISTINCT(origin) FROM origin_table").collect()

[Row(origin='minnesota.cbslocal.com'), Row(origin='www.koco.com')]

In [6]:
@udf(returnType=StringType())
def delete_unicode_char(description):
    """Strip newlines and every non-ASCII character from a description.

    Args:
        description: The article description text, or ``None``.

    Returns:
        The text with ``"\\n"`` removed and all non-ASCII characters
        dropped, or ``None`` when the input is ``None``.
    """
    # The column is nullable; calling .replace on None would raise
    # AttributeError and abort the job, so pass nulls through unchanged.
    if description is None:
        return None
    without_newlines = description.replace("\n", "")
    # encode(..., "ignore") silently drops anything outside ASCII; the
    # remaining bytes are pure ASCII, so decoding cannot fail.
    return without_newlines.encode("ascii", "ignore").decode("ascii")
    
# Replace "description" with its cleaned, ASCII-only version.
unicode_desc_df = df.withColumn("description", delete_unicode_char("description"))
# Preview a few cleaned descriptions only; collect() would bring the whole
# column to the driver and flood the cell output.
unicode_desc_df.select('description').take(5)

[Row(description="&&&&&&&&&&(CBS)-Super Bowl Greatest Commercials returns for it's landmark 20th anniversary tonight at 8:00PM ET/PT on CBS and streaming on CBS All Access.CBS' Matt Weiss spoke with Executive Producer Bob Horowitz to discuss the special, this year's commercials and his all-time favorite spot.MW: H..."),
 Row(description="&&&&&&&&&&CHICAGO (CBS)  A grieving mother reached out to WBBM-TV in Chicago a year after her daughter unexpectedly passed away in North Carolina.Debbie Heater has subpoenaed documents, and called local law enforcement in North Carolina, but still can't get clear answers about how her daughter's d..."),
 Row(description="MINNEAPOLIS (WCCO)  To add a little fun to your Super Bowl Sunday, we've created some bingo cards to play with your household or Zoom party.Before you watch the game on CBS, download the bingo cards below. And don't worry if you're not a football fanatic  the cards are easy to follow and feature ..."),
 Row(description='&&&&&&&&&&MINNE