In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, abs
import os
import requests
import pandas as pd

In [2]:
# Build (or reuse, via getOrCreate) the Spark session used by every cell below,
# with Hive support enabled.
spark = (
    SparkSession.builder
    .appName('Entrega Fabio')
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Bare expression: renders the session object as this cell's output.
spark

In [4]:
# Raw GitHub location of the match-results CSV.
# NOTE(review): the output recorded for this cell is "HTTP Error 404: Not Found",
# so this URL did not resolve on the last run — confirm the current path
# before relying on Restart & Run All.
url = "https://raw.githubusercontent.com/joaopmts/Data-Engineering/main/Distributed-Data-Processing-and-Storage/REPO/results%20(1).csv"

# Read the CSV with pandas first; the frame is converted to Spark below.
df = pd.read_csv(url)

# Turn the pandas frame into a Spark DataFrame and print a preview of it.
results = spark.createDataFrame(df)
results.show()

HTTPError: HTTP Error 404: Not Found

In [5]:
# Print only the first 2 rows to check the column names and sample values.
results.show(2)

+----------+-------------+-------------+--------------+--------------+--------------+--------+--------------+-----------+
|      date|home_teamName|away_teamName|home_scoreHome|away_scoreAway|tournamentName|cityCity|countryCountry|neutralTRUE|
+----------+-------------+-------------+--------------+--------------+--------------+--------+--------------+-----------+
|1872-11-30|     Scotland|      England|             0|             0|      Friendly| Glasgow|      Scotland|      false|
|1873-03-08|      England|     Scotland|             4|             2|      Friendly|  London|       England|      false|
+----------+-------------+-------------+--------------+--------------+--------------+--------+--------------+-----------+
only showing top 2 rows



In [6]:
# Drop the 'neutralTRUE' column and cast both score columns to integers.
# The column is referenced by its exact name: DataFrame.drop() silently does
# nothing when no column matches, so the previous spelling ('neutralTrue')
# only worked through Spark's default case-insensitive column resolution and
# would have left the column in place with spark.sql.caseSensitive=true.
results = (
    results
    .drop('neutralTRUE')
    .withColumn('home_scoreHome', col('home_scoreHome').cast('int'))
    .withColumn('away_scoreAway', col('away_scoreAway').cast('int'))
)

In [7]:
# Print the schema to confirm the column types after the drop/cast step.
results.printSchema()

root
 |-- date: string (nullable = true)
 |-- home_teamName: string (nullable = true)
 |-- away_teamName: string (nullable = true)
 |-- home_scoreHome: integer (nullable = true)
 |-- away_scoreAway: integer (nullable = true)
 |-- tournamentName: string (nullable = true)
 |-- cityCity: string (nullable = true)
 |-- countryCountry: string (nullable = true)



In [8]:
# 1 - Quantos registros existem na base?

In [9]:
# Question 1: total number of rows (matches) in the dataset.
results.count()

40839

In [10]:
# 2 - Quantas equipes únicas mandantes existem na base?

In [11]:
# Question 2: number of distinct values in the home-team column.
results.select('home_teamName').distinct().count()

309

In [12]:
# 3 - Quantas vezes as equipes mandantes saíram vitoriosas?

In [13]:
# Question 3: matches where the home side scored more than the away side.
results.where(col('home_scoreHome') > col('away_scoreAway')).count()

19864

In [14]:
# 4 - Quantas vezes as equipes visitantes saíram vitoriosas?

In [15]:
# Question 4: matches where the away side scored more than the home side.
results.where(col('home_scoreHome') < col('away_scoreAway')).count()

11544

In [16]:
# 5 - Quantas partidas resultaram em empate?

In [17]:
# Question 5: matches where both sides scored the same number of goals.
results.where(col('home_scoreHome') == col('away_scoreAway')).count()

9431

In [18]:
# 6 - Quantas partidas foram realizadas em cada país?

In [19]:
# Question 6: number of matches per host country, busiest countries first.
(
    results
    .groupBy('countryCountry')
    .count()
    .orderBy(col('count').desc())
    .show()
)

+--------------------+-----+
|      countryCountry|count|
+--------------------+-----+
|       United States| 1144|
|              France|  801|
|             England|  687|
|            Malaysia|  644|
|              Sweden|  637|
|             Germany|  581|
|              Brazil|  529|
|               Spain|  517|
|            Thailand|  483|
|               Italy|  480|
|         Switzerland|  477|
|             Austria|  475|
|United Arab Emirates|  472|
|        South Africa|  470|
|               Qatar|  467|
|         South Korea|  453|
|           Argentina|  449|
|             Hungary|  431|
|               Chile|  405|
|             Belgium|  396|
+--------------------+-----+
only showing top 20 rows



In [20]:
# 7 - Qual país teve mais partidas? - Considerei o país do time, não o país da localidade

In [21]:
# Question 7: total matches per team, counting home and away appearances.

# Matches each team played as the home side.
df1 = results.groupBy(['home_teamName']).count().withColumnRenamed("count", "jogos_casa")
# Matches each team played as the away side.
df2 = results.groupBy(['away_teamName']).count().withColumnRenamed("count", "jogos_fora")

# Outer join so teams that appear on only one side (home-only or away-only)
# are kept in the result.
dfjoin = df1.join(df2, df1.home_teamName == df2.away_teamName, how='outer')

# After an outer join the missing side is NULL. Take the team name from
# whichever side is present and treat a missing count as 0; otherwise
# NULL + n evaluates to NULL and one-sided teams lose their name and total.
dfjoin = dfjoin.selectExpr(
    'coalesce(home_teamName, away_teamName) AS time',
    'coalesce(jogos_casa, 0) + coalesce(jogos_fora, 0) AS total_partidas',
)

# Show only the team with the most matches.
dfjoin.orderBy('total_partidas', ascending=False).show(1)

+------+--------------+
|  time|total_partidas|
+------+--------------+
|Sweden|          1010|
+------+--------------+
only showing top 1 row



In [22]:
# 8 - Qual a partida com maior número de gols?

In [23]:
# Question 8: match with the highest combined score (home goals + away goals);
# sort descending and print only the top row.
(
    results
    .withColumn('total_gols', col('home_scoreHome') + col('away_scoreAway'))
    .orderBy(col('total_gols').desc())
    .show(1)
)

+----------+-------------+--------------+--------------+--------------+--------------------+-------------+--------------+----------+
|      date|home_teamName| away_teamName|home_scoreHome|away_scoreAway|      tournamentName|     cityCity|countryCountry|total_gols|
+----------+-------------+--------------+--------------+--------------+--------------------+-------------+--------------+----------+
|2001-04-11|    Australia|American Samoa|            31|             0|FIFA World Cup qu...|Coffs Harbour|     Australia|        31|
+----------+-------------+--------------+--------------+--------------+--------------------+-------------+--------------+----------+
only showing top 1 row



In [24]:
# 9 - Qual a maior goleada?

In [25]:
# Question 9: match with the largest goal difference. `abs` is the
# pyspark.sql.functions column function imported at the top of the notebook,
# not the Python builtin.
(
    results
    .withColumn('diff', abs(col('home_scoreHome') - col('away_scoreAway')))
    .orderBy(col('diff').desc())
    .show(1)
)

+----------+-------------+--------------+--------------+--------------+--------------------+-------------+--------------+----+
|      date|home_teamName| away_teamName|home_scoreHome|away_scoreAway|      tournamentName|     cityCity|countryCountry|diff|
+----------+-------------+--------------+--------------+--------------+--------------------+-------------+--------------+----+
|2001-04-11|    Australia|American Samoa|            31|             0|FIFA World Cup qu...|Coffs Harbour|     Australia|  31|
+----------+-------------+--------------+--------------+--------------+--------------------+-------------+--------------+----+
only showing top 1 row



In [26]:
# 10 - Quantos jogos ocorreram no Brasil?

In [27]:
# Question 10: number of matches whose host country is Brazil, printed as a
# one-row table keyed by country.
(
    results
    .filter(results.countryCountry == 'Brazil')
    .groupBy('countryCountry')
    .count()
    .show()
)

+--------------+-----+
|countryCountry|count|
+--------------+-----+
|        Brazil|  529|
+--------------+-----+

