In [13]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('egd').getOrCreate()

In [14]:
import pyspark.sql.functions as F
import os
from pyspark.sql.functions import col, lit

singles=spark.read.option("header","true").option("inferSchema","true") \
.csv('singles')

singles = singles.withColumn(
    "Origem",  # Replace with your desired column name
    lit('Singles')  # Use lit() to create a literal value expression
)

doubles=spark.read.option("header","true").option("inferSchema","true") \
.csv('doubles')

doubles = doubles.withColumn(
    "Origem",  # Replace with your desired column name
    lit('Doubles')  # Use lit() to create a literal value expression
)

futures=spark.read.option("header","true").option("inferSchema","true") \
.csv('futures')

futures = futures.withColumn(
    "Origem",  # Replace with your desired column name
    lit('Futures')  # Use lit() to create a literal value expression
)

qual_chal = spark.read.option("header","true").option("inferSchema","true") \
.csv('qual_chal')

qual_chal = qual_chal.withColumn(
    "Origem",  # Replace with your desired column name
    lit('Qualificacao_Challenger')  # Use lit() to create a literal value expression
)

amat = spark.read.option("header","true").option("inferSchema","true") \
.csv('atp_matches_amateur.csv')

amat = amat.withColumn(
    "Origem",  # Replace with your desired column name
    lit('Amateur')  # Use lit() to create a literal value expression
)

In [15]:
matches_completos = singles.unionAll(futures)
matches_completos = matches_completos.unionAll(qual_chal)
matches_completos=matches_completos.unionAll(amat)

In [16]:
colunas_unicas_doubles = list(set(doubles.columns) - set(matches_completos.columns))

for a in colunas_unicas_doubles:
    matches_completos = matches_completos.withColumn(
    a,  # Replace with your desired column name
    lit(None)  # Use lit() to create a literal value expression
)

In [17]:
colunas_unicas_matches_completos = list(set(matches_completos.columns) - set(doubles.columns))

for a in colunas_unicas_matches_completos:
    doubles = doubles.withColumn(
    a,  # Replace with your desired column name
    lit(None)  # Use lit() to create a literal value expression
)

In [18]:
matches_completos=matches_completos.unionAll(doubles)

In [19]:
len(matches_completos.columns)

82

In [20]:
matches_completos.select('tourney_id').show()
matches=matches_completos

+----------+
|tourney_id|
+----------+
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
|  1994-339|
+----------+
only showing top 20 rows



In [21]:
#END HELENA,START LARA

In [22]:
from pyspark.sql.functions import col
# List all columns with the type 'void' and Convert to integer
void_columns = [col_name for col_name, data_type in matches.dtypes if data_type == 'void']
for col_name in void_columns:
    matches = matches.withColumn(col_name, matches[col_name].cast("int"))

In [23]:
# Convert to integers
columns_to_convert = ["draw_size", "loser_id","loser_seed", "best_of", "minutes", "winner_seed",
    "w_ace", "w_df", "w_svpt", "w_1stIn", "w_1stWon", "w_2ndWon", "w_SvGms", "w_bpSaved", "w_bpFaced",
    "l_ace", "l_df", "l_svpt", "l_1stIn", "l_1stWon", "l_2ndWon", "l_SvGms", "l_bpSaved", "l_bpFaced",
    "winner_rank", "loser_rank", "winner_rank_points", "loser_rank_points", "winner2_hand"
]
for column in columns_to_convert:
    matches = matches.withColumn(column, col(column).cast("integer"))

In [24]:
from pyspark.sql.functions import to_date
matches = matches.withColumn("tourney_date", to_date(matches["tourney_date"].cast("string"), "yyyyMMdd"))
matches = matches.withColumn("winner_age", matches["winner_age"].cast("double"))

In [38]:
from pyspark.sql.functions import split, when, col
from pyspark.sql.types import MapType, StringType

# Define a função para extrair os valores dos sets
def extrair_sets(scores):
    sets = scores.split(' ')  
    set_values = {}  # Dicionário para armazenar os valores de cada set

    for i, set_score in enumerate(sets):
        try:
            set_values[f"set_{i+1}"] = int(set_score)
        except ValueError:
            set_values[f"set_{i+1}"] = set_score

    return set_values

extrair_sets_udf = F.udf(extrair_sets, MapType(StringType(), StringType()))

# nova coluna com os valores dos sets 
matches = matches.withColumn("sets", extrair_sets_udf(col("score")))
for i in range(1, 6):  
    matches = matches.withColumn(f"set_{i}", when(col("sets").getItem(f"set_{i}").isNull(), None).otherwise(col("sets").getItem(f"set_{i}")))
# Remove a coluna temporária "sets"
matches = matches.drop("sets")

In [39]:
# Selecionar as colunas set_1 até set_5 e exibir as primeiras 5 linhas
matches.select("set_1", "set_2", "set_3", "set_4", "set_5").show(1)


+-----+-----+-----+-----+-----+
|set_1|set_2|set_3|set_4|set_5|
+-----+-----+-----+-----+-----+
|  6-2|  6-2| NULL| NULL| NULL|
+-----+-----+-----+-----+-----+
only showing top 1 row



In [40]:
# Filtrar as linhas onde o valor da coluna "set_5" não é nulo e exibir os valores dessa coluna
matches.filter(matches["set_5"].isNotNull()).select("set_5").show(10)

+-----+
|set_5|
+-----+
|  6-1|
|  8-6|
|  6-2|
|  6-4|
|  6-2|
|  8-6|
|  6-3|
|  RET|
|  6-2|
|  6-4|
+-----+
only showing top 10 rows



In [26]:
matches.write.csv('all.csv', header=True)


In [35]:
matches.printSchema()

root
 |-- tourney_id: string (nullable = true)
 |-- tourney_name: string (nullable = true)
 |-- surface: string (nullable = true)
 |-- draw_size: integer (nullable = true)
 |-- tourney_level: string (nullable = true)
 |-- tourney_date: date (nullable = true)
 |-- match_num: integer (nullable = true)
 |-- winner_id: integer (nullable = true)
 |-- winner_seed: integer (nullable = true)
 |-- winner_entry: string (nullable = true)
 |-- winner_name: string (nullable = true)
 |-- winner_hand: string (nullable = true)
 |-- winner_ht: integer (nullable = true)
 |-- winner_ioc: string (nullable = true)
 |-- winner_age: double (nullable = true)
 |-- loser_id: integer (nullable = true)
 |-- loser_seed: integer (nullable = true)
 |-- loser_entry: string (nullable = true)
 |-- loser_name: string (nullable = true)
 |-- loser_hand: string (nullable = true)
 |-- loser_ht: integer (nullable = true)
 |-- loser_ioc: string (nullable = true)
 |-- loser_age: double (nullable = true)
 |-- score: string (nul

In [41]:
#DÙVIDAS:
#VERIFICAR PROBLEMA AO CONTAR OS NULLS
matches.filter(matches["loser1_name"].isNotNull()).select("loser1_name").show(10)
#Não há nada nesta coluna, nunca? tb não deverã haver noutras do mesmo género

+-----------+
|loser1_name|
+-----------+
+-----------+



In [43]:
matches.filter(matches["winner1_age"].isNotNull()).select("winner1_age").show(10)

+-----------+
|winner1_age|
+-----------+
+-----------+

