<a href="https://colab.research.google.com/github/lhaven-dev/TP_DATA/blob/main/TP1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, explode, split, year

# Build the Spark session for this notebook, or reuse one that already exists
# (getOrCreate returns the existing session when there is one).
spark = (
    SparkSession.builder
    .appName("PySpark TP 1")
    .getOrCreate()
)


In [None]:
# Load the Rotten Tomatoes CSV into a Spark DataFrame.
# The first line of the file is treated as the header, and column types are
# inferred from the data rather than declared up front.
file_path = "/content/Rotten Tomatoes Movies.csv"
df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)

In [None]:
# Clean the raw frame in one chain:
#   1. drop every row that contains at least one null value,
#   2. cast the in_theaters_date column to the Spark "date" type.
df_cleaned = (
    df
    .dropna()
    .withColumn("in_theaters_date", col("in_theaters_date").cast("date"))
)


In [None]:
# Keep only the rows whose tomatometer_rating AND runtime_in_minutes are made
# up exclusively of digits.
# Both conditions are applied in a single filter on df_cleaned: filtering
# df_cleaned twice in a row and assigning to the same name would keep only the
# last condition, letting non-numeric tomatometer_rating values through to the
# rating filters and averages computed later.
df_filtered = df_cleaned.filter(
    col("tomatometer_rating").rlike("^[0-9]+$")
    & col("runtime_in_minutes").rlike("^[0-9]+$")
)

In [None]:
# Select the poorly rated films: tomatometer rating strictly below 20.
low_rated_movies = df_filtered.where(
    col("tomatometer_rating") < 20
)

In [None]:
# Select the films released in theaters after 2000.
# The year is extracted from in_theaters_date and compared with a strict ">",
# so the year 2000 itself is excluded (2001 and later are kept).
movies_after_2000 = df_filtered.filter(
    year(col("in_theaters_date")) > 2000
)

In [None]:
# Average tomatometer rating per studio.
# One output row per distinct studio_name; the mean is exposed as
# "average_rating".
average_rating_by_studio = (
    df_filtered
    .groupBy("studio_name")
    .agg(avg(col("tomatometer_rating")).alias("average_rating"))
)

In [None]:
# Average tomatometer rating per value of the "directors" column.
# One output row per distinct directors value; the mean is exposed as
# "average_rating".
average_rating_by_director = (
    df_filtered
    .groupBy("directors")
    .agg(avg(col("tomatometer_rating")).alias("average_rating"))
)

In [None]:
# Split the "genre" column on ", " and explode the resulting array so that
# each film yields one row per genre. The exploded value replaces the original
# "genre" column.
df_exploded = df_filtered.withColumn(
    "genre",
    explode(
        split(col("genre"), ", ")
    ),
)

In [None]:
# Average film runtime per genre, computed on the exploded frame
# (one row per film/genre pair). The mean is exposed as "average_runtime".
average_runtime_by_genre = (
    df_exploded
    .groupBy("genre")
    .agg(avg(col("runtime_in_minutes")).alias("average_runtime"))
)

# Print the first rows of the aggregated result.
average_runtime_by_genre.show()

+--------------------+------------------+
|               genre|   average_runtime|
+--------------------+------------------+
|  Action & Adventure|109.54932912391476|
|             Romance|108.59334298118668|
|    Sports & Fitness|104.89189189189189|
|               Drama|109.51838456901748|
|         Documentary|             96.56|
|Art House & Inter...|107.61720698254364|
|       Kids & Family| 95.38277511961722|
|Science Fiction &...|107.42329545454545|
|            Classics|115.87012987012987|
|Musical & Perform...|105.33516483516483|
|Faith & Spirituality|           111.375|
|           Animation| 89.21940928270043|
|       Anime & Manga|             100.0|
|    Special Interest| 98.75935828877006|
|  Mystery & Suspense|  106.714058776807|
|          Television| 99.74285714285715|
|              Horror| 98.98755832037325|
|             Western|115.18333333333334|
|              Comedy|100.50392875851232|
|       Gay & Lesbian| 99.71428571428571|
+--------------------+------------