## Challenge 1: Salary Analysis of Data Specialists

### Load the dataset and create a DataFrame using PySpark.

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("SalaryDataset")
    .getOrCreate()
)

file_path = "/data/Partially Cleaned Salary Dataset.csv"

# Read the comma-separated file with a header row and inferred column types,
# then rename the `_c0` column to `index`
# (presumably the name Spark assigned to an unnamed first column — TODO confirm).
df = (
    spark.read
    .options(sep=",", inferSchema="true", header="True")
    .csv(file_path)
    .withColumnRenamed("_c0", "index")
)

df.show(5)

+-----+--------------------+--------------+-----------------+---------+---------+
|index|        Company Name|     Job Title|Salaries Reported| Location|   Salary|
+-----+--------------------+--------------+-----------------+---------+---------+
|    0|            Mu Sigma|Data Scientist|              105|Bangalore| 648573.0|
|    1|                 IBM|Data Scientist|               95|Bangalore|1191950.0|
|    2|Tata Consultancy ...|Data Scientist|               66|Bangalore| 836874.0|
|    3|    Impact Analytics|Data Scientist|               40|Bangalore| 669578.0|
|    4|           Accenture|Data Scientist|               32|Bangalore| 944110.0|
+-----+--------------------+--------------+-----------------+---------+---------+
only showing top 5 rows



### Analyze and sort the locations by their average salary. Remember to round the salaries for better readability.

In [5]:
# Mean salary per location, rounded to 2 decimals, highest average first.
avg_salary_by_location = (
    df.groupBy("Location")
    .agg(F.round(F.avg("Salary"), 2).alias("average_salary"))
    .orderBy("average_salary", ascending=False)
)

avg_salary_by_location.show()

+---------+--------------+
| Location|average_salary|
+---------+--------------+
|     Pune|     1230932.1|
|Hyderabad|    1200312.26|
|Bangalore|    1184622.13|
|   Mumbai|    1018556.45|
|New Delhi|     838629.64|
+---------+--------------+



### Calculate and sort the average salary for each job title.

In [6]:
# Mean salary per job title, rounded to 2 decimals, highest average first.
avg_salary_by_job = (
    df.groupBy("Job Title")
    .agg(F.round(F.avg("Salary"), 2).alias("avg_salary"))
    .orderBy("avg_salary", ascending=False)
)

avg_salary_by_job.show()

+--------------------+----------+
|           Job Title|avg_salary|
+--------------------+----------+
|Data Science Manager| 4619021.0|
|   Data Science Lead| 4068310.0|
|Data Science Cons...| 2671464.0|
| Lead Data Scientist| 1852189.0|
|Senior Data Scien...|1766129.54|
|Software Engineer...| 1566780.0|
|Senior Machine Le...| 1473436.0|
|      Data Scientist|1411330.01|
|Machine Learning ...| 1397347.0|
|       Data Engineer|1309051.36|
|Data Science Asso...| 1203913.0|
|Machine Learning ...| 797188.36|
|Machine Learning ...|  706401.0|
|        Data Analyst| 616469.87|
|Data Scientist - ...|  610512.0|
|Junior Data Scien...| 596323.11|
|Machine Learning ...|  581119.0|
|Associate Machine...|  464372.0|
|Machine Learning ...|  383213.0|
|        Data Science| 364905.33|
+--------------------+----------+
only showing top 20 rows



### Finally, find the average salary for each location. Display this information in a clear and concise format.

In [8]:
# Print a heading, then render the per-location averages computed earlier
# with full (untruncated) cell values; n=20 is the default row limit.
print("Moyenne des salaires pour chaque emplacement :")
avg_salary_by_location.show(n=20, truncate=False)

Moyenne des salaires pour chaque emplacement :
+---------+--------------+
|Location |average_salary|
+---------+--------------+
|Pune     |1230932.1     |
|Hyderabad|1200312.26    |
|Bangalore|1184622.13    |
|Mumbai   |1018556.45    |
|New Delhi|838629.64     |
+---------+--------------+



In [None]:
# NOTE(review): this cell repeats the previous display cell verbatim and has no
# execution count (it was never run in the saved notebook) — consider deleting it.
# Prints a heading, then shows the per-location averages without truncation.
print("Moyenne des salaires pour chaque emplacement :")
avg_salary_by_location.show(truncate=False)

In [9]:
# Collect the aggregated rows to the driver, convert each Row into a plain
# dict, and print one dict per line under a heading.
print("Moyenne des salaires pour chaque emplacement :")
result_dict = list(
    map(lambda row: row.asDict(), avg_salary_by_location.collect())
)
for item in result_dict:
    print(item)

Moyenne des salaires pour chaque emplacement :
{'Location': 'Pune', 'average_salary': 1230932.1}
{'Location': 'Hyderabad', 'average_salary': 1200312.26}
{'Location': 'Bangalore', 'average_salary': 1184622.13}
{'Location': 'Mumbai', 'average_salary': 1018556.45}
{'Location': 'New Delhi', 'average_salary': 838629.64}


In [10]:
# Stop the SparkSession created at the top of the notebook.
# NOTE(review): cells that use `spark` or its DataFrames presumably cannot be
# re-run after this without recreating the session first — confirm.
spark.stop()