## Mini Project
- For the project we'll be using OfficeDataProject.csv
- Read the data from the file into a DataFrame (DF) and perform the following analytics on it.
    - Print the total number of employees in the company
    - Print the total number of departments in the company
    - Print the department names of the company
    - Print the total number of employees in each department
    - Print the total number of employees in each state
    - Print the total number of employees in each state in each department
    - Print the minimum and maximum salaries in each department and sort salaries in ascending order
    - Print the names of employees working in NY state under Finance department whose bonuses are greater than the average bonuses of employees in NY state
    - Raise the salaries by $500 for all employees whose age is greater than 45
    - Create a DF of all employees whose age is greater than 45 and save it to a file

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, udf
from pyspark.sql.functions import sum, avg, max, min, mean, count

# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('Spark DataFrame Project')
    .getOrCreate()
)

# Load the office CSV with a header row and let Spark infer the column types.
df = (
    spark.read
    .option('inferSchema', 'True')
    .option('header', 'True')
    .option('delimiter', ',')
    .csv('data/OfficeDataProject.csv')
)

# Preview the first five rows.
df.show(n=5)

In [None]:
# Company head count: number of rows in the employee_id column.
(
    df.select('employee_id')
    .count()
)

In [None]:
# Number of departments: count the distinct values of the department column.
(
    df.select('department')
    .distinct()
    .count()
)

In [None]:
# List every distinct department name.
(
    df.select('department')
    .distinct()
    .show()
)

In [None]:
# Head count per department (counts non-null employee_id values in each group).
(
    df.groupBy('department')
    .agg(count('employee_id').alias("total_employees_in_department"))
    .show()
)

In [None]:
# Head count per state (counts non-null employee_id values in each group).
# The result column is named "total_employees_each_state" (previously misspelled "..._sate").
(
    df.groupBy(df.state)
    .agg(count(df.employee_id).alias("total_employees_each_state"))
    .show()
)

In [None]:
# Head count for every (state, department) combination.
(
    df.groupBy('state', 'department')
    .agg(count('employee_id').alias("total_employees"))
    .show()
)

In [None]:
# Lowest and highest salary per department, ordered ascending by the minimum
# and then by the maximum.
# NOTE: `min` / `max` here are the pyspark.sql.functions aggregates imported at
# the top of the notebook, not the Python builtins.
(
    df.groupBy('department')
    .agg(
        min('salary').alias('min'),
        max('salary').alias('max'),
    )
    .orderBy(col('min').asc(), col('max').asc())
    .show()
)

In [None]:
# Goal: employees in NY / Finance whose bonus beats the NY-wide average bonus.
# Step 1: build a single-column DataFrame holding the average bonus of NY employees.
df_avg = (
    df.filter(col('state') == 'NY')
    .groupBy('state')
    .agg(mean('bonus').alias('avg_bonus'))
    .select('avg_bonus')
)

In [None]:
# Bring the aggregated row back to the driver and display the NY average bonus.
ny_bonus_rows = df_avg.collect()
ny_bonus_rows[0]['avg_bonus']

In [None]:
# Step 2: keep NY employees in Finance whose bonus exceeds the NY average.
ny_avg_bonus = df_avg.collect()[0]['avg_bonus']
(
    df.filter(col('state') == 'NY')
    .filter(col('department') == 'Finance')
    .filter(col('bonus') > ny_avg_bonus)
    .show()
)

In [None]:
# Raise the salaries by $500 for all employees whose age is greater than 45
from pyspark.sql. types import StructType, StructField, StringType, IntegerType
def raise_salary_corr_age(age, salary):
    """Return the salary after applying the age-based raise.

    Employees strictly older than 45 receive a flat raise of 500; everyone
    else keeps their current salary.

    Args:
        age: Employee age, or None when the value is missing.
        salary: Current salary, or None when the value is missing.

    Returns:
        ``salary + 500`` when ``age`` is greater than 45, otherwise ``salary``
        unchanged. A missing salary stays ``None``.
    """
    # This function is wrapped in a Spark UDF, where missing values arrive as
    # None. Comparing or adding None would raise TypeError, so pass the salary
    # through untouched when either input is missing.
    if age is None or salary is None:
        return salary
    if age > 45:
        return salary + 500
    return salary
    
# Expose the Python raise rule to Spark as a UDF that returns an integer salary.
RaiseSalaryCorrAge = udf(
    lambda age, salary: raise_salary_corr_age(age, salary),
    IntegerType(),
)

# Replace the salary column with the adjusted value; `df` itself is unchanged.
df1 = df.withColumn(
    'salary',
    RaiseSalaryCorrAge(col('age'), col('salary')),
)

# Preview the original data for comparison with df1.
df.show(n=5)

In [None]:
# Preview the first five rows with the adjusted salaries.
df1.show(n=5)

In [None]:
# Keep only employees older than 45 (salaries already adjusted in df1).
df_save = df1.where(col('age') > 45)

# Write them as CSV with a header row, replacing any previous output.
(
    df_save.write
    .mode('overwrite')
    .option('header', 'True')
    .csv('data/output/project_df')
)

# Read the saved files back and preview them to confirm the write.
df_read = (
    spark.read
    .option('header', 'True')
    .option('inferSchema', 'True')
    .csv('data/output/project_df')
)
df_read.show(n=5)