<a href="https://colab.research.google.com/github/hargagan/EDA-NYC-Taxi-Data-Analysis/blob/main/pyspark/C3_M2_Practice_exercise_assgnmnt_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('AIAdoptionAndWorkforceImpactData').getOrCreate()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df_ai = spark.read.csv('/content/drive/MyDrive/Assignments/EDA/Enterprise_GenAI_Adoption_Impact.csv', header=True, inferSchema=True)

In [5]:
df_ai.printSchema()

root
 |-- Company Name: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- GenAI Tool: string (nullable = true)
 |-- Adoption Year: integer (nullable = true)
 |-- Number of Employees Impacted: integer (nullable = true)
 |-- New Roles Created: integer (nullable = true)
 |-- Training Hours Provided: integer (nullable = true)
 |-- Productivity Change (%): double (nullable = true)
 |-- Employee Sentiment: string (nullable = true)



In [6]:
df_ai.show(5)

+--------------------+-----------+------------+----------+-------------+----------------------------+-----------------+-----------------------+-----------------------+--------------------+
|        Company Name|   Industry|     Country|GenAI Tool|Adoption Year|Number of Employees Impacted|New Roles Created|Training Hours Provided|Productivity Change (%)|  Employee Sentiment|
+--------------------+-----------+------------+----------+-------------+----------------------------+-----------------+-----------------------+-----------------------+--------------------+
| Davis LLC Pvt. Ltd.| Healthcare|         USA|   Mixtral|         2022|                        5277|                8|                    657|                   25.2|Productivity incr...|
|Roberts, Holland ...|    Telecom|South Africa|    Claude|         2023|                       18762|               17|                  23021|                   27.5|We now finish tas...|
| Roman Inc Pvt. Ltd.|Advertising|       India|    Gemi

####**Task 1:** Write a function that returns the number of rows, the number of columns, the list of unique GenAI tools used and the number of distinct industries in the dataset.

In [7]:
def row_col_genai_count_distinct_industry(df):
    num_rows = df.count()
    num_cols = len(df.columns)
    unique_genai_tools = df.select('GenAI Tool').distinct().rdd.flatMap(lambda x: x).collect()
    num_distinct_industries = df.select('Industry').distinct().count()
    return num_rows, num_cols, unique_genai_tools, num_distinct_industries

row_col_genai_count_distinct_industry(df_ai)
#

(100000, 10, ['ChatGPT', 'Gemini', 'LLaMA', 'Claude', 'Groq', 'Mixtral'], 14)

####**Task 2:** Write a function that standardises column names by converting them to lowercase and replacing spaces with underscores.

In [8]:
def standardize_column_names(df):
    new_column_names = [col.lower().replace(" ", "_") for col in df.columns]
    df = df.toDF(*new_column_names)
    return df

df_ai = standardize_column_names(df_ai)
df_ai.printSchema()
#

root
 |-- company_name: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- country: string (nullable = true)
 |-- genai_tool: string (nullable = true)
 |-- adoption_year: integer (nullable = true)
 |-- number_of_employees_impacted: integer (nullable = true)
 |-- new_roles_created: integer (nullable = true)
 |-- training_hours_provided: integer (nullable = true)
 |-- productivity_change_(%): double (nullable = true)
 |-- employee_sentiment: string (nullable = true)



####**Task 3:** Write a function that returns a dictionary with column names as keys and a count of null or missing values as values.

In [11]:
from pyspark.sql.types import StringType

def count_null_values_or_missing_values(df):
    null_or_missing_counts = {}
    for col_name, col_type in df.dtypes:
        # Count explicit nulls
        null_count = df.filter(df[col_name].isNull()).count()
        missing_value_count = 0
        # If the column is a string type, also check for empty strings
        if col_type == 'string':
            missing_value_count = df.filter(df[col_name] == '').count()
        null_or_missing_counts[col_name] = null_count + missing_value_count
    return null_or_missing_counts

count_null_values_or_missing_values(df_ai)

{'company_name': 0,
 'industry': 0,
 'country': 0,
 'genai_tool': 0,
 'adoption_year': 0,
 'number_of_employees_impacted': 0,
 'new_roles_created': 0,
 'training_hours_provided': 0,
 'productivity_change_(%)': 0,
 'employee_sentiment': 0}

####**Task 4:** Write a function that casts adoption_year to IntegerType, productivity_change to FloatType and training_hours_provided and number_of_employees_impacted to IntegerType.

In [15]:
def cast_adoptions_year_productivity_change_training_hours_number_of_employees(df):
    df = df.withColumn("adoption_year", df["adoption_year"].cast("integer"))
    df = df.withColumn("productivity_change_(%)", df["productivity_change_(%)"].cast("float"))
    df = df.withColumn("training_hours_provided", df["training_hours_provided"].cast("integer"))
    df = df.withColumn("number_of_employees_impacted", df["number_of_employees_impacted"].cast("integer"))
    return df

df_ai = cast_adoptions_year_productivity_change_training_hours_number_of_employees(df_ai)
df_ai.printSchema()


root
 |-- company_name: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- country: string (nullable = true)
 |-- genai_tool: string (nullable = true)
 |-- adoption_year: integer (nullable = true)
 |-- number_of_employees_impacted: integer (nullable = true)
 |-- new_roles_created: integer (nullable = true)
 |-- training_hours_provided: integer (nullable = true)
 |-- productivity_change_(%): float (nullable = true)
 |-- employee_sentiment: string (nullable = true)



####**Task 5:** Write a function that adds a new column adoption_level based on number_of_employees_impacted: High if >5000, medium if 1000â€“5000 and low if <1000.

In [17]:
def new_column_based_on_number_of_employees(df):
  from pyspark.sql.functions import when
  df = df.withColumn("adoption_level",
                     when(df["number_of_employees_impacted"] > 5000, "High")
                     .when((df["number_of_employees_impacted"] >= 1000) &
                           (df["number_of_employees_impacted"] <= 5000), "Medium")
                     .otherwise("Low"))
  return df

df_ai = new_column_based_on_number_of_employees(df_ai)
df_ai.printSchema()
df_ai.show(5)
#

root
 |-- company_name: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- country: string (nullable = true)
 |-- genai_tool: string (nullable = true)
 |-- adoption_year: integer (nullable = true)
 |-- number_of_employees_impacted: integer (nullable = true)
 |-- new_roles_created: integer (nullable = true)
 |-- training_hours_provided: integer (nullable = true)
 |-- productivity_change_(%): float (nullable = true)
 |-- employee_sentiment: string (nullable = true)
 |-- adoption_level: string (nullable = false)

+--------------------+-----------+------------+----------+-------------+----------------------------+-----------------+-----------------------+-----------------------+--------------------+--------------+
|        company_name|   industry|     country|genai_tool|adoption_year|number_of_employees_impacted|new_roles_created|training_hours_provided|productivity_change_(%)|  employee_sentiment|adoption_level|
+--------------------+-----------+------------+----------

####**Task 6:** Write a function that groups the dataset by country and industry, and returns the total number of companies, average productivity change and total new roles created.

In [21]:
def group_by_country_industry(df):
  import pyspark.sql.functions as F
  from pyspark.sql.functions import count, avg, sum
  df.groupBy("country", "industry").agg(
      count("company_name").alias("total_companies"),
      avg("productivity_change_(%)").alias("average_productivity_change"),
      sum("new_roles_created").alias("total_new_roles_created")).orderBy("country", "industry").show()

group_by_country_industry(df_ai)

+---------+--------------+---------------+---------------------------+-----------------------+
|  country|      industry|total_companies|average_productivity_change|total_new_roles_created|
+---------+--------------+---------------+---------------------------+-----------------------+
|Australia|   Advertising|            543|          17.95580110277699|                   8367|
|Australia|       Defense|            533|         18.787804839311352|                   8141|
|Australia|     Education|            495|         18.427474761250043|                   7513|
|Australia| Entertainment|            535|          18.77738320003046|                   8546|
|Australia|       Finance|            497|          18.90020121559051|                   7875|
|Australia|    Healthcare|            550|          18.73709089756012|                   8417|
|Australia|   Hospitality|            528|         18.695075743126147|                   8207|
|Australia|Legal Services|            528|        

#####**Task 7:** Write a function that preprocesses the employee_sentiment column by converting it to lowercase and removing punctuation.

In [27]:
import pyspark.sql.functions as F
def clean_employee_sentiment(df):
  from pyspark.sql.functions import lower, regexp_replace
  df = df.withColumn("employee_sentiment", lower(df["employee_sentiment"]))
  df = df.withColumn("employee_sentiment", regexp_replace(df["employee_sentiment"], "[^a-zA-Z0-9\\s]", ""))
  return df

df_ai = clean_employee_sentiment(df_ai)
df_ai.select(F.col('employee_sentiment')).show()

+--------------------+
|  employee_sentiment|
+--------------------+
|productivity incr...|
|we now finish tas...|
|productivity incr...|
|ai helped me redu...|
|job roles have sh...|
|new roles are exc...|
|job roles have sh...|
|collaboration imp...|
|theres concern th...|
|i love using aiit...|
|job roles have sh...|
|job roles have sh...|
|job roles have sh...|
|collaboration imp...|
|ai helped me redu...|
|collaboration imp...|
|we now finish tas...|
|theres concern th...|
|productivity incr...|
|theres concern th...|
+--------------------+
only showing top 20 rows


####**Task 8:** Write a function that returns a year-wise summary including number of companies, average training hours and the most adopted GenAI tool for each year.

In [28]:
def year_wise_summary(df):
  import pyspark.sql.functions as F
  from pyspark.sql.functions import count, avg
  df.groupBy('adoption_year').agg(
      count('company_name').alias('number_of_companies'),
      avg('training_hours_provided').alias('average_training_hours'),
      F.max('genai_tool').alias('most_adopted_genai_tool')).orderBy('adoption_year').show()

year_wise_summary(df_ai)

+-------------+-------------------+----------------------+-----------------------+
|adoption_year|number_of_companies|average_training_hours|most_adopted_genai_tool|
+-------------+-------------------+----------------------+-----------------------+
|         2022|              33180|    12797.869469559975|                Mixtral|
|         2023|              33344|    12717.287817898272|                Mixtral|
|         2024|              33476|    12712.635709164775|                Mixtral|
+-------------+-------------------+----------------------+-----------------------+



####**Task 9:** Write a function that returns a cleaned version of the dataset with standardised column names, missing values handled, adoption_level column added and employee_sentiment trimmed to 100 characters.

In [29]:
def clean_dataset(df):
  df = standardize_column_names(df)
  df = cast_adoptions_year_productivity_change_training_hours_number_of_employees(df)
  df = new_column_based_on_number_of_employees(df)
  df = clean_employee_sentiment(df)

  df = df.withColumn("employee_sentiment", F.substring(df["employee_sentiment"], 1, 100))
  return df

df_ai = clean_dataset(df_ai)
df_ai.printSchema()

root
 |-- company_name: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- country: string (nullable = true)
 |-- genai_tool: string (nullable = true)
 |-- adoption_year: integer (nullable = true)
 |-- number_of_employees_impacted: integer (nullable = true)
 |-- new_roles_created: integer (nullable = true)
 |-- training_hours_provided: integer (nullable = true)
 |-- productivity_change_(%): float (nullable = true)
 |-- employee_sentiment: string (nullable = true)
 |-- adoption_level: string (nullable = false)



####**Task 10:** Write a function that returns a star schema design with one fact table and three dimension tables: Company, time and genai_tool.

In [34]:
def star_schema_design(df):
  df_company = df.select("company_name", "country", "industry", "number_of_employees_impacted").distinct()
  df_time = df.select("adoption_year").distinct()
  df_genai_tool = df.select("genai_tool").distinct()
  # The fact table should include measures and foreign keys to dimensions
  df_fact_table = df.select(
      "company_name",
      "adoption_year",
      "genai_tool",
      "productivity_change_(%)",
      "adoption_level",
      "training_hours_provided",
      "new_roles_created",
      "employee_sentiment"
  )

  return df_company, df_time, df_genai_tool, df_fact_table

df_company, df_time, df_genai_tool, df_fact_table = star_schema_design(df_ai)
df_company.show()
df_time.show()
df_genai_tool.show()
df_fact_table.show()

+--------------------+-----------+-------------+----------------------------+
|        company_name|    country|     industry|number_of_employees_impacted|
+--------------------+-----------+-------------+----------------------------+
|Stafford-Collins ...|     France|      Finance|                        8236|
|Miranda and Sons ...|        UAE|      Telecom|                       17832|
|Williams Group Pv...|         UK|Manufacturing|                        6929|
|Byrd-Patrick Pvt....|      India|    Education|                        4937|
|Johnson, Ortiz an...|        USA|       Retail|                       17079|
|Spencer-Johnson P...|     Canada|       Retail|                         591|
|Robinson Ltd Pvt....|     France|Entertainment|                        4469|
|Hanson, Gilbert a...|  Australia|    Utilities|                        9547|
|Curry, Wilson and...|     Canada|   Healthcare|                         599|
|Gallagher, Brown ...|    Germany|      Telecom|                