In [0]:
import pandas as pd
import numpy as np
import os

In [0]:
%run ./utils/user_defined_functions

## Utils functions used in notebooks

Collecting Unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
Installing collected packages: Unidecode
Successfully installed Unidecode-1.4.0
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


## Read GPTW survey files from before 2022

Depending on the year of the survey, the files are read differently:

In [0]:
def read_single_file(path, filename, year):
    """Read one GPTW survey workbook and return one row per job function.

    Parameters
    ----------
    path : str
        Directory prefix; it is concatenated directly with ``filename``.
    filename : str
        Workbook file name. The text before the first underscore is used as
        the country label.
    year : str
        Survey year as a string; "2016", "2018" and "2020" have a known layout.

    Returns
    -------
    pandas.DataFrame
        One row per job function (plus the "overall" column of the sheet),
        with one ``gptw_<kpi>`` column per KPI, the responder count, and
        ``country`` / ``year`` columns. Column labels are passed through
        ``simplify_string`` (defined outside this cell). An empty DataFrame is
        returned when the year is not supported, the workbook cannot be read,
        or the processing step fails; the reason is printed.
    """
    # The country label is the part of the file name before the first underscore.
    country = filename.split('_')[0]
    print(country)

    kpi_names = ["Credibility", "Respect", "Fairness", "Pride", "Camaraderie", "Overall", "Campari Questions"]
    n_kpi = len(kpi_names)

    # Layout parameters that differ between survey years.
    if year == "2016":
        sheet = 3
        skip = 3
        skip_data = 3
        start_select = 'General Management'
        end_select = 'Assistants'
        statement_col = "Results for:"
        statement_gptw_overall = "X1: Taking everything into account, I would say this is a great place to work."
        overall_col = country
        if country == "United Kingdom":
            overall_col = "UK"
        if country == "United States":
            overall_col = "USA"
        nresp_col = "Number of answers"
        # In 2016 the KPI summary rows are found by KPI name in the statement column.
        overall_val = kpi_names
        overall_col2search = "statement"
    elif year == "2018":
        sheet = 4
        if country in ["South Africa", "Peru", "France"]:
            sheet = 3
        skip = 1
        skip_data = 1
        start_select = 'Commercial: Customer Marketing'
        end_select = 'Global Supply Chain Functions'
        statement_gptw_overall = "Taking everything into account, I would say this is a great place to work."
        overall_col = f"{country}.1"
        nresp_col = "Number Of Responses"
        overall_val = "Average"
        overall_col2search = "statement_idx"
    elif year == "2020":
        sheet = 6
        if country in ["Singapore", "Japan"]:
            sheet = 5
        skip = 1
        skip_data = 1
        start_select = 'Commercial: Customer Marketing'
        if country == 'Greece':
            start_select = "Commercial: Sales"
        elif country in ['Mexico', 'Jamaica', 'Supply Chain Europe', 'Supply Chain Asia Pacific', 'Supply Chain Americas']:
            start_select = 'Agriculture'
        end_select = 'Global Supply Chain Functions'
        statement_gptw_overall = "Taking everything into account, I would say this is a great place to work."
        overall_col = f"Result {year}/{int(year[2:])+1}"
        nresp_col = "Number Of Responses"
        overall_val = "Average"
        overall_col2search = "statement_idx"
    else:
        # Without this branch the layout variables would be undefined and the
        # failure would only surface as an unrelated error further down.
        print(f"Import not completed for {country}: unsupported survey year {year!r}")
        return pd.DataFrame()

    try:
        data = pd.read_excel(path + filename, sheet_name=sheet, skiprows=skip_data, na_values=[" ", "-"])
        # Only the header row is needed to discover the job-function columns.
        functions = pd.read_excel(path + filename, sheet_name=sheet, skiprows=skip, nrows=1)
        if year in ['2016', '2020']:
            functions = functions.loc[:, start_select: end_select].columns.tolist()
            print(functions)
        elif year == '2018':
            # For 2018 the first job-function column is taken by position (6th column).
            start_select = functions.columns[5]
            print(start_select)
            functions = functions.loc[:, start_select:]  # take columns from start select
            print(functions)

            # NOTE(review): when end_select is absent, `functions` stays a DataFrame
            # and the processing step below fails; confirm the desired fallback.
            if end_select in functions.columns:
                functions = functions.loc[:, :end_select].columns.tolist()
                print(functions)
    except Exception as e:
        # Report the cause instead of silently discarding it.
        print(f"Import not completed for {country}: {e}")
        return pd.DataFrame()

    try:
        if year == "2016":
            # The 2016 layout has an extra leading column that is discarded first.
            data = data.drop(columns=[data.columns[0]])
            column_mapping = {
                data.columns[0]: "kpi",
                data.columns[1]: "statement_idx",
                statement_col: "statement",
            }
        else:
            column_mapping = {
                data.columns[0]: "kpi",
                data.columns[1]: "statement_idx",
                data.columns[2]: "statement",
            }

        data["overall"] = data[overall_col]
        data = data.rename(columns=column_mapping)
        data = data.iloc[:, [0, 1, 2] + [data.columns.get_loc(col) for col in functions] + [data.columns.get_loc("overall")]]
        data = data.dropna(subset=["kpi", "statement_idx", "overall"], how="all")
        # The KPI label appears only on the first row of each group; propagate it.
        data["kpi"] = data["kpi"].ffill().bfill()

        if isinstance(overall_val, str):
            is_kpi_summary = data[overall_col2search] == overall_val
        else:
            is_kpi_summary = data[overall_col2search].isin(overall_val)

        # Keep KPI summary rows, the overall "great place to work" statement
        # and the responder-count row. Copy so later assignments act on an
        # independent frame.
        data = data[
            (data["kpi"].isin(kpi_names) & is_kpi_summary)
            | (data["statement"] == statement_gptw_overall)
            | (data["statement"] == nresp_col)
        ].copy()

        data["kpi"] = data["kpi"].where(data["statement"] != nresp_col, "Responders")
        data["kpi"] = data["kpi"].where(data["statement"] != statement_gptw_overall, "Overall")

        data = data[["kpi"] + functions + ["overall"]]

        data.columns = data.columns.to_series().apply(simplify_string)

        # Drop columns whose count of missing values is n_kpi or n_kpi + 1
        # (presumably job functions with no scores at all; verify).
        cols_to_drop = [col for col in data.columns if data[col].isna().sum() in [n_kpi, n_kpi + 1]]
        data = data.drop(columns=cols_to_drop)

        data = data.melt(id_vars=["kpi"], var_name="job_function", value_name="score").sort_values(by="kpi")
        data['score'] = data['score'].astype(float)
        # Scores are scaled by 100 and rounded; responder counts are left as they are.
        data['score'] = np.where(data['kpi'] != 'Responders', np.round(data['score'] * 100), data['score'])

        data['score'] = data['score'].apply(lambda x: int(x) if pd.notna(x) else pd.NA)

        data = data.pivot(index=["job_function"], columns="kpi", values="score").add_prefix("gptw_").reset_index()

        data["country"] = country
        data["year"] = int(year)

        data.columns = data.columns.to_series().apply(simplify_string)

        print(f"Import completed for {country}")
        return data

    except Exception as e:
        print(f"Error in processing file {filename} for {country}: {e}")
        return pd.DataFrame()

In [0]:
## 2016
# Folder with the 2016 survey workbooks; only .xlsx entries are processed.
path_2016 = "/Volumes/dev_advanced_analytics_hr/default/input_files/hr_turnover/input/gptw_survey/2016/"
file_names = [entry.name for entry in dbutils.fs.ls(path_2016) if entry.name.endswith(".xlsx")]
# Parse each workbook with the 2016 layout and stack the resulting frames.
frames_2016 = [read_single_file(path_2016, workbook, "2016") for workbook in file_names]
gptw2016 = pd.concat(frames_2016, ignore_index=True)
gptw2016

Argentina
['General Management', 'Marketing/Communication', 'Sales', 'Trade Marketing', 'Engineering & Maintenance', 'Logistics & Customer Service', 'Operations/Production', 'Supply Planning', 'Procurement', 'Quality/R&D', 'Agriculture Jamaica ', 'Finance/Internal Audit', 'IT', 'HR', 'Legal', 'Other Supporting Functions', 'Assistants']
Import completed for Argentina
Australia
['General Management', 'Marketing/Communication', 'Sales', 'Trade Marketing', 'Engineering & Maintenance', 'Logistics & Customer Service', 'Operations/Production', 'Supply Planning', 'Procurement', 'Quality/R&D', 'Agriculture Jamaica ', 'Finance/Internal Audit', 'IT', 'HR', 'Legal', 'Other Supporting Functions', 'Assistants']
Import completed for Australia
Austria
['General Management', 'Marketing/Communication', 'Sales', 'Trade Marketing', 'Engineering & Maintenance', 'Logistics & Customer Service', 'Operations/Production', 'Supply Planning', 'Procurement', 'Quality/R&D', 'Agriculture Jamaica ', 'Finance/Internal

kpi,job_function,gptw_camaraderie,gptw_campari_questions,gptw_credibility,gptw_fairness,gptw_overall,gptw_pride,gptw_respect,gptw_responders,country,year
0,engineering_maintenance,84,71,67,67,90,91,65,10,Argentina,2016
1,finance_internal_audit,61,58,60,61,47,55,67,15,Argentina,2016
2,logistics_customer_service,79,80,77,72,88,88,81,17,Argentina,2016
3,marketing_communication,92,79,89,74,100,88,87,10,Argentina,2016
4,operations_production,77,74,70,63,77,79,65,39,Argentina,2016
...,...,...,...,...,...,...,...,...,...,...,...
82,marketing_communication,73,63,55,57,71,73,56,28,United States,2016
83,operations_production,55,48,37,49,57,56,40,82,United States,2016
84,overall,68,60,53,61,70,70,54,308,United States,2016
85,quality_r_d,76,71,73,64,90,76,64,10,United States,2016


In [0]:
# Number of imported rows per country in the 2016 table.
gptw2016['country'].value_counts()

Italy             16
Jamaica           12
Brazil             9
Argentina          8
United States      7
Russia             6
Australia          4
Mexico             4
Canada             3
Germany            3
United Kingdom     3
Belgium            2
China              2
Spain              2
Switzerland        2
Ukraine            2
Greece             1
Austria            1
Name: country, dtype: int64

In [0]:
## 2018
# Folder with the 2018 survey workbooks; only .xlsx entries are processed.
path_2018 = '/Volumes/dev_advanced_analytics_hr/default/input_files/hr_turnover/input/gptw_survey/2018/'
file_names_2018 = [entry.name for entry in dbutils.fs.ls(path_2018) if entry.name.endswith(".xlsx")]
# Parse each workbook with the 2018 layout and stack the resulting frames.
frames_2018 = [read_single_file(path_2018, workbook, "2018") for workbook in file_names_2018]
gptw2018 = pd.concat(frames_2018, ignore_index=True)
gptw2018

Aggregated Asia
Import not completed for Aggregated Asia
Argentina
Commercial: Customer Marketing
   Commercial: Customer Marketing  ...  Blue Collars, Hourly, Task Workers
0                             NaN  ...                                 NaN

[1 rows x 34 columns]
['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'Health, Safety and Environment', 'Logistics & Customer Service', 'Manufacturing & Engineering', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Procurement', 'Product R&D', 'Quality', 'Supply Chain General', 'Global Supply Chain Functions']
Import completed for Argentina
Australia
Commercial: Customer Marketing
   Commercial: Customer Marketing  ...  Blue Collars, Hourly, Task Workers
0                             NaN  ...                                 NaN

[1 rows x 35 columns]
['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/In

kpi,job_function,gptw_camaraderie,gptw_campari_questions,gptw_credibility,gptw_fairness,gptw_overall,gptw_pride,gptw_respect,gptw_responders,country,year
0,commercial_customer_marketing,93,82,88,82,76,88,83,18,Argentina,2018
1,commercial_sales,84,77,77,70,75,81,78,20,Argentina,2018
2,finance_internal_audit,99,91,89,84,100,91,97,10,Argentina,2018
3,global_supply_chain_functions,89,84,76,71,89,91,79,57,Argentina,2018
4,logistics_customer_service,81,83,83,71,100,95,81,11,Argentina,2018
...,...,...,...,...,...,...,...,...,...,...,...
100,marketing_communication,76,68,68,66,81,77,68,43,United States,2018
101,other_supporting_functions_general_services_fa...,78,62,60,62,87,73,61,15,United States,2018
102,overall,71,64,61,65,74,73,62,383,United States,2018
103,planning,80,60,66,75,50,66,66,12,United States,2018


In [0]:
# Number of imported rows per country in the 2018 table.
gptw2018['country'].value_counts()

Italy             15
Jamaica           12
United States     11
Argentina          7
Australia          7
Brazil             7
Russia             6
Mexico             6
Canada             5
United Kingdom     5
Germany            4
France             4
Greece             2
Peru               2
South Africa       2
Spain              2
Switzerland        2
Ukraine            2
Belgium            2
Austria            1
China              1
Name: country, dtype: int64

In [0]:
## 2020
# Folder with the 2020 survey workbooks; only .xlsx entries are processed.
path_2020 = "/Volumes/dev_advanced_analytics_hr/default/input_files/hr_turnover/input/gptw_survey/2020/"
file_names_2020 = [entry.name for entry in dbutils.fs.ls(path_2020) if entry.name.endswith(".xlsx")]
# Parse each workbook with the 2020 layout and stack the resulting frames.
frames_2020 = [read_single_file(path_2020, workbook, "2020") for workbook in file_names_2020]
gptw2020 = pd.concat(frames_2020, ignore_index=True)
gptw2020

Argentina
['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'Health, Safety and Environment', 'Logistics & Customer Service', 'Manufacturing & Engineering', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Procurement', 'Quality', 'Supply Chain General', 'Global Supply Chain Functions']
Import completed for Argentina
Australia


  warn(msg)


['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'Health, Safety and Environment', 'IT', 'Legal & Business Development', 'Logistics & Customer Service', 'Manufacturing & Engineering', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Procurement', 'Quality', 'Supply Chain General', 'Global Supply Chain Functions']
Import completed for Australia
Austria
['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'Logistics & Customer Service', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Global Supply Chain Functions']
Import completed for Austria
Belgium


  warn(msg)


['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'Logistics & Customer Service', 'Marketing/Communication', 'Global Supply Chain Functions']
Import completed for Belgium
Brazil
['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'HR', 'Health, Safety and Environment', 'IT', 'Legal & Business Development', 'Logistics & Customer Service', 'Manufacturing & Engineering', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Procurement', 'Quality', 'Supply Chain General', 'Blue Collars, Hourly, Task Workers', 'Executive', 'Manager', 'Staff, Professional, White Collars', 'Global Supply Chain Functions']
Import completed for Brazil
Canada


  warn(msg)


['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'Health, Safety and Environment', 'Logistics & Customer Service', 'Manufacturing & Engineering', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Procurement', 'Product R&D', 'Quality', 'Supply Chain General', 'Global Supply Chain Functions']
Import completed for Canada
China


  warn(msg)


['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'Logistics & Customer Service', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Global Supply Chain Functions']
Import completed for China
France
['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'Health, Safety and Environment', 'IT', 'Logistics & Customer Service', 'Manufacturing & Engineering', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Product R&D', 'Quality', 'Supply Chain General', 'Global Supply Chain Functions']
Import completed for France
GSM
[]
Error in processing file GSM_Feedback Report_Global Camparista Survey 2020.xlsx for GSM: index 2 is out of bounds for axis 0 with size 2
Germany
['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', '

  warn(msg)


['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'Health, Safety and Environment', 'IT', 'Legal & Business Development', 'Logistics & Customer Service', 'Manufacturing & Engineering', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Procurement', 'Product R&D', 'Public Affairs', 'Quality', 'Supply Chain General', 'Global Supply Chain Functions']
Import completed for Italy
Jamaica
['Agriculture', 'Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'Health, Safety and Environment', 'IT', 'Legal & Business Development', 'Logistics & Customer Service', 'Manufacturing & Engineering', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Procurement', 'Product R&D', 'Public Affairs', 'Quality', 'Supply Chain General', 'Global Supply Chain Functions']
Impo

  warn(msg)


['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'IT', 'Legal & Business Development', 'Logistics & Customer Service', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Procurement', 'Supply Chain General', 'Global Supply Chain Functions']
Import completed for Russia
Singapore
['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'IT', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Executive', 'Manager', 'Staff, Professional, White Collars', 'Global Supply Chain Functions']
Import completed for Singapore
South East Asia & India
Import not completed for South East Asia & India
Spain


  warn(msg)


['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'IT', 'Logistics & Customer Service', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Global Supply Chain Functions']
Import completed for Spain
Supply Chain Americas
['Agriculture', 'Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'Health, Safety and Environment', 'IT', 'Legal & Business Development', 'Logistics & Customer Service', 'Manufacturing & Engineering', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Procurement', 'Product R&D', 'Public Affairs', 'Quality', 'Supply Chain General', 'Blue Collars, Hourly, Task Workers', 'Executive', 'Manager', 'Staff, Professional, White Collars', 'Global Supply Chain Functions']
Import completed for Supply Chain Americas
Supply Chain Asia Pacific
[

  warn(msg)


['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'Logistics & Customer Service', 'Marketing/Communication', 'Global Supply Chain Functions']
Import completed for Switzerland
Ukraine


  warn(msg)


['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'Legal & Business Development', 'Logistics & Customer Service', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Supply Chain General', 'Global Supply Chain Functions']
Import completed for Ukraine
United Kingdom


  warn(msg)


['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'Health, Safety and Environment', 'IT', 'Legal & Business Development', 'Logistics & Customer Service', 'Manufacturing & Engineering', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Quality', 'Global Supply Chain Functions']
Import completed for United Kingdom
United States
['Commercial: Customer Marketing', 'Commercial: Sales', 'Finance/Internal Audit', 'General Management/PMO', 'HR', 'Health, Safety and Environment', 'IT', 'Legal & Business Development', 'Logistics & Customer Service', 'Manufacturing & Engineering', 'Marketing/Communication', 'Other Supporting Functions (General Services/Facility/Assistants)', 'Planning', 'Procurement', 'Product R&D', 'Public Affairs', 'Quality', 'Supply Chain General', 'Blue Collars, Hourly, Task Workers', 'Executive', 'Manager', 'Staff, Professional, White Collars', 'Global S

kpi,job_function,gptw_camaraderie,gptw_credibility,gptw_fairness,gptw_overall,gptw_pride,gptw_respect,gptw_responders,country,year
0,commercial_customer_marketing,96,73,70,94,87,79,16,Argentina,2020
1,commercial_sales,95,90,84,89,94,92,18,Argentina,2020
2,finance_internal_audit,94,90,78,100,89,90,12,Argentina,2020
3,global_supply_chain_functions,80,69,62,69,78,68,61,Argentina,2020
4,logistics_customer_service,83,66,59,73,86,68,15,Argentina,2020
...,...,...,...,...,...,...,...,...,...,...
150,other_supporting_functions_general_services_fa...,62,55,53,65,69,52,17,United States,2020
151,overall,74,66,66,79,76,66,390,United States,2020
152,planning,63,64,70,64,55,66,11,United States,2020
153,quality,75,72,64,92,73,72,12,United States,2020


Asia data is managed separately:

In [0]:
# Locate the aggregated Asia workbook by its file-name prefix instead of by
# list position, so the cell does not depend on the directory listing order.
asia2018_file = next(name for name in file_names_2018 if name.startswith("Aggregated Asia"))
# This workbook is read from the third sheet (index 2), skipping the first row.
asia2018 = pd.read_excel(f"{path_2018}{asia2018_file}", sheet_name=2, skiprows=1)
asia2018

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Aggregated Asia,Aggregated Asia.1,Campari Group 2018,Difference
0,,,,Results vs External Benchmarks,,,
1,,,,Overall Response Rate: 100%,,,
2,,,Number Of Responses,,14.000000,2870.000000,-
3,Credibility,1,Management keeps me informed about important i...,,0.785714,0.580182,0.205532
4,,2,Management makes its expectations clear.,,1.000000,0.622727,0.377273
...,...,...,...,...,...,...,...
85,,,,,,,
86,,,,,,,
87,,,,,,,
88,,,,,,,


In [0]:
# Country label = file-name text before the first underscore, the same rule
# read_single_file applies. Plain string splitting avoids relying on the `re`
# module, which this notebook never imports, and the workbook is found by its
# prefix rather than by list position.
asia2018_name = next(name for name in file_names_2018 if name.startswith("Aggregated Asia"))
country = asia2018_name.split("_")[0]
country

'Aggregated Asia'

In [0]:
# The overall score sits in the second column carrying the country label,
# which pandas exposes with a ".1" suffix.
overall_col = f"{country}.1"

# The first three columns become KPI group, statement index and statement text.
kpi_source, idx_source, statement_source = asia2018.columns[:3]
column_mapping = {
    kpi_source: "kpi",
    idx_source: "statement_idx",
    statement_source: "statement",
}

# Copy the overall score under a fixed name, then apply the renaming.
asia2018["overall"] = asia2018[overall_col]
asia2018 = asia2018.rename(columns=column_mapping)

In [0]:
# Keep the three descriptive columns plus the overall score and drop rows that
# are empty in all key columns.
asia2018 = asia2018.iloc[:, [0, 1, 2] + [asia2018.columns.get_loc("overall")]]
asia2018 = asia2018.dropna(subset=["kpi", "statement_idx", "overall"], how="all")
# Propagate each KPI label to the rows of its group. `.ffill()` / `.bfill()`
# replace the deprecated `fillna(method=...)` form with identical results.
asia2018["kpi"] = asia2018["kpi"].ffill().bfill()
asia2018


Unnamed: 0,kpi,statement_idx,statement,overall
2,Credibility,,Number Of Responses,14.000000
3,Credibility,1,Management keeps me informed about important i...,0.785714
4,Credibility,2,Management makes its expectations clear.,1.000000
5,Credibility,3,I can ask management any reasonable question a...,0.785714
6,Credibility,4,"Management is approachable, easy to talk with.",0.714286
...,...,...,...,...
81,Campari Questions,72,People around me are honest and ethical in the...,0.928571
82,Campari Questions,73,Concrete improvement plans have been implement...,0.461538
83,Campari Questions,74,Campari Group is investing in new tools/system...,0.500000
84,Campari Questions,Average,,0.736950


In [0]:
# Two-column selection (KPI label and overall score) of the Asia 2018 table.
data = asia2018[["kpi", "overall"]]

In [0]:
# KPI groups that are kept from the sheet.
kpi_names = [
    "Credibility",
    "Respect",
    "Fairness",
    "Pride",
    "Camaraderie",
    "Overall",
    "Campari Questions",
]

# Column holding the overall score for this workbook.
overall_col = f"{country}.1"

# Markers used to pick rows: the responder-count row, the single overall
# statement, and the per-KPI summary rows flagged in the statement-index column.
nresp_col = "Number Of Responses"
statement_gptw_overall = "Taking everything into account, I would say this is a great place to work."
overall_col2search = "statement_idx"
overall_val = "Average"

In [0]:
# Per-KPI summary rows: a single marker value is matched by equality, a list of
# markers by membership. Both branches read from asia2018 (the list branch
# previously read from an unrelated frame named `data`).
if isinstance(overall_val, str):
    is_kpi_summary = asia2018[overall_col2search] == overall_val
else:
    is_kpi_summary = asia2018[overall_col2search].isin(overall_val)

# Keep the KPI summary rows, the overall statement row and the responder-count
# row. Copy so later label assignments act on an independent frame.
asia2018 = asia2018[
    (asia2018["kpi"].isin(kpi_names) & is_kpi_summary)
    | (asia2018["statement"] == statement_gptw_overall)
    | (asia2018["statement"] == nresp_col)
].copy()

In [0]:
# Give the two special rows their own KPI labels so they become separate
# columns when the table is pivoted.
is_responders_row = asia2018["statement"] == nresp_col
is_overall_row = asia2018["statement"] == statement_gptw_overall
asia2018.loc[is_responders_row, "kpi"] = "Responders"
asia2018.loc[is_overall_row, "kpi"] = "Overall"
asia2018

Unnamed: 0,kpi,statement_idx,statement,overall
2,Responders,,Number Of Responses,14.0
17,Credibility,Average,,0.882653
31,Respect,Average,,0.730769
44,Fairness,Average,,0.851191
53,Pride,Average,,0.848214
64,Camaraderie,Average,,0.814286
65,Overall,58,"Taking everything into account, I would say th...",0.857143
84,Campari Questions,Average,,0.73695


In [0]:
# Long format: one row per KPI, with the single "overall" column as job function.
data_long = pd.melt(
    asia2018,
    id_vars=["kpi"],
    value_vars=["overall"],
    var_name="job_function",
    value_name="score",
)
data_long

Unnamed: 0,kpi,job_function,score
0,Responders,overall,14.0
1,Credibility,overall,0.882653
2,Respect,overall,0.730769
3,Fairness,overall,0.851191
4,Pride,overall,0.848214
5,Camaraderie,overall,0.814286
6,Overall,overall,0.857143
7,Campari Questions,overall,0.73695


In [0]:
# Scores are scaled by 100 and rounded; the responder count is left as it is.
is_score_row = data_long["kpi"] != "Responders"
data_long["score"] = np.where(is_score_row, np.round(data_long["score"] * 100), data_long["score"])

# Store whole numbers, keeping missing values as pd.NA.
data_long["score"] = data_long["score"].apply(lambda value: pd.NA if pd.isna(value) else int(value))

In [0]:
# Wide format: one row per job function, one gptw_<kpi> column per KPI,
# tagged with the country label and the survey year.
data = (
    data_long
    .pivot(index=["job_function"], columns="kpi", values="score")
    .add_prefix("gptw_")
    .reset_index()
    .assign(country=country, year=2018)
)

In [0]:
# Normalise the column labels with simplify_string (not defined in this cell;
# presumably loaded by the utilities notebook run at the top; verify).
data.columns = data.columns.to_series().apply(simplify_string)

In [0]:
# Use the short label "Asia" for the aggregated Asia workbook.
data["country"] = data["country"].replace("Aggregated Asia", "Asia")

In [0]:
# Display the single-row Asia 2018 result.
data

kpi,job_function,gptw_camaraderie,gptw_campari_questions,gptw_credibility,gptw_fairness,gptw_overall,gptw_pride,gptw_respect,gptw_responders,country,year
0,overall,81,74,88,85,86,85,73,14,Asia,2018


In [0]:
# Locate the South East Asia & India workbook by its file-name prefix instead
# of by list position, so the cell does not depend on the directory listing order.
asia2020_file = next(name for name in file_names_2020 if name.startswith("South East Asia & India"))
# This workbook is read from the second sheet (index 1), skipping the first row.
asia2020 = pd.read_excel(f"{path_2020}{asia2020_file}", sheet_name=1, skiprows=1)
asia2020

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,South East Asia & India,Result 2020/21,Campari 2020,Difference Overall Result - Benchmark,Bmk FMCG,Difference Overall Result - Benchmark.1,25 World's Best 2020,Difference Overall Result - Benchmark.2
0,,,,Overall Overview,,,,,,,
1,,,,Overall Response Rate: 92%,,,,,,,
2,,,Number Of Responses,,11,3196.000000,-,-,-,-,-
3,Credibility,1,Management keeps me informed about important i...,,0.6,0.644836,-0.044836,0.682895,-0.082895,0.820576,-0.220576
4,,2,Management makes its expectations clear.,,0.636364,0.689786,-0.053422,0.713716,-0.077352,0.819076,-0.182712
...,...,...,...,...,...,...,...,...,...,...,...
83,,74,Campari Group participates in campaigns/initia...,,0.727273,0.753142,-0.025869,-,-,-,-
84,,75,People around me are honest and ethical in the...,,1,0.775775,0.224225,-,-,-,-
85,,76,Campari Group is investing in new tools/system...,,0.727273,0.764537,-0.037264,-,-,-,-
86,,,,,,,,,,,


In [0]:
# Country label = file-name text before the first underscore, the same rule
# read_single_file applies. Plain string splitting avoids relying on the `re`
# module, which this notebook never imports, and the workbook is found by its
# prefix rather than by list position.
asia2020_name = next(name for name in file_names_2020 if name.startswith("South East Asia & India"))
country = asia2020_name.split("_")[0]
country

'South East Asia & India'

In [0]:
# Column holding the overall score in the 2020 workbook.
overall_col = "Result 2020/21"

# The first three columns become KPI group, statement index and statement text.
kpi_source, idx_source, statement_source = asia2020.columns[:3]
column_mapping = {
    kpi_source: "kpi",
    idx_source: "statement_idx",
    statement_source: "statement",
}

# Copy the overall score under a fixed name, then apply the renaming.
asia2020["overall"] = asia2020[overall_col]
asia2020 = asia2020.rename(columns=column_mapping)

In [0]:
# Keep the three descriptive columns plus the overall score and drop rows that
# are empty in all key columns.
asia2020 = asia2020.iloc[:, [0, 1, 2] + [asia2020.columns.get_loc("overall")]]
asia2020 = asia2020.dropna(subset=["kpi", "statement_idx", "overall"], how="all")
# Propagate each KPI label to the rows of its group. `.ffill()` / `.bfill()`
# replace the deprecated `fillna(method=...)` form with identical results.
asia2020["kpi"] = asia2020["kpi"].ffill().bfill()

# Per-KPI summary rows: a single marker value is matched by equality, a list of
# markers by membership.
if isinstance(overall_val, str):
    is_kpi_summary = asia2020[overall_col2search] == overall_val
else:
    is_kpi_summary = asia2020[overall_col2search].isin(overall_val)

# NOTE(review): unlike the Asia 2018 filter, the overall "great place to work"
# statement row is not kept here; confirm this is intentional.
# Copy so later column assignments act on an independent frame.
asia2020 = asia2020[
    (asia2020["kpi"].isin(kpi_names) & is_kpi_summary)
    | (asia2020["statement"] == nresp_col)
].copy()

In [0]:
# The row without a statement index is the responder-count row.
has_no_index = asia2020["statement_idx"].isna()
asia2020["kpi"] = np.where(has_no_index, "Responders", asia2020["kpi"])

# Scores are scaled by 100 and rounded; the responder count is left as it is.
asia2020["overall"] = asia2020["overall"].astype(float)
is_score_row = asia2020["kpi"] != "Responders"
asia2020["overall"] = np.where(is_score_row, np.round(asia2020["overall"] * 100), asia2020["overall"])

# Store whole numbers, keeping missing values as pd.NA.
asia2020["overall"] = asia2020["overall"].apply(lambda value: pd.NA if pd.isna(value) else int(value))

In [0]:
# Display the filtered and scaled South East Asia 2020 rows.
asia2020

Unnamed: 0,kpi,statement_idx,statement,overall
2,Responders,,Number Of Responses,11
18,Credibility,Average,,71
33,Respect,Average,,71
46,Fairness,Average,,81
58,Pride,Average,,79
66,Camaraderie,Average,,64


In [0]:
# Build the one-row wide table. `.assign` returns a new frame, so adding the
# constant job_function column no longer writes into a slice of the filtered
# table (which triggered a SettingWithCopyWarning).
asia2020 = (
    asia2020[["kpi", "overall"]]
    .assign(job_function="overall")
    .pivot(index=["job_function"], columns="kpi", values="overall")
    .add_prefix("gptw_")
    .reset_index()
    .assign(country=country, year=2020)
)
# Normalise the column labels with simplify_string (not defined in this cell).
asia2020.columns = asia2020.columns.to_series().apply(simplify_string)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asia2020["job_function"] = "overall"


In [0]:
# Display the single-row South East Asia 2020 result.
asia2020

kpi,job_function,gptw_camaraderie,gptw_credibility,gptw_fairness,gptw_pride,gptw_respect,gptw_responders,country,year
0,overall,64,71,81,79,71,11,South East Asia & India,2020


In [0]:
# Use the short label "south_east_asia" for the South East Asia & India workbook.
asia2020["country"] = asia2020["country"].replace("South East Asia & India", "south_east_asia")

In [0]:
# Append the separately handled Asia rows to their survey year.
df_combined_2018 = pd.concat([gptw2018, data], ignore_index=True)
df_combined_2020 = pd.concat([gptw2020, asia2020], ignore_index=True)

# Stack 2016 and 2018, drop the Campari-specific KPI column if present,
# then append 2020.
df_final = (
    pd.concat([gptw2016, df_combined_2018], ignore_index=True)
    .drop(columns=["gptw_campari_questions"], errors="ignore")
)
df_final = pd.concat([df_final, df_combined_2020], ignore_index=True)
df_final


kpi,job_function,gptw_camaraderie,gptw_credibility,gptw_fairness,gptw_overall,gptw_pride,gptw_respect,gptw_responders,country,year
0,engineering_maintenance,84,67,67,90,91,65,10,Argentina,2016
1,finance_internal_audit,61,60,61,47,55,67,15,Argentina,2016
2,logistics_customer_service,79,77,72,88,88,81,17,Argentina,2016
3,marketing_communication,92,89,74,100,88,87,10,Argentina,2016
4,operations_production,77,70,63,77,79,65,39,Argentina,2016
...,...,...,...,...,...,...,...,...,...,...
344,overall,74,66,66,79,76,66,390,United States,2020
345,planning,63,64,70,64,55,66,11,United States,2020
346,quality,75,72,64,92,73,72,12,United States,2020
347,staff_professional_white_collars,79,74,71,86,80,73,226,United States,2020


In [0]:
# Count missing vs. non-missing gptw_credibility values in the combined table.
df_final['gptw_credibility'].isna().value_counts()

False    349
Name: gptw_credibility, dtype: int64

In [0]:
# Number of rows per country label in the combined table.
df_final['country'].value_counts()

Italy                        47
Jamaica                      37
United States                33
Brazil                       26
Argentina                    22
Russia                       18
Australia                    18
Mexico                       17
Supply Chain Americas        15
United Kingdom               14
Canada                       13
Germany                      11
France                       11
Supply Chain Europe           9
Spain                         7
Supply Chain Asia Pacific     6
Switzerland                   6
Ukraine                       6
Greece                        6
Belgium                       6
Peru                          5
China                         4
Singapore                     3
Austria                       3
South Africa                  2
Japan                         2
Asia                          1
south_east_asia               1
Name: country, dtype: int64

In [0]:
# Write the combined table as a semicolon-separated CSV without the index column.
output_csv = "/Volumes/dev_advanced_analytics_hr/default/input_files/hr_turnover/data_cleaning//data_gptw_2016-2020_v06.csv"
df_final.to_csv(output_csv, sep=";", index=False)