In [1]:
import pandas as pd
import pathlib
import seaborn as sns

from matplotlib import pyplot as plt
from pprint import pprint

from settings import (BASE_CSV_PATH, QUALTRICS_DROP_COLS, SKIP_COLS, 
                      MULTISELECT_COLS, AGREE_DISAGREE_COLS, IMPORTANCE_COLS, 
                      STANDARD_DUMMY_COLS, UTILITY_SCORE_COLS)

In [2]:
# Read the CSV at BASE_CSV_PATH (imported from settings) and show its (rows, columns).
base_df = pd.read_csv(BASE_CSV_PATH)
base_df.shape

(675, 139)

Only 675 responses, I assume?

## Pre-processing functions

In [3]:
def drop_utility_question_columns(in_df):
    """Return a new frame without the utility component columns.

    A column is removed when its name contains the substring
    "(This leaves 1 choice". ``in_df`` itself is left untouched.
    """
    marker = "(This leaves 1 choice"
    component_cols = list(filter(lambda col_name: marker in col_name, in_df.columns))
    # DataFrame.drop returns a new object, so the caller's frame is not modified.
    return in_df.drop(columns = component_cols)

In [4]:
def create_multinational_column(in_df, op_column = "Where does your organization have offices", 
                                null_fill = ""):
    """Replace ``op_column`` with a 0/1 ``is_multinational`` indicator.

    Missing responses are filled with ``null_fill`` first; a row is flagged 1
    when the filled text contains "Multi". Returns a new frame with the
    indicator added and ``op_column`` removed; ``in_df`` is not modified.
    """
    multinational_flag = (
        in_df[op_column]
        .fillna(null_fill)
        .str.contains("Multi")
        .astype(int)
    )
    return (
        in_df
        .assign(is_multinational = multinational_flag)
        .drop(columns = op_column)
    )

In [5]:
def create_executive_column(in_df, op_column = "What is your seniority level?", 
                                null_fill = ""):
    """Replace ``op_column`` with a 0/1 ``is_executive`` indicator.

    Missing responses are filled with ``null_fill`` first; a row is flagged 1
    when the filled text contains "Exec". Returns a new frame with the
    indicator added and ``op_column`` removed; ``in_df`` is not modified.
    """
    executive_flag = (
        in_df[op_column]
        .fillna(null_fill)
        .str.contains("Exec")
        .astype(int)
    )
    return (
        in_df
        .assign(is_executive = executive_flag)
        .drop(columns = op_column)
    )

In [6]:
def create_male_column(in_df, op_column = "What best describes your gender?", 
                                null_fill = ""):
    """Replace ``op_column`` with a 0/1 ``is_male`` indicator.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Frame containing ``op_column``; it is not modified.
    op_column : str
        Name of the text column to inspect.
    null_fill : str
        Value substituted for missing responses before matching.

    Returns
    -------
    pandas.DataFrame
        Copy of ``in_df`` with ``is_male`` added and ``op_column`` removed.
    """
    out_df = in_df.copy()
    responses = out_df[op_column].fillna(null_fill)
    # str.contains is case-sensitive by default, so "Female" is not flagged.
    # The indicator is built from the gender responses only; it must not read
    # ``is_executive``, which is an unrelated column and may not exist.
    out_df["is_male"] = responses.str.contains("Male").astype(int)
    return out_df.drop(labels = op_column, axis = 1)

In [7]:
def add_cluster_columns(in_df, op_column = "labels"):
    """Add three 0/1 cluster-membership columns and drop ``op_column``.

    ``clust_zeroAndOne`` is 1 where the value in ``op_column`` is 0 or 1,
    ``clust_eight`` where it is 8, and ``clust_sixAndNine`` where it is 6 or 9.
    Returns a new frame; ``in_df`` is not modified.
    """
    cluster_members = {
        "clust_zeroAndOne": (0, 1),
        "clust_eight": (8,),
        "clust_sixAndNine": (6, 9),
    }
    cluster_labels = in_df[op_column]
    out_df = in_df.copy()
    for flag_name, members in cluster_members.items():
        out_df[flag_name] = [int(label in members) for label in cluster_labels]
    return out_df.drop(columns = op_column)

In [8]:
def add_dummmies_to_df(in_df, col_to_code, coding_prefix, prefix_sep = "_"):
    ''' Dummy-code ``col_to_code`` and omit its least frequent level.

    Intended for plain categorical columns where a level has no meaning beyond
    category membership. The 'agree - disagree' columns may need separate
    handling.

    If the column has fewer than 3 distinct non-null values, a message is
    printed and ``in_df`` is returned as-is (same object, column not coded).
    Otherwise the indicator columns (named ``coding_prefix`` + ``prefix_sep`` +
    level) are joined on, minus the one with the smallest sum, and
    ``col_to_code`` is dropped from the returned frame.
    '''
    n_levels = in_df[col_to_code].nunique()
    if n_levels < 3:
        print(f"\tColumn {col_to_code} only has {n_levels} levels...")
        print(f"\tReturning unmodified df")
        return in_df

    indicators = pd.get_dummies(data = in_df[col_to_code], 
                                prefix = coding_prefix, 
                                prefix_sep = prefix_sep)
    # argmin gives the position of the first column with the smallest count.
    rarest_level = indicators.columns[indicators.sum().argmin()]
    print(f"\tRemoving level {rarest_level} from coding for {col_to_code}")
    kept_indicators = indicators.drop(columns = rarest_level)
    return in_df.join(kept_indicators, how = "inner").drop(columns = col_to_code)

## Preprocessing Steps

**Remove columns that are component ingredients for the overall utility scores:** Any column with `(This leaves 1 choice` can be dropped off, just because these are the component ingredients for the overall utilities.

In [9]:
# Start the regression frame from base_df minus the utility component columns.
reg_df = drop_utility_question_columns(base_df)

In [10]:
# Dimensions after removing the utility component columns.
reg_df.shape

(675, 96)

**Remove columns that you haven't figured out how to fully deal with yet**

In [11]:
# Drop each group of columns that is not handled yet. Rebinding reg_df
# (instead of dropping in place) keeps every step an explicit new frame.
deferred_column_groups = (
    QUALTRICS_DROP_COLS,
    SKIP_COLS,
    MULTISELECT_COLS,
    AGREE_DISAGREE_COLS,
    IMPORTANCE_COLS,
)
for deferred_cols in deferred_column_groups:
    reg_df = reg_df.drop(columns = deferred_cols)

In [12]:
# Dimensions after dropping the deferred column groups.
reg_df.shape

(675, 27)

**Use the functions above to create indicator columns for multinational organization, executive seniority, and gender identity of the respondent.**

In [13]:
# Each helper swaps one survey question for a 0/1 indicator column, so the
# column count is unchanged after these three steps.
reg_df = (
    reg_df
    .pipe(create_multinational_column)
    .pipe(create_executive_column)
    .pipe(create_male_column)
)

In [14]:
# Dimensions after swapping the three source questions for indicator columns.
reg_df.shape

(675, 27)

**Create dummy columns in your standardized way, dropping the least frequent response level of each question so that it serves as the reference category**

In [15]:
# STANDARD_DUMMY_COLS maps each column to code onto the prefix for its dummies.
for survey_question, dummy_prefix in STANDARD_DUMMY_COLS.items():
    reg_df = add_dummmies_to_df(reg_df, survey_question, dummy_prefix)

	Removing level empFunc_Design from coding for Which business function best describes your core responsibility?
	Removing level bizSize_Sole proprietor from coding for What best describes your company size?
	Removing level buildCapability_I’m responsible for training people on a new software from coding for What best describes your role in building capabilities within your organization?  Select the most relevant.
	Removing level bizStrategy_Other from coding for Which of the following statements best describes your company’s strategy today?
	Removing level budgetCategory_Other from coding for What type of budget might you use for a solution similar to that of the concept "Learn While Doing?" - Selected Choice
	Removing level purchaseRole_I’d approve the purchase for my entire organization from coding for What would best describe your involvement in the purchase process of a solution similar to that of the concept "Learn While Doing?"
	Removing level empAge_18-21 from coding for What is

In [16]:
# Dimensions after dummy coding.
reg_df.shape

(675, 80)

**Create custom columns for the clusters that Nathan and Isabella specified:** this uses and then gets rid of the `labels` column

In [17]:
# Adds the three cluster indicator columns and removes the "labels" column.
reg_df = add_cluster_columns(reg_df)

In [18]:
# Dimensions after adding the cluster indicator columns.
reg_df.shape

(675, 82)

**Finally, drop out people with a `Fit Statistic` of 0.39 or lower:** Apparently this is what Sawtooth recommends when looking at utilities. I also think that Alla may have already taken this step when creating this file, since the row count does not change.

In [21]:
# Keep only rows whose fit statistic is strictly above the cutoff, then remove
# the column. Filtering and dropping in one chain avoids calling an in-place
# drop on a filtered frame, which can raise SettingWithCopyWarning.
MIN_FIT_STATISTIC = 0.39
reg_df = (
    reg_df
    .loc[reg_df["Fit Statistic"] > MIN_FIT_STATISTIC]
    .drop(columns = "Fit Statistic")
)
reg_df.shape

(675, 81)

**OK,** so the only extraneous columns at this point are the value prop columns related to the target.

In [22]:
# List every remaining column name, one per line.
pprint(list(reg_df.columns))

["Equip your workforce with modern skills while solving your organization's "
 'top strategic challenges',
 'Build new capabilities while doing the work that matters to your team',
 "Hone your team's ability to continually experiment and build on the "
 'resulting knowledge',
 'Equip teams with the tools and mindset to solve problems independently',
 'Stay ahead of industry disruption by fostering an agile, resilient '
 'organizational culture',
 'Lead your organization through a digital transformation',
 'Cultivate shared processes and mindsets so your team can achieve better '
 'results',
 'Develop customer-centered skills to repeatedly build products, services, and '
 'experiences that appeal to your target audience',
 'Help employees develop customer-centered skills while working towards '
 'company strategy',
 'Scale new processes and methodologies across your organization with tools '
 'every team will find valuable',
 'is_multinational',
 'is_executive',
 'is_male',
 'empFunc_En

## Making a Target Variable

Do other aggregate scores exhibit enough variance to be an interesting regressor?

In [None]:
# Distribution of the row-wise minimum across the utility score columns.
# UTILITY_SCORE_COLS is the constant imported from settings (presumably a list
# of column names - confirm); the lowercase name is not defined in this notebook.
base_df[UTILITY_SCORE_COLS].min(axis = 1).describe()

In [None]:
# Distribution of the row-wise median across the utility score columns.
# UTILITY_SCORE_COLS is the constant imported from settings (presumably a list
# of column names - confirm); the lowercase name is not defined in this notebook.
base_df[UTILITY_SCORE_COLS].median(axis = 1).describe()

In [None]:
# Distribution of the row-wise maximum across the utility score columns.
# UTILITY_SCORE_COLS is the constant imported from settings (presumably a list
# of column names - confirm); the lowercase name is not defined in this notebook.
base_df[UTILITY_SCORE_COLS].max(axis = 1).describe()

In [None]:
# Distribution of the row-wise mean across the utility score columns.
# UTILITY_SCORE_COLS is the constant imported from settings (presumably a list
# of column names - confirm); the lowercase name is not defined in this notebook.
base_df[UTILITY_SCORE_COLS].mean(axis = 1).describe()

In [None]:
# For each row, record the name of the utility score column holding the largest
# value, then count how often each column wins.
# UTILITY_SCORE_COLS is the constant imported from settings (presumably a list
# of column names - confirm); the lowercase name is not defined in this notebook.
base_df["vp_max_utility"] = base_df[UTILITY_SCORE_COLS].idxmax(axis = 1)
base_df["vp_max_utility"].value_counts()

In [None]:
# Peek at the first five values of the "Unnamed: 0" column.
# NOTE(review): presumably an index column written out with the CSV - confirm,
# and consider reading the file with index_col=0 if so.
base_df["Unnamed: 0"].head(5)