# Imports

In [30]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from simple_salesforce import Salesforce
from sf_queries_class import SfQueries
import my_sf_secrets
import glob
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta
import calendar
import create_capbase
from reportforce import Reportforce
from lifelines import KaplanMeierFitter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Login name of the current user; interpolated into the per-user folder paths below.
username = os.getlogin()

# Folders under C:/Users/<username>/Q-Centrix, LLC/ that hold the planning inputs.
# Each path ends with a trailing slash so file names can be appended directly.
path_to_planning_teams_folder = f"C:/Users/{username}/Q-Centrix, LLC/Senior Manager Projects - Consolidated Planning Tool/" 
path_to_ops_teams_folder = f"C:/Users/{username}/Q-Centrix, LLC/Senior Manager Projects - Ops Plan/"
path_to_staffing_models_folder = f"C:/Users/{username}/Q-Centrix, LLC/WFM QCI Centralization - Staffing Models/"

# Salesforce credentials come from the local my_sf_secrets module and are passed
# to the project's SfQueries helper.
my_sf_username, my_sf_password, my_sf_security_token = my_sf_secrets.get_my_sf_secrets()
queries = SfQueries(
    username=my_sf_username,
    password=my_sf_password,
    security_token=my_sf_security_token
)

# Report client that reuses the session id and instance of the connection held by `queries`.
rf = Reportforce(session_id=queries.sf.session_id, instance_url=queries.sf.sf_instance)

In [2]:
# Switch for the win-rate source: True reads win_rate_modifier.csv, False runs the
# cells guarded by `if not win_rate_csv`.
win_rate_csv = False
# Today's date formatted like "January 5 2025"; the replace() drops the leading zero
# of the day. relativedelta(days=0) is a zero offset, i.e. it does not shift the date.
today = (datetime.today() - relativedelta(days=0)).strftime('%B %d %Y').replace(" 0", " ")

In [37]:
def last_day_of_month(date_input, year=None):
    """
    Return the last day of the month identified by ``date_input``.

    Parameters:
    date_input (str | int | datetime): One of
        - a date string in ``YYYY-MM-DD`` format,
        - a full or abbreviated month name as understood by ``strptime``'s
          ``%B`` / ``%b`` (e.g. "February", "Feb"),
        - a month number (``2`` or ``"2"``),
        - a ``datetime`` instance.
    year (int, optional): Year used when ``date_input`` is only a month name or
        month number. Defaults to the current year, resolved at call time
        (a default evaluated at definition time goes stale in a long-lived kernel).

    Returns:
    datetime: Midnight on the last day of the month.

    Raises:
    ValueError: If ``date_input`` cannot be interpreted, or is a month number
        outside 1-12.
    """
    if year is None:
        year = datetime.now().year

    if isinstance(date_input, datetime):
        parsed_date = date_input
    else:
        text = str(date_input)
        parsed_date = None
        # Try the supported textual formats in order: full date, full month
        # name, abbreviated month name.
        attempts = (
            (text, '%Y-%m-%d'),
            (f'{text} {year}', '%B %Y'),
            (f'{text} {year}', '%b %Y'),
        )
        for candidate, fmt in attempts:
            try:
                parsed_date = datetime.strptime(candidate, fmt)
            except ValueError:
                continue
            break

        if parsed_date is None:
            # Last resort: treat the input as a month number.
            try:
                month_number = int(text)
            except ValueError:
                raise ValueError(f"Unrecognized date or month: {date_input!r}") from None
            if not 1 <= month_number <= 12:
                raise ValueError(f"Invalid month number: {date_input}")
            parsed_date = datetime(year, month_number, 1)

    # monthrange returns (weekday of the 1st, number of days in the month).
    _, last_day = calendar.monthrange(parsed_date.year, parsed_date.month)
    return datetime(parsed_date.year, parsed_date.month, last_day)

# Rounding helper for the Target column, which is derived from negative Need values.
def custom_round(value):
    """
    Snap a value to a whole number or a half step, based on its fractional part.

    Rules, in order:
    - fractional part (after truncating toward zero) in [0.5, 1.0): return
      ``np.round(value)``, i.e. the nearest whole number;
    - otherwise, if ``value < -0.1``: return the truncated value minus 0.5;
    - otherwise: return 0.

    NOTE(review): ``np.round`` rounds exact halves to the nearest even number,
    so -2.5 gives -2.0 while -2.4 gives -2.5, and exact negative whole numbers
    such as -1.0 give -1.5. Confirm this is intended.

    Parameters:
    - value: float, the value to round

    Returns:
    - The rounded value (a numpy float, a Python float, or the int 0)
    """
    whole_part = int(value)
    fraction = abs(value - whole_part)

    if 0.5 <= fraction < 1.0:
        return np.round(value)
    if value < -0.1:
        return whole_part - 0.5
    return 0
    
def final_column_adj_fx(df):
    """
    Finalize the planning table: fill gaps, flip signs, derive Need and Target.

    Works on a copy of ``df``. Mutating the caller's frame would flip the signs
    back on a second call and can write into a slice of another frame.

    Steps:
    1. Fill NaN with 0 in the pipeline, attrition, demand and capacity columns.
    2. Negate 'Pipe Raw', 'Pipeline Adjusted', 'Demand_FTE', 'Attrit, [n]m' and
       '30-days Attrit, [n]m' ('Capacity_FTE' keeps its sign).
    3. Need = Demand_FTE + Capacity_FTE + Pipeline Adjusted + Attrit, [n]m
       (using the negated values).
    4. Target = -custom_round(Need), floored at 0.
    5. Round the listed numeric columns to 3 decimals; Target is not rounded.

    Parameters:
    df (pandas.DataFrame): Must contain 'Pipe Raw', 'Pipeline Adjusted',
        'Attrit, [n]m', '30-days Attrit, [n]m', 'Demand_FTE' and 'Capacity_FTE'.

    Returns:
    pandas.DataFrame: A new frame with the adjusted columns plus 'Need' and 'Target'.
    """
    df = df.copy()

    zero_filled_columns = ['Pipe Raw', 'Pipeline Adjusted', 'Attrit, [n]m',
                           '30-days Attrit, [n]m', 'Demand_FTE', 'Capacity_FTE']
    for column in zero_filled_columns:
        df[column] = df[column].fillna(0)

    negated_columns = ['Pipe Raw', 'Pipeline Adjusted', 'Demand_FTE',
                       'Attrit, [n]m', '30-days Attrit, [n]m']
    for column in negated_columns:
        df[column] = df[column] * -1

    df['Need'] = (df['Demand_FTE'] + df['Capacity_FTE']
                  + df['Pipeline Adjusted'] + df['Attrit, [n]m'])

    # A negative Need becomes a positive Target; a non-negative Need yields 0.
    target = df['Need'].apply(custom_round) * -1
    df['Target'] = np.where(target < 0, 0, target)

    # round all columns to 3 decimal places except the Target column
    df = df.round({'Pipe Raw': 3, 'Win Rate': 3, 'Pipeline Adjusted': 3, 'Need': 3,
                   'Attrit, [n]m': 3, '30-days Attrit, [n]m': 3, 'age_closure_prob': 3,
                   'Predicted Win Probability': 3})
    return df
def prod_with_nan_as_zero(x):
    """Return the product of the elements of ``x``, treating every NaN as 0.

    Any NaN in ``x`` therefore makes the whole product 0.
    """
    nan_mask = np.isnan(x)
    zero_filled = np.where(nan_mask, 0, x)
    return np.prod(zero_filled)

def create_monthly_snapshots(opportunities_df, start_date, end_date):
    """
    Create monthly snapshots of contracted and implemented hours by category.

    For every month start between ``start_date`` and ``end_date``:
    - contracted hours: sum of 'Contracted Weekly Hours' per 'Category' over rows
      whose 'Opportunity Start Date' is before the first day of the month and
      whose 'Close Date' is on or after the last day of the month;
    - implemented hours: sum of 'Implemented Weekly Hours' per 'Category' over
      rows in stage 'Contract Signed (Implementing)' or 'Customer Invoiced'
      whose 'Close Date' falls within the month.

    Parameters:
    opportunities_df: DataFrame with columns ['Category', 'Opportunity Start Date',
                     'Close Date', 'Contracted Weekly Hours',
                     'Implemented Weekly Hours', 'Stage']
    start_date: start of the analysis period (anything pd.date_range accepts)
    end_date: end of the analysis period (anything pd.date_range accepts)

    Returns:
    DataFrame with columns ['Month', 'Category', 'Contracted Weekly Hours',
    'Implemented Weekly Hours'], where 'Month' is a 'YYYY-MM' string. Empty
    (with these columns) when the period contains no month start.
    """
    output_columns = ['Month', 'Category', 'Contracted Weekly Hours', 'Implemented Weekly Hours']

    # One entry per month start in the analysis period
    months = pd.date_range(start=start_date, end=end_date, freq='MS')
    if len(months) == 0:
        # pd.concat raises on an empty list, so return an empty, correctly shaped frame
        return pd.DataFrame(columns=output_columns)

    # These do not depend on the month, so compute them once
    start_dates = opportunities_df['Opportunity Start Date']
    close_dates = opportunities_df['Close Date']
    in_implemented_stage = (
        (opportunities_df['Stage'] == 'Contract Signed (Implementing)')
        | (opportunities_df['Stage'] == 'Customer Invoiced')
    )

    snapshots = []
    for month_start in months:
        # MonthEnd(0) rolls forward to the last day of the same month
        month_end = month_start + pd.offsets.MonthEnd(0)

        # Opportunities that span the whole month
        spans_month = (start_dates < month_start) & (close_dates >= month_end)
        contracted_hours = (
            opportunities_df[spans_month]
            .groupby('Category')['Contracted Weekly Hours']
            .sum()
            .reset_index()
        )

        # Implemented/invoiced opportunities that closed during the month
        closed_in_month = (close_dates >= month_start) & (close_dates <= month_end)
        implemented_hours = (
            opportunities_df[in_implemented_stage & closed_in_month]
            .groupby('Category')['Implemented Weekly Hours']
            .sum()
            .reset_index()
        )

        # Outer merge keeps categories present on only one side; missing hours become 0
        month_snapshot = pd.merge(
            contracted_hours,
            implemented_hours,
            on='Category',
            how='outer'
        ).fillna(0)
        month_snapshot['Month'] = month_start.strftime('%Y-%m')

        snapshots.append(month_snapshot)

    final_df = pd.concat(snapshots, ignore_index=True)
    return final_df[output_columns]

# Current Pipeline

In [4]:
# Fetch the opportunity report only when `ofph` is not already defined in this
# session, so re-running the cell does not download it again.
if 'ofph' not in globals():
    print("You need OFPH")
    ofph = rf.get_report("00Oan0000038NlhEAE", id_column='Opportunity ID') #'Opp+Cat'
# Row count of the report (echoed as the cell output).
len(ofph)

You need OFPH


14451

In [5]:
# Whole days between now and each opportunity's Close Date.
ofph.loc[:, 'Days to Close'] = (ofph['Close Date'] - datetime.now()).dt.days

# Pipeline filter: the flag columns hold the strings 'true'/'false'.
pipeline_mask = (
    (ofph['Services ACV above $250k'] == 'false')
    & (ofph['Probability (%)'] >= 5)
    & (ofph['Probability (%)'] < 90)
    & ~ofph['Stage'].isin(['Closed Lost', 'Contract Signed (Implementing)'])
    & (ofph['Days to Close'] <= 90)
    & (ofph['Opportunity Owner'] != 'Michelle Galvan')
    & (ofph['Exclude from Resource Requests'] == 'false')
)
ofph_fil_1 = ofph[pipeline_mask].copy().reset_index()

# Technology opportunities are left out of the pipeline.
not_technology = ofph_fil_1['Category'] != 'Technology'
ofph_fil = ofph_fil_1[not_technology].copy()

In [6]:
# Print the column names, non-null counts and dtypes of the filtered pipeline.
ofph_fil.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1243 entries, 0 to 1331
Data columns (total 27 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   Opportunity Name                       1243 non-null   object        
 1   Category                               1243 non-null   object        
 2   Opportunity Owner                      1243 non-null   object        
 3   Type                                   1243 non-null   object        
 4   Additional Information for PM Handoff  1243 non-null   object        
 5   Probability (%)                        1243 non-null   int64         
 6   Case Type                              1243 non-null   object        
 7   Contracted Weekly Hours                1243 non-null   float64       
 8   Monthly Quantity                       1243 non-null   float64       
 9   Backlog                                1243 non-null   object       

In [7]:
# Split the 'Cancer' category in two: rows whose Case Type mentions 'Follow up'
# or 'Case Finding' become 'Cancer - CDSS', every other Cancer row becomes
# 'Cancer - Abstractor'. All other categories are left unchanged.
is_cancer = ofph_fil['Category'] == 'Cancer'
is_cdss_case = (
    ofph_fil['Case Type'].str.contains('Follow up')
    | ofph_fil['Case Type'].str.contains('Case Finding')
)
ofph_fil.loc[:, 'Category'] = np.where(
    is_cancer,
    np.where(is_cdss_case, 'Cancer - CDSS', 'Cancer - Abstractor'),
    ofph_fil['Category'],
)

# Contracted weekly hours expressed as FTE (40 hours per FTE).
ofph_fil.loc[:, 'FTE'] = ofph_fil['Contracted Weekly Hours'] / 40

# Total pipeline FTE per category, renamed to match the supply/demand tables.
ofph_gr = (
    ofph_fil.groupby('Category')['FTE']
    .sum()
    .reset_index()
    .rename(columns={'Category': 'Product Category', 'FTE': 'Pipe Raw'})
)

# Supply Demand Summary Pull

In [8]:
# Baseline product categories ('Other' is listed twice; the set below removes the duplicate).
cats = [
    'ACD CP', 'ACD HF', 'Advisory - Spark', 'Advisory - Validation', 'AFib Ablation', 'AJRR',
    'Burn-ABA', 'Cancer - Abstractor', 'Cancer - CDSS', 'Chest Pain MI', 'CM', 'CMQCC',
    'Concurrent', 'CPQCC', 'ELSO Registry', 'Other', 'GIQuIC', 'GWTG AFIB',
    'GWTG Cardiogenic Shock', 'GWTG HF', 'GWTG Resuscitation', 'GWTG Stroke', 'GWTG-CAD', 'ICD',
    'Infection Prevention', 'Intermacs', 'LAAO', 'MBSAQIP', 'MH', 'Mortality Review', 'NSQIP',
    'NYS Sepsis', 'PCI', 'Registry', 'Registry Capture', 'Other', 'STEMI', 'STS-ACS', 'STS-CHS',
    'STS-GTS', 'Trauma', 'TVT', 'VON', 'VQI',
]
# Union of the baseline list and the categories seen in the pipeline, minus the
# unsplit 'Cancer' label, in alphabetical order.
pipeline_cats = set(ofph_fil.Category.unique())
cats_set = sorted((set(cats) | pipeline_cats) - {'Cancer'})
all_product_categories_sel = pd.DataFrame({'Product Category': cats_set})

In [9]:
# Find every supply/demand workbook in the planning folder and use the one with
# the latest creation time.
all_files = glob.glob(f"{path_to_planning_teams_folder}Supply Demand Summary/supply_demand_analysis_*.xlsx")
if not all_files:
    # max() on an empty list raises an opaque ValueError; name the missing input instead
    raise FileNotFoundError(
        "No supply_demand_analysis_*.xlsx files found in "
        f"{path_to_planning_teams_folder}Supply Demand Summary/"
    )
supply_demand_analysis_path = max(all_files, key=os.path.getctime)
supply_demand_analysis_file = pd.read_excel(supply_demand_analysis_path, sheet_name="Summary", skiprows=1)
# remove row 0 and rename the `Unnamed: 0` column to `Product Category`
supply_demand_analysis_file = supply_demand_analysis_file.iloc[1:].rename(columns={"Unnamed: 0": "Product Category"})
supply_demand = supply_demand_analysis_file.loc[:, ["Product Category", "Requested FTE", "FTE capacity"]]
# # if the category contains "cancer" then replace with "Cancer"
# supply_demand["Product Category"] = supply_demand["Product Category"].apply(lambda x: "Cancer" if "Cancer" in x else x)
# # combine the 2 rows in supply_demand_analysis_file that contain "Cancer"
# supply_demand = supply_demand.groupby("Product Category").sum().reset_index()
# left-join onto all_product_categories_sel so every known category has a row
supply_demand_all_products = pd.merge(all_product_categories_sel, supply_demand, on="Product Category", 
                                      how="left")
# rename the Requested FTE and FTE capacity columns to Demand_FTE and Capacity_FTE
supply_demand_all_products = supply_demand_all_products.rename(columns={"Requested FTE": "Demand_FTE", 
                                                                        "FTE capacity": "Capacity_FTE"})
# left-join the raw pipeline FTE per category (ofph_gr)
supply_demand_all_products_pipe_raw = pd.merge(supply_demand_all_products, ofph_gr, on="Product Category", how="left")
supply_demand_all_products_pipe_raw.head()

Unnamed: 0,Product Category,Demand_FTE,Capacity_FTE,Pipe Raw
0,ACD CP,0.3,,
1,ACD HF,0.03,0.28,
2,AFib Ablation,,,0.241725
3,AJRR,,0.02,0.081375
4,Advisory - Spark,,,0.0


# Win Rate Calculation

## Win Rate CSV

In [10]:
# CSV path only: load the per-category win-rate modifiers from the working directory.
if win_rate_csv:
    win_rate_modifier = pd.read_csv('win_rate_modifier.csv')
    # NOTE(review): a bare expression nested inside an `if` is not echoed as cell
    # output, so this line likely displays nothing - confirm and use display()/print if needed.
    win_rate_modifier

In [11]:
# CSV path only: attach the 'Pipeline Modifier' from the CSV as 'Win Rate'.
if win_rate_csv:
    supply_demand_all_products_pipe_raw_win_merged = supply_demand_all_products_pipe_raw.merge(win_rate_modifier, on='Product Category', how='left')\
        .rename(columns={'Pipeline Modifier': 'Win Rate'})
    # categories without a modifier in the CSV fall back to a win rate of 0.1
    supply_demand_all_products_pipe_raw_win_merged['Win Rate'] = supply_demand_all_products_pipe_raw_win_merged['Win Rate'].fillna(0.1)

## Win Rate Calculated

In [12]:
if not win_rate_csv:
    # Opportunities used to compare contracted vs implemented hours: drop the
    # excluded owner, keep the three listed opportunity types, and honor the
    # 'Exclude from Resource Requests' flag (stored as the string 'false').
    ofph_monthly = ofph[(ofph['Opportunity Owner'] != 'Michelle Galvan') &
                        (ofph['Type'].isin(['New Client', 'Existing Client - New Service Line', 'Existing Client - Service Line Expansion'])) &
                        (ofph['Exclude from Resource Requests'] == 'false')].copy()
    ofph_monthly_idx_reset = ofph_monthly.reset_index()
    # filter out any Technology category; copy so the assignment below writes to
    # an independent frame rather than a slice
    ofph_monthly_idx_reset = ofph_monthly_idx_reset[~ofph_monthly_idx_reset['Category'].str.contains('Technology')].copy()
    # split 'Cancer' into 'Cancer - CDSS' (Follow up / Case Finding case types)
    # and 'Cancer - Abstractor' (everything else)
    ofph_monthly_idx_reset.loc[:, 'Category'] = np.where((ofph_monthly_idx_reset['Category'] == 'Cancer') & 
                                                ((ofph_monthly_idx_reset['Case Type'].str.contains('Follow up')) |
                    (ofph_monthly_idx_reset['Case Type'].str.contains('Case Finding'))), 'Cancer - CDSS', 
                    np.where((ofph_monthly_idx_reset['Category'] == 'Cancer') & 
                                                (~(ofph_monthly_idx_reset['Case Type'].str.contains('Follow up')) &
                    ~(ofph_monthly_idx_reset['Case Type'].str.contains('Case Finding'))), 'Cancer - Abstractor', ofph_monthly_idx_reset['Category']))
    # per (Opportunity ID, Category): take the first start date, close date and stage
    abhc = ofph_monthly_idx_reset.groupby(['Opportunity ID', 'Category']).first()[['Opportunity Start Date', 'Close Date', 'Stage']]
    # per (Opportunity ID, Category): total contracted and implemented weekly hours
    rest = ofph_monthly_idx_reset.groupby(['Opportunity ID', 'Category'])[['Contracted Weekly Hours', 'Implemented Weekly Hours']].sum()
    # merge the 2 back together
    ofph_monthly_idx_reset_merged = pd.merge(abhc, rest, left_index=True, right_index=True)
    ofph_monthly_ready = ofph_monthly_idx_reset_merged.reset_index()
    ofph_snapshots = create_monthly_snapshots(ofph_monthly_ready, '8/1/2022', datetime.now())
    ofph_snapshots.loc[:, 'perc_realized'] = ofph_snapshots['Implemented Weekly Hours'] / ofph_snapshots['Contracted Weekly Hours'] 
    # 'Month' holds 'YYYY-MM' strings (a Series has no .year attribute), so select
    # the current year by its string prefix
    is_current_year = ofph_snapshots['Month'].str.startswith(f"{datetime.now().year}-")
    cat = ofph_snapshots.loc[is_current_year, ['Category', 'Contracted Weekly Hours', 'Implemented Weekly Hours']].groupby('Category').mean()
    # share of contracted hours that was implemented, from the yearly monthly means
    cat.loc[:, 'perc_realized'] = cat['Implemented Weekly Hours'] / cat['Contracted Weekly Hours']

6684


## Implemented Vs Contracted Hours

In [42]:
# Calculated path only: compare contracted and implemented weekly hours per category.
# NOTE(review): `ofph_fil_only_closed` and `cats_w_winning_probabilities_grouped_cat`
# are not defined in the visible part of this notebook - confirm which cell creates them.
if not win_rate_csv:
    con_imp_1 = ofph_fil_only_closed.reset_index()
    # split 'Cancer' into 'Cancer - CDSS' (Follow up / Case Finding case types)
    # and 'Cancer - Abstractor' (everything else)
    con_imp_1.loc[:, 'Category'] = np.where((con_imp_1['Category'] == 'Cancer') & 
                                                ((con_imp_1['Case Type'].str.contains('Follow up')) |
                    (con_imp_1['Case Type'].str.contains('Case Finding'))), 'Cancer - CDSS', 
                    np.where((con_imp_1['Category'] == 'Cancer') & 
                                                (~(con_imp_1['Case Type'].str.contains('Follow up')) &
                    ~(con_imp_1['Case Type'].str.contains('Case Finding'))), 'Cancer - Abstractor', con_imp_1['Category']))
    
    
    # total hours per category over rows with a probability above 0
    con_imp = con_imp_1.loc[(con_imp_1['Probability (%)'] > 0), 
                                        ['Category', 'Contracted Weekly Hours', 'Implemented Weekly Hours']]\
        .groupby('Category').sum()[['Contracted Weekly Hours', 'Implemented Weekly Hours']]
    # diff: contracted minus implemented hours; diff_perc: that gap as a fraction
    # of contracted hours (NaN where contracted hours are 0, as in the printed output)
    con_imp.loc[:, 'diff'] = con_imp['Contracted Weekly Hours'] - con_imp['Implemented Weekly Hours']
    con_imp.loc[:, 'diff_perc'] = round(con_imp['diff'] / con_imp['Contracted Weekly Hours'], 4)
    con_imp.loc[:, 'diff'] = round(con_imp.loc[:, 'diff'], 4)
    # left-merge onto the cats_w_winning_probabilities_grouped_cat dataframe on 'Category'
    win_probs_hours_conversion = cats_w_winning_probabilities_grouped_cat.merge(con_imp, on='Category', how='left')
    print(win_probs_hours_conversion.head())

                  Category  Predicted Win Probability  \
0            AFib Ablation                   0.218357   
1                     AJRR                   0.003175   
2         Advisory - Spark                   0.130506   
3    Advisory - Validation                   0.095330   
4  American Spine Registry                   0.004267   

   Contracted Weekly Hours  Implemented Weekly Hours   diff  diff_perc  
0                      NaN                       NaN    NaN        NaN  
1                      NaN                       NaN    NaN        NaN  
2                    0.000                      0.00  0.000        NaN  
3                    2.325                      2.35 -0.025    -0.0108  
4                    6.977                      6.99 -0.013    -0.0019  


## Closing Age

In [59]:
# Calculated path only: build one row per (Opportunity ID, Category) with the
# opportunity's age and an `event` flag, as input for the closing-age model.
if not win_rate_csv:
  # columns kept for the closing-age analysis
  important_features_closing_age = ['Opportunity ID', ## String
                        'Age', # Int
                        'Type', # String
                        'Category', # String
                        'Contracted Weekly Hours', # Float
                        'Backlog', # Bool
                        'Contracted Opportunity ACV', # Float
                        'Hospital Count', # Int
                        'Close Date',
                        'Stage'
                      #   'Implemented Weekly Hours', # Float choosing Contracted for classification
                        ]
  # categories included in the closing-age analysis; all others are dropped below
  cats_gr25 = ['CM',
  'GWTG Stroke',
  'Cancer - CDSS',
  'Cancer - Abstractor',
  'PCI',
  'Infection Prevention',
  'Trauma',
  'TVT',
  'STS-ACS',
  'LAAO',
  'VQI',
  'Chest Pain MI',
  'GWTG HF',
  'MBSAQIP',
  'GWTG-CAD',
  'NSQIP',
  'ICD',
  'STS-GTS']
  # drop the excluded owner, keep the three listed opportunity types, and honor
  # the 'Exclude from Resource Requests' flag (stored as the string 'false')
  ofph_fil_closing_age = ofph[(ofph['Opportunity Owner'] != 'Michelle Galvan') &
                  (ofph['Type'].isin(['New Client', 'Existing Client - New Service Line', 'Existing Client - Service Line Expansion'])) &
                  (ofph['Exclude from Resource Requests'] == 'false')].copy()
  ofph_index_reset = ofph_fil_closing_age.reset_index()
  # split 'Cancer' into 'Cancer - CDSS' (Follow up / Case Finding case types)
  # and 'Cancer - Abstractor' (everything else)
  ofph_index_reset.loc[:, 'Category'] = np.where((ofph_index_reset['Category'] == 'Cancer') & 
                                                ((ofph_index_reset['Case Type'].str.contains('Follow up')) |
                    (ofph_index_reset['Case Type'].str.contains('Case Finding'))), 'Cancer - CDSS', 
                    np.where((ofph_index_reset['Category'] == 'Cancer') & 
                                                (~(ofph_index_reset['Case Type'].str.contains('Follow up')) &
                    ~(ofph_index_reset['Case Type'].str.contains('Case Finding'))), 'Cancer - Abstractor', ofph_index_reset['Category']))
  ofph_index_reset_select_cats = ofph_index_reset[ofph_index_reset['Category'].isin(cats_gr25)].copy()
  # convert the Backlog string column to 1/0 integers ('true' -> 1, 'false' -> 0)
  ofph_index_reset_select_cats['Backlog'] = ofph_index_reset_select_cats['Backlog'].str.replace('true', '1').str.replace('false', '0').astype(int)
  ofph_index_reset_sel = ofph_index_reset_select_cats[important_features_closing_age]

  # per (Opportunity ID, Category): take the first value of the opportunity-level columns
  abhc = ofph_index_reset_sel.groupby(['Opportunity ID', 'Category']).first()[['Age', 'Stage', 'Backlog', 'Hospital Count', 'Contracted Opportunity ACV', 'Close Date']]
  # per (Opportunity ID, Category): sum the remaining columns
  # NOTE(review): the remaining columns are 'Type' and 'Contracted Weekly Hours';
  # summing the string column 'Type' presumably concatenates it for groups with
  # several rows - confirm whether 'Type' should be taken with first() instead.
  rest = ofph_index_reset_sel.drop(['Age', 'Stage', 'Backlog', 'Hospital Count', 
                                    'Contracted Opportunity ACV', 'Close Date'], axis=1).groupby(['Opportunity ID', 'Category']).sum()
  # merge the 2 back together
  ofph_index_reset_sel_merged = pd.merge(abhc, rest, left_index=True, right_index=True)
  # event = 1 when the stage is 'Closed Lost' or 'Contract Signed (Implementing)', else 0
  # NOTE(review): 'Customer Invoiced' rows get event = 0 here, although
  # create_monthly_snapshots counts that stage as implemented - confirm this is intended.
  ofph_index_reset_sel_merged.loc[:, 'event'] = ((ofph_index_reset_sel_merged['Stage'] == 'Closed Lost') |
                                                (ofph_index_reset_sel_merged['Stage'] == 'Contract Signed (Implementing)')).astype(int)
  merged_ready = ofph_index_reset_sel_merged.reset_index()
  print(merged_ready.head())

    Opportunity ID              Category  Age              Stage  Backlog  \
0  0065b00000xQes9  Infection Prevention  126        Closed Lost        0   
1  0065b00000xQqCj           GWTG Stroke  132  Customer Invoiced        0   
2  0065b00000xQuCg                   VQI   62        Closed Lost        1   
3  0065b00000xRmkG                Trauma  217        Closed Lost        0   
4  0065b00000xRpLn                   VQI   37  Customer Invoiced        0   

   Hospital Count  Contracted Opportunity ACV Close Date  \
0             1.0                    82500.00 2022-12-06   
1             1.0                    42906.65 2022-12-12   
2             1.0                    18126.90 2022-10-03   
3             1.0                    44956.56 2023-03-08   
4             1.0                     6932.19 2022-09-09   

                                                Type  Contracted Weekly Hours  \
0                 Existing Client - New Service Line                    0.004   
1  Existing Cl

In [61]:
# Calculated path only: estimate, per category, the probability that an open
# opportunity of a given age has closed, using a Kaplan-Meier curve.
if not win_rate_csv:
    kmf = KaplanMeierFitter()
    # Inputs taken from merged_ready:
    # 'Age': used as the duration
    # 'event': 1 if the stage counts as closed, 0 otherwise

    # Fit KaplanMeierFitter on closed opportunities
    # NOTE(review): only event == 1 rows are passed to fit(), so every observation
    # is an event and the still-open rows are not used - confirm this is intended.
    closed_opps = merged_ready[merged_ready['event'] == 1].copy()
    kmf.fit(durations=closed_opps['Age'], event_observed=closed_opps['event'])

    # Predict for all opportunities
    all_times = merged_ready['Age']
    survival_probabilities = kmf.predict(all_times)

    # Convert survival probabilities to closure probabilities
    closure_probabilities = 1 - survival_probabilities
    closure_probabilities = closure_probabilities.reset_index().rename(columns={'index': 'Duration',
                                                                                'KM_estimate': 'age_closure_prob'})


    # Add probabilities to the original DataFrame
    # NOTE(review): this joins on the row index; it presumably relies on both frames
    # having the same default 0..n-1 index in the same row order - confirm.
    merged_ready_w_KM_estimate = pd.merge(merged_ready, closure_probabilities, left_index=True, right_index=True)

    # Mean closure probability per category over the open (event == 0) opportunities
    open_opps = merged_ready_w_KM_estimate[merged_ready_w_KM_estimate['event'] == 0].copy()
    open_opps_with_probs = open_opps[['Category', 'age_closure_prob']].groupby('Category').mean().reset_index()

    print(open_opps_with_probs.head())

              Category  age_closure_prob
0                   CM          0.300527
1  Cancer - Abstractor          0.431518
2        Cancer - CDSS          0.411997
3        Chest Pain MI          0.271109
4              GWTG HF          0.339901


In [62]:
# Calculated path only: add the per-category age_closure_prob to the win-probability table.
if not win_rate_csv:
    # left-merge keeps every row of win_probs_hours_conversion; categories without
    # an age_closure_prob get NaN
    win_probs_hours_conversion_closing_age = win_probs_hours_conversion.merge(open_opps_with_probs, on='Category', how='left')
    
    print(win_probs_hours_conversion_closing_age.head())


                  Category  Predicted Win Probability  \
0            AFib Ablation                   0.218357   
1                     AJRR                   0.003175   
2         Advisory - Spark                   0.130506   
3    Advisory - Validation                   0.095330   
4  American Spine Registry                   0.004267   

   Contracted Weekly Hours  Implemented Weekly Hours   diff  diff_perc  \
0                      NaN                       NaN    NaN        NaN   
1                      NaN                       NaN    NaN        NaN   
2                    0.000                      0.00  0.000        NaN   
3                    2.325                      2.35 -0.025    -0.0108   
4                    6.977                      6.99 -0.013    -0.0019   

   age_closure_prob  
0               NaN  
1               NaN  
2               NaN  
3               NaN  
4               NaN  


In [63]:
if not win_rate_csv:
    # Bring the per-category probabilities back onto the supply/demand table
    supply_demand_all_products_pipe_raw_win_merged = (
        supply_demand_all_products_pipe_raw
        .merge(win_probs_hours_conversion_closing_age,
               left_on='Product Category', right_on='Category', how='left')
        .drop(columns=['Category'])
    )
    # Win Rate is the product of whichever of the two probabilities is present:
    # a single missing factor is skipped, and min_count=1 leaves the result null
    # when both are missing. Null win rates then default to 0.1.
    win_rate_factors = supply_demand_all_products_pipe_raw_win_merged[
        ['age_closure_prob', 'Predicted Win Probability']
    ]
    supply_demand_all_products_pipe_raw_win_merged['Win Rate'] = (
        win_rate_factors.prod(axis=1, min_count=1).fillna(0.1)
    )

# Adjusting Pipeline

In [64]:
# Pipeline Adjusted
# CSV path: Pipeline Adjusted = Pipe Raw * Win Rate.
if win_rate_csv:
    supply_demand_all_products_pipe_raw_win_merged.loc[:, "Pipeline Adjusted"] = supply_demand_all_products_pipe_raw_win_merged['Pipe Raw'] * \
    supply_demand_all_products_pipe_raw_win_merged['Win Rate']
    print(supply_demand_all_products_pipe_raw_win_merged.head())
else:
    # this calculates the new Pipeline adjusted column, 
    # it multiplies the Pipe raw by the win rate then since Pipe Raw is negative it subtracts the amount that we can 
    # expect when you multiply the Pipe Raw by the average percent difference between 
    # the historic Contracted Weekly hours and the actual implemented weekly hours
    # NOTE(review): in the cells visible here 'Pipe Raw' is a sum of FTE and is only
    # negated inside final_column_adj_fx, so it looks non-negative at this point and
    # the second term would add to, not subtract from, the pipeline - confirm the sign.
    # The second term is diff_perc * Pipe Raw per row, or 0 when either value is NaN.
    supply_demand_all_products_pipe_raw_win_merged.loc[:, "Pipeline Adjusted"] = (supply_demand_all_products_pipe_raw_win_merged['Pipe Raw'] * \
    supply_demand_all_products_pipe_raw_win_merged['Win Rate']) + \
        supply_demand_all_products_pipe_raw_win_merged[['diff_perc', 'Pipe Raw']].apply(prod_with_nan_as_zero, axis=1)
    print(supply_demand_all_products_pipe_raw_win_merged.head())

   Product Category  Demand_FTE  Capacity_FTE  Pipe Raw  \
0            ACD CP        0.30           NaN       NaN   
1            ACD HF        0.03          0.28       NaN   
2     AFib Ablation         NaN           NaN  0.241725   
3              AJRR         NaN          0.02  0.081375   
4  Advisory - Spark         NaN           NaN  0.000000   

   Predicted Win Probability  Contracted Weekly Hours  \
0                        NaN                      NaN   
1                        NaN                      NaN   
2                   0.218357                      NaN   
3                   0.003175                      NaN   
4                   0.130506                      0.0   

   Implemented Weekly Hours  diff  diff_perc  age_closure_prob  Win Rate  \
0                       NaN   NaN        NaN               NaN  0.100000   
1                       NaN   NaN        NaN               NaN  0.100000   
2                       NaN   NaN        NaN               NaN  0.218357  

# Rolling Attrition

In [31]:
# Upper bounds for the two attrition windows: today, and the last day of the
# previous month (today minus the current day-of-month).
rolling_last_day = datetime.now().date()
monthly_last_day = datetime.now().date() - timedelta(days=datetime.now().day)
# Rolling window: inactive contacts whose Last_Date__c falls between three months
# ago and today, excluding those on a "Plan to Separate" action plan.
# NOTE(review): the condition keys end in ">" and "<"; if create_query emits strict
# comparisons, both boundary dates are excluded - confirm against SfQueries.
ab_rolling = queries.convert_salesforce_data_to_df(queries.sf.query(queries.create_query(
    columns="Name,External_Employee_ID__c,Id,Last_Date__c,Full_Time_Status__c,Resource_Role__c,Group_Name__c,Week_Hours__c,Business_Leader__r.Name",
    table="Contact", 
    conditions={"AccountId": "001j000000i0d9tAAA",
                "Last_Date__c >": (datetime.now() - relativedelta(months=3)).date(), 
                "Last_Date__c <": rolling_last_day,
                "QC_Active__c": False,
                "PT_Action_Plan__c !": "Plan to Separate"})))#['records']).drop(['attributes'],axis=1)
# Calendar-month window: same filters, but bounded by the first day of the month
# three months back (last day of the month four months back, plus one day) and
# the last day of the previous month.
# NOTE(review): with strict comparisons the first day of the window and the last
# day of the previous month would both be left out - confirm.
ab_monthly = queries.convert_salesforce_data_to_df(queries.sf.query(queries.create_query(
    columns="Name,External_Employee_ID__c,Id,Last_Date__c,Full_Time_Status__c,Resource_Role__c,Group_Name__c,Week_Hours__c,Business_Leader__r.Name",
    table="Contact", 
    conditions={"AccountId": "001j000000i0d9tAAA",
                "Last_Date__c >": (last_day_of_month((datetime.now() - relativedelta(months=4)).strftime("%Y-%m-%d")) + relativedelta(days=1)).date(), 
                "Last_Date__c <": monthly_last_day,
                "QC_Active__c": False,
                "PT_Action_Plan__c !": "Plan to Separate"})))

In [32]:
def _sum_hours_lost_by_category(frames, column_name, months):
    """Total ``Hours_Lost`` per category across *frames* and convert it to FTE per month.

    Parameters:
    frames (list[pd.DataFrame]): Per-abstractor frames indexed by category, each with a
        ``Hours_Lost`` column.
    column_name (str): Name given to the single output column.
    months (int): Number of months the hours are spread over.

    Returns:
    pd.DataFrame: One row per category with ``column_name`` = summed hours / 40 / months.
        An empty frame with that column is returned when *frames* is empty.
    """
    if not frames:
        # Keep an object index so a later merge against a string key column still works.
        return pd.DataFrame(columns=[column_name], index=pd.Index([], dtype=object), dtype=float)
    combined = pd.concat(frames)
    totals = combined.groupby(combined.index).sum()[['Hours_Lost']]
    totals = totals.rename(columns={"Hours_Lost": column_name})
    # divide by 40 for conversion to FTE, then by the number of months covered
    totals[column_name] = totals[column_name] / 40 / months
    return totals


def attrition_categories_fx(abstractors, last_day_of_month):
    """
    Spreads each departed abstractor's weekly hours across product categories.

    For every abstractor in one of the three clinical data roles, the weekly hours are
    split across the categories of their assignments (restricted to categories in which
    they hold a product skill rating matching 0, 1, 2 or 6), in proportion to planned hours.
    When the assignment lookup fails, the full weekly hours are attributed to each of the
    abstractor's skill categories instead.

    Parameters:
    abstractors (pd.DataFrame): Contact rows with at least Name, Id, Last_Date__c
        ('%Y-%m-%d' strings), Full_Time_Status__c, Resource_Role__c, Group_Name__c,
        Week_Hours__c and Business_Leader__r.Name.
    last_day_of_month (datetime.date): End of the reporting window. Abstractors whose
        Last_Date__c is on or after one month before this date also count towards the
        30-day result. The name shadows the module-level ``last_day_of_month`` function
        inside this body; it is kept unchanged for caller compatibility.

    Returns:
    tuple[pd.DataFrame, pd.DataFrame]: Two frames indexed by category:
        "Attrit, [n]m" (hours lost / 40 / 3) and "30-days Attrit, [n]m" (hours lost / 40).
        Either frame is empty when no abstractor contributed to it.
    """
    support_role = 'Clinical Data Support Specialist'
    abstractor_roles = ['Senior Clinical Data Specialist', support_role, 'Clinical Data Specialist']
    # .copy() so the column writes below act on an independent frame, not a slice of the input
    abstractors = abstractors[abstractors['Resource_Role__c'].isin(abstractor_roles)].copy()
    # convert Week_Hours__c to int64
    abstractors.loc[:, 'Week_Hours__c'] = abstractors['Week_Hours__c'].astype('Int64')
    # part-time abstractors under Paul Gasque outside the "Infection Prevention" group
    # have their 'Week_Hours__c' divided by 2
    halve_hours = ((abstractors['Full_Time_Status__c'] == 'PT') &
                   (abstractors['Business_Leader__r.Name'] == 'Paul Gasque') &
                   (abstractors['Group_Name__c'] != 'Infection Prevention'))
    abstractors.loc[halve_hours, 'Week_Hours__c'] = abstractors['Week_Hours__c'] / 2

    skills_report = rf.get_report("00Oan0000033WclEAE", id_column='row_id')
    skills_of_interest = skills_report[skills_report['Contact ID (Case Safe)'].isin(abstractors['Id'].unique())]
    valid_skills = skills_of_interest[skills_of_interest['Product Skill Rating'].str.contains(r"0|1|2|6")]
    skill_pairs = valid_skills[['Contact ID (Case Safe)', 'Related Product Skill Category']].drop_duplicates()
    # Map each contact id to its list of skill categories. Keeping real lists avoids the
    # join/split round-trip on "," that would break category names containing commas.
    skills_by_contact = {}
    for contact, category in skill_pairs.itertuples(index=False, name=None):
        skills_by_contact.setdefault(contact, []).append(category)

    # The 30-day cutoff does not depend on the row, so compute it once.
    recent_cutoff = last_day_of_month - relativedelta(months=1)
    all_frames = []
    recent_frames = []
    fields = ["Group_Name__c", "Name", "Last_Date__c", "Id", "Resource_Role__c", "Week_Hours__c"]
    for group_name, name, last_day_raw, contact_id, role, hours in abstractors[fields].itertuples(index=False, name=None):
        last_day = datetime.strptime(last_day_raw, '%Y-%m-%d')
        cancer_label = 'Cancer - CDSS' if role == support_role else 'Cancer - Abstractor'

        skill_categories = skills_by_contact.get(contact_id)
        if skill_categories is None:
            # There is no category to attribute the hours to. Previously this case raised an
            # IndexError from inside the error handler and aborted the whole run.
            print(f'There was an error with {name}: no valid product skills found; hours not attributed')
            continue

        try:
            # NOTE: an End_Date__c lower bound was considered at one point but is not applied,
            # so every assignment with more than 0.5 planned hours is included.
            assignments = queries.convert_salesforce_data_to_df(queries.sf.query(queries.create_query(
                columns="Id,Resource__r.Name,QC_Team__r.Name,Planned_Hours__c,Team_Position__c,End_Date__c,Team_Category__c",
                table="Assignment__c",
                conditions={"Planned_Hours__c >": 0.5,
                            "Resource__r.Id": contact_id})))
            # keep only assignments in categories that are part of the current skillset
            assignments = assignments[assignments['Team_Category__c'].isin(skill_categories)].copy()
        except Exception as e:
            # Deliberately broad best-effort: the lookup helper is known to fail for some
            # contacts (e.g. "['attributes'] not found in axis"); fall back to attributing
            # the full weekly hours to every skill category.
            print(f'There was an error with {name}: {e}')
            category_hours = pd.DataFrame(
                data={'Planned_Hours__c': 0, 'total_hours_planned': 0, 'proportion': 0, 'Hours_Lost': hours},
                columns=['Planned_Hours__c', 'total_hours_planned', 'proportion', 'Hours_Lost'],
                index=skill_categories)
            if group_name == 'Cancer':
                # replace any value in the index that is 'Cancer' with the role-specific label
                category_hours.index = category_hours.index.map(
                    lambda x: x if x != 'Cancer' else cancer_label)
        else:
            # split the generic 'Cancer' category by role
            assignments["Team_Category__c"] = assignments["Team_Category__c"].mask(
                assignments["Team_Category__c"] == "Cancer", cancer_label)
            assignments["total_hours_planned"] = assignments['Planned_Hours__c'].sum()

            # per category: share of planned hours out of the abstractor's total planned hours
            category_hours = assignments.groupby("Team_Category__c").agg({
                'Planned_Hours__c': 'sum', 'total_hours_planned': 'first'})
            planned_totals = category_hours['total_hours_planned']
            category_hours['proportion'] = (
                category_hours['Planned_Hours__c'] / planned_totals if planned_totals.all() else 0)
            # multiply proportion by hours to get the number of hours per category
            # NOTE(review): when no assignment matches the skill list this frame is empty and
            # the abstractor contributes no hours -- confirm that is intended.
            category_hours['Hours_Lost'] = category_hours['proportion'] * hours

        all_frames.append(category_hours)
        if last_day.date() >= recent_cutoff:
            recent_frames.append(category_hours)

    # divide by 40 for conversion to FTE and 3 for monthly calc
    attrition_in_these_cats_all_gr = _sum_hours_lost_by_category(all_frames, "Attrit, [n]m", 3)
    # divide by 40 for conversion to FTE
    attrition_in_these_cats_all_30_gr = _sum_hours_lost_by_category(recent_frames, "30-days Attrit, [n]m", 1)

    return attrition_in_these_cats_all_gr, attrition_in_these_cats_all_30_gr

# Rolling window: contacts from ab_rolling, measured against today's date.
(
    attrition_in_these_cats_all_gr_rolling,
    attrition_in_these_cats_all_30_gr_rolling,
) = attrition_categories_fx(abstractors=ab_rolling, last_day_of_month=rolling_last_day)

# Monthly window: contacts from ab_monthly, measured against monthly_last_day.
(
    attrition_in_these_cats_all_gr_monthly,
    attrition_in_these_cats_all_30_gr_monthly,
) = attrition_categories_fx(abstractors=ab_monthly, last_day_of_month=monthly_last_day)

There was an error with Farrah Scodius: "['attributes'] not found in axis"
There was an error with Lori Kinnunen: "['attributes'] not found in axis"
There was an error with Farrah Scodius: "['attributes'] not found in axis"
There was an error with Lori Kinnunen: "['attributes'] not found in axis"


# Final Merge

In [66]:
# The attrition frames are keyed by category in their index, so each one is
# left-joined onto the 'Product Category' column of the supply/demand table.
_attrition_join_kwargs = {"left_on": "Product Category", "right_index": True, "how": "left"}

# Rolling attrition: attach the overall figure first, then the 30-day figure.
supply_demand_all_products_pipe_merged_1_rolling = supply_demand_all_products_pipe_raw_win_merged.merge(
    attrition_in_these_cats_all_gr_rolling, **_attrition_join_kwargs)
supply_demand_all_products_pipe_merged_rolling = supply_demand_all_products_pipe_merged_1_rolling.merge(
    attrition_in_these_cats_all_30_gr_rolling, **_attrition_join_kwargs)

# Monthly attrition: same two-step join using the monthly frames.
supply_demand_all_products_pipe_merged_1_monthly = supply_demand_all_products_pipe_raw_win_merged.merge(
    attrition_in_these_cats_all_gr_monthly, **_attrition_join_kwargs)
supply_demand_all_products_pipe_merged_monthly = supply_demand_all_products_pipe_merged_1_monthly.merge(
    attrition_in_these_cats_all_30_gr_monthly, **_attrition_join_kwargs)

# Final column adjustments via final_column_adj_fx (defined elsewhere in the
# notebook; per the original note it replaces all of the NaN with 0).
supply_demand_all_products_pipe_adj_rolling = final_column_adj_fx(supply_demand_all_products_pipe_merged_rolling)
supply_demand_all_products_pipe_adj_monthly = final_column_adj_fx(supply_demand_all_products_pipe_merged_monthly)


In [67]:
# Output workbook path: the suffix records whether the win rate came from the
# CSV ("csv_wr") or was calculated ("calc_wr").
filename = (
    f"{path_to_staffing_models_folder}staffing_model_{today}_csv_wr.xlsx"
    if win_rate_csv
    else f"{path_to_staffing_models_folder}staffing_model_{today}_calc_wr.xlsx"
)

In [68]:
# Write both attrition views to one workbook. The context manager saves and
# closes the file even if one of the to_excel calls raises, so a failed export
# does not leave an open handle on the workbook.
with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
    supply_demand_all_products_pipe_adj_rolling.to_excel(writer, sheet_name='Rolling Attrition', index=False)
    supply_demand_all_products_pipe_adj_monthly.to_excel(writer, sheet_name='Monthly Attrition', index=False)
# Column widths are adjusted on the saved file, so this runs after the writer is closed.
create_capbase.adjust_workbook_column_widths(filename)