# Setup

## Imports

In [304]:
import operator
import pandas as pd, numpy as np, matplotlib.pyplot as plt
import os, warnings
import pyodbc

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score

from sqlalchemy import create_engine, text
from dotenv import load_dotenv

## SQL server connection

In [305]:
drivers = pyodbc.drivers()

In [306]:
# List all available ODBC drivers to confirm it's there
drivers = pyodbc.drivers()
print("Available ODBC drivers:")
for driver in drivers:
    print(driver)

Available ODBC drivers:
SQL Server
SQL Server Native Client 11.0
ODBC Driver 17 for SQL Server
Microsoft Access Driver (*.mdb, *.accdb)
Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)
Microsoft Access Text Driver (*.txt, *.csv)


In [307]:
nums = [int(word) for driver in drivers for word in driver.split() if word.isdigit() and 'ODBC Driver' in driver]
version_no = max(nums)

In [308]:
# Load environment variables from .env file
load_dotenv()
# Get environment variables
db_username = os.getenv('DB_USERNAME')
db_password = os.getenv('DB_PASSWORD')
# SQLAlchemy connection string
connection_string = f"mssql+pyodbc://{db_username}:{db_password}@52.149.218.135/Vision?driver=ODBC+Driver+{version_no}+for+SQL+Server&TrustServerCertificate=yes"

In [309]:
# Create a SQLAlchemy engine
engine = create_engine(connection_string, pool_pre_ping=True)

##### **make sure you are connected to the VPN or this won't work!!!**

In [310]:
# Test Connection
try:
    # Connect to the database
    with engine.connect() as connection:
        # Execute a simple query
        result = connection.execute(text("SELECT 1"))
        # Fetch the result (not always necessary, but good for verification)
        print(result)
    print("Connection successful.")
except Exception as e:
    print("Error during connection:", str(e))

<sqlalchemy.engine.cursor.CursorResult object at 0x00000243156D5DE0>
Connection successful.


## Create Dataframes

In [311]:
# Use the engine to create the DataFrames we want
# df_CL = pd.read_sql_query("SELECT * FROM [dbo].[CL]", engine)
# df_PR = pd.read_sql_query("SELECT * FROM [dbo].[PR]", engine)
# df_LD = pd.read_sql_query("SELECT * FROM [dbo].[LD]", engine)

## see note below

I will take it from soup to nuts, or SQL to Pickle as it may be, in the final notebook, but for now I'm gonna skip reproducing the cleaning steps here by reading in the pkls that I created in the EDA and Data Prep notebook.  I usually keep them fresh within a range of a couple days to a week anyway, it just takes a fair bit of time to run through the steps so I don't want it slowing me down at this stage.

In [312]:
# Pickle and read back in for efficiency

df_CL = pd.read_pickle('Vision_Data/working_data/CL_WIPCOPY_1220.pkl')
df_PR = pd.read_pickle('Vision_Data/working_data/PR_WIPCOPY_1220.pkl')
## those two need a refresh soon, I haven't done the whole thing from start to finish since 
## last month when we first integrated the SQL connection
df_LD = pd.read_pickle('Vision_Data/working_data/LD_WIPCOPY_0108.pkl')

In [314]:
# Display the first few rows of each DataFrame
# df_CL.info()
# df_PR.info()
df_LD.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035282 entries, 0 to 1035281
Data columns (total 48 columns):
 #   Column                   Non-Null Count    Dtype         
---  ------                   --------------    -----         
 0   Period                   1035282 non-null  int64         
 1   PKey                     1035282 non-null  object        
 2   WBS1                     1035282 non-null  object        
 3   WBS2                     1035282 non-null  object        
 4   WBS3                     1035282 non-null  object        
 5   LaborCode                1035282 non-null  object        
 6   Employee                 1035282 non-null  object        
 7   TransDate                1035282 non-null  datetime64[ns]
 8   Name                     1035282 non-null  object        
 9   RegHrs                   1035282 non-null  float64       
 10  OvtHrs                   1035282 non-null  float64       
 11  RegAmt                   1035282 non-null  float64       
 12  

### Basic EDA and prep

In [315]:
# make pd.series of all P## project codes, R## project codes, and OHD codes in LD

ohd_codes = df_LD[df_LD['WBS1'].str.startswith('OHD')]['WBS1'].value_counts()
# 376429 entries in 134 unique OHD codes
p_codes = df_LD[df_LD['WBS1'].str.startswith('P')]['WBS1'].value_counts()
# 84754 entries in 9040 unique P codes
r_codes = df_LD[df_LD['WBS1'].str.startswith('R')]['WBS1'].value_counts()
# 574099 entries in 4630 unique R codes

In [316]:
## found 000. and ZZZ. codes in PR, but are they in LD too?
# using ~ to do inverse, ie str[0 is NOT in []]
df_LD[~df_LD['WBS1'].str[0].isin(['O', 'P', 'R'])]
# only those three

Unnamed: 0,Period,PKey,WBS1,WBS2,WBS3,LaborCode,Employee,TransDate,Name,RegHrs,...,RateBillingCurrency,OvtRateBillingCurrency,RegAmtEmployeeCurrency,OvtAmtEmployeeCurrency,RateEmployeeCurrency,OvtRateEmployeeCurrency,XferCategory,NonBill,TransferredPeriod,TransferredBillStatus


In [317]:
print(f"{len(ohd_codes) / len(df_LD['WBS1'].unique()) * 100}% of WBS1 codes are OHD")
print(f"{len(p_codes) / len(df_LD['WBS1'].unique()) * 100}% of WBS1 codes are P")
print(f"{len(r_codes) / len(df_LD['WBS1'].unique()) * 100}% of WBS1 codes are R")
print('')
print(f"{sum(ohd_codes.values) / df_LD.shape[0] * 100}% of all entries are marked as OHD")
print(f"{sum(p_codes.values) / df_LD.shape[0] * 100}% of all entries are marked as P")
print(f"{sum(r_codes.values) / df_LD.shape[0] * 100}% of all entries are marked as R")

0.9707331208345407% of WBS1 codes are OHD
65.48826427122573% of WBS1 codes are P
33.54100260793973% of WBS1 codes are R

36.36004489598003% of all entries are marked as OHD
8.186561729074784% of all entries are marked as P
55.45339337494518% of all entries are marked as R


In [318]:
# df.columns # pick the most immediately relevant ones

df_LD[['RegAmt', 'Rate', 'BillExt', 'Category',  'ChargeType', 'RateType', 'BillStatus', 'XferWBS1']].head()
# active_df = df_LD[['RegAmt', 'Rate', 'BillExt', 'Category',  'ChargeType', 'RateType', 'BillStatus', 'XferWBS1']].head()

# less relevant but I'm gonna note them here for easy access if I need to add them later
# 'OvtAmt', 'OvtPct', 'OvtRate', -- only if there is Ovt I will need to look at these
# 'EmOrg', 'PrOrg', -- not sure these tell me anything I cnre about
# 'BilledWBS2', 'BilledWBS3', only if the Bi
# 'BilledInvoice', 'XferWBS2', 'XferWBS3', 'XferLaborCode'

Unnamed: 0,RegAmt,Rate,BillExt,Category,ChargeType,RateType,BillStatus,XferWBS1
0,82.96,41.48,290.0,70,R,S,F,R01.2012.007387
1,-35.5,35.5,-200.0,20,R,S,T,
2,35.5,35.5,145.0,70,R,S,F,R01.2012.007387
3,-106.5,35.5,-600.0,20,R,S,T,
4,106.5,35.5,435.0,70,R,S,F,R01.2012.007387


In [None]:
## what are those Xfer WBSes about?  not a high priority but definitely do dig deeper into this at some point

Here's what the various Bill statuses mean and counts (uncomment and run again to update counts periodically).

The transfers will need further investigation, merge them somehow with their alt WBS1s, since as far as we're concerned here that's one project.  

Writeoffs (and possibly Held) most likely indicate time "wasted" but I may need to confirm this more definitively.
What I'm really wondering about is the large number of Billable but not Final Billed entries.  Are they just not up to date, or are they rolled under another entry that IS final billed?  The latter seems likely, but I'll have to look deeper.

##### Checking nulls out

In [152]:
def nullcheck(df):
    for col in df.columns:
        if df[col].isna().sum() > 0:
            print(f"Column '{col}' is {df[col].isna().sum() / df.shape[0] * 100}% nulls")
nullcheck(df_LD)

Column 'BilledInvoice' is 59.43849115506693% nulls
Column 'XferWBS1' is 93.61333433789054% nulls
Column 'XferWBS2' is 93.61333433789054% nulls
Column 'XferWBS3' is 93.61333433789054% nulls
Column 'XferLaborCode' is 93.61333433789054% nulls
Column 'NonBill' is 98.06642055014963% nulls
Column 'TransferredBillStatus' is 98.39628236557769% nulls


BilledWBS1 is 59.4% null.  What do the nulls mean here?  Investigate further.

In [20]:
df_LD['BilledWBS1'].value_counts()
# df_LD['WBS2'].isna().sum()

BilledWBS1
R01.2004.004094    14276
R05.2015.000986     9048
R05.2010.000550     6786
R05.2015.986001     6143
R01.2002.003195     4953
                   ...  
R03.2010.000329        1
R01.2000.02147I        1
R05.2020.001251        1
R10.2019.000341        1
R05.2014.000949        1
Name: count, Length: 4318, dtype: int64

In [21]:
df_LD['WBS1'][df_LD['BilledWBS1'] == 'R05.2015.000986'].unique()

array(['R05.2015.000986'], dtype=object)

In [None]:
## check if there's a correlation here between presence of a BilledWBS1 
# and the value of BillStatus

In [327]:
df_LD.columns[25:50]

Index(['BilledWBS3', 'BilledInvoice', 'BilledPeriod', 'XferWBS1', 'XferWBS2',
       'XferWBS3', 'XferLaborCode', 'RegAmtProjectCurrency',
       'OvtAmtProjectCurrency', 'RateProjectCurrency',
       'OvtRateProjectCurrency', 'RegAmtBillingCurrency',
       'OvtAmtBillingCurrency', 'RateBillingCurrency',
       'OvtRateBillingCurrency', 'RegAmtEmployeeCurrency',
       'OvtAmtEmployeeCurrency', 'RateEmployeeCurrency',
       'OvtRateEmployeeCurrency', 'XferCategory', 'NonBill',
       'TransferredPeriod', 'TransferredBillStatus'],
      dtype='object')

In [381]:
pd.options.display.max_columns = 30
temp_LD = df_LD[['Period', 'WBS1', 'WBS2', 'WBS3', 'LaborCode', 'TransDate',
                 'RegHrs', 'Rate', 'RateType', 'BillStatus', 'BilledWBS1', 'BilledWBS2',
                 'BilledWBS3', 'BilledInvoice', 'BilledPeriod', 'XferWBS1', 'XferWBS2',
                 'XferWBS3', 'XferLaborCode', 'TransferredPeriod', 'TransferredBillStatus']]

In [382]:
temp_LD.head()

Unnamed: 0,Period,WBS1,WBS2,WBS3,LaborCode,TransDate,RegHrs,Rate,RateType,BillStatus,BilledWBS1,BilledWBS2,BilledWBS3,BilledInvoice,BilledPeriod,XferWBS1,XferWBS2,XferWBS3,XferLaborCode,TransferredPeriod,TransferredBillStatus
0,201306,R01.2012.007387,1,1,OFFIC,2012-10-21,2.0,41.48,S,F,R01.2012.007387,1.0,1.0,19961.0,201306,R01.2012.007387,1.0,1.0,OFFIC,0,
1,201306,R01.2012.007387,1,1,MEETG,2012-10-22,-1.0,35.5,S,T,,,,,0,,,,,0,
2,201306,R01.2012.007387,1,1,MEETG,2012-10-22,1.0,35.5,S,F,R01.2012.007387,1.0,1.0,19961.0,201306,R01.2012.007387,1.0,1.0,MEETG,0,
3,201306,R01.2012.007387,1,1,MEETG,2012-10-25,-3.0,35.5,S,T,,,,,0,,,,,0,
4,201306,R01.2012.007387,1,1,MEETG,2012-10-25,3.0,35.5,S,F,R01.2012.007387,1.0,1.0,19961.0,201306,R01.2012.007387,1.0,1.0,MEETG,0,


In [384]:
# temp_LD['BilledWBS1'][temp_LD['BillStatus'] == "B"].isna().sum()
# BilledWBS1 is 100% null where BillStatus == B

# temp_LD['BilledWBS1'][temp_LD['BillStatus'] == "F"].value_counts().sum() # 413941
# no nulls

# now invert that, value counts of BillStatus for the rows where BilledWBS1 is not null
temp_LD['BillStatus'][temp_LD['BilledWBS1'].notnull()].value_counts()

BillStatus
F    413941
X      6237
O        27
Name: count, dtype: int64

In [386]:
413941 + 6237 + 27

420205

In [385]:
temp_LD['BillStatus'][temp_LD['BilledWBS1'].isna()].value_counts()

BillStatus
B    372703
T    118077
W    101111
H     16590
X      4765
M      1729
O        94
D         6
R         2
Name: count, dtype: int64

In [363]:
# # temp_LD['BillStatus'][temp_LD['BilledWBS1'].isna()].value_counts()
# BillStatus
# B    372703
# T    118077
# W    101111
# H     16590
# X      4765
# M      1729
# O        94
# D         6
# R         2

# F never occurs when BilledWBS1 is null
# but 372703 / 372944: Not all null BilledWBS1s are B: (presumably pending billing)
# why doesn't this add up?  why the 241 row difference?

BillStatus
B    372703
T    118077
W    101111
H     16590
X      4765
M      1729
O        94
D         6
R         2
Name: count, dtype: int64

In [378]:
#### reference ####
# len(temp_LD[temp_LD['BillStatus'].notnull()]) 
# 1035282 total rows, all not null
# df_LD['BillStatus'].value_counts()
    
# F    Final Billed    413941
# B    Billable        372944
# T    Transferred    118077
# W    To Write off    101111
# H    Held            16349
# X    Written off    11002
# M    Modified        1729
# O    Deleted        121
# D    To Delete        6
# R    Partial hold    2
# N    Not billable    0 

# temp_LD['BilledWBS1'].isna().sum() # 615077 nulls and 420205 notnull
temp_LD_2 = temp_LD[temp_LD['BilledWBS1'].isna()]
temp_LD_2[temp_LD_2['BillStatus'] != "B"] # 242374 rows == the sum of all not B

Unnamed: 0,Period,WBS1,WBS2,WBS3,LaborCode,TransDate,RegHrs,Rate,RateType,BillStatus,BilledWBS1,BilledWBS2,BilledWBS3,BilledInvoice,BilledPeriod,XferWBS1,XferWBS2,XferWBS3,XferLaborCode,TransferredPeriod,TransferredBillStatus
1,201306,R01.2012.007387,001,001,MEETG,2012-10-22,-1.0,35.50,S,T,,,,,0,,,,,0,
3,201306,R01.2012.007387,001,001,MEETG,2012-10-25,-3.0,35.50,S,T,,,,,0,,,,,0,
5,201306,R01.2012.007387,001,001,OFFIC,2013-01-25,-0.5,45.17,S,T,,,,,0,,,,,0,
7,201306,R01.2012.007387,001,001,MEETG,2013-02-07,-1.0,42.12,S,T,,,,,0,,,,,0,
9,201306,R01.2012.007387,001,001,OFFIC,2013-04-29,-6.0,36.13,S,T,,,,,0,,,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035272,201402,R05.2014.000900,001,001,OFFIC,2014-01-22,-5.0,30.35,S,T,,,,,0,R05.2013.000894,001,001,OFFIC,0,
1035274,201402,R03.2012.000516,002,003,REPRT,2014-02-20,-0.5,50.08,S,T,,,,,0,,,,,0,
1035276,201402,R03.2012.000516,002,003,REPRT,2014-02-21,-0.5,50.08,S,T,,,,,0,,,,,0,
1035278,201402,R03.2012.000516,002,003,REPRT,2014-02-23,-2.0,50.08,S,T,,,,,0,,,,,0,


In [None]:
#### checkpoint ^^

## pared temp_df down to only where BilledWBS1 is null, then where BillStatus != B
## but trying to wrap my head around what it means
## difference between 413941 "F" rows, with values for BilledWBS1,
## and 420205 total rows where BilledWBS1 is not null

## Looks like the difference is the BillStatus X or O.

# SO, BilledWBS1 appears when the status is Final Billed (100% of the time),
# AND when the status is Written Off (~half the time)
# AND ALSO when the status is deleted (between 1/4-1/3 of the time.)

### what does it mean? ###

In [22]:
## first draft version replaced by function version below

# billedval_checklist = []
# unique_billed_values = df_LD['BilledWBS1'].unique()
# # Iterate over unique values
# for billed_value in unique_billed_values:
#     # Get a slice for each unique value
#     slice_df = df_LD[df_LD['BilledWBS1'] == billed_value]
    
#     # Get unique values of 'WBS1' column in the slice
#     unique_wbs_values = slice_df['WBS1'].unique()
    
#     # Check if the length is greater than 1
#     if len(unique_wbs_values) > 1:
#         print(f"Values in 'WBS1' column for 'BilledWBS1' = {billed_value} that do not match:")
        
#         # Print values in 'WBS1' that do not match 'BilledWBS1'
#         for wbs_value in unique_wbs_values:
#             if wbs_value != billed_value:
#                 billedval_checklist.append(wbs_value)
                
# billedval_checklist

In [23]:
def col_equivalence(col1, col2): 
    # col1 is the one we're checking for difference, col2 is the sanity check 
    # (BilledWBS123 vs WBS123, for example)
    # not positive that makes any difference
    checklist = []
    unique_vals1 = df_LD[col1].unique()
    # Iterate over unique values
    for unique_val in unique_vals1:
        # Get a slice for each unique value
        slice_df = df_LD[df_LD[col2] == unique_val]

        # Get unique values of 'WBS1' column in the slice
        unique_vals2 = slice_df[col2].unique()

        # Check if the length is greater than 1
        if len(unique_vals2) > 1:
            print(f"Values in {str(col2)} column for {str(col1)} = {unique_val} that do not match:")

            # Print values in 'WBS1' that do not match 'BilledWBS1'
            for col2_val in unique_vals2:
                if col2_val != unique_val:
                    checklist.append(col2_val)
    return checklist

print(col_equivalence('BilledWBS1', 'WBS1'))

[]


Okay, if the Billed WBS is exactly the same as the WBS1 in all cases, we don't really need it.

In [24]:
df_LD.drop(axis=1, labels=['BilledWBS1', 'BilledWBS2', 'BilledWBS3'], inplace=True)

I wonder if Period and Billed Period are like that too.

In [25]:
print(col_equivalence('Period', 'BilledPeriod'))

[]


In [26]:
df_LD.drop(axis=1, labels=['BilledPeriod'], inplace=True)

#### CHECKPOINT Jan 9
At this point (for now), I'm gonna cut off the LD cleaning and prep and move on to other things.

((stuff in this section will likely need to be moved later for organization, since we're kind of rolling prep, feature engineering, and modeling all together))

In [27]:
# Out:
# df_LD.to_pickle('Vision_Data/working_data/LD_WIPCOPY_0109.pkl')

# In:
df_LD = pd.read_pickle('Vision_Data/working_data/LD_WIPCOPY_0109.pkl')

In [28]:
df_LD.head()

Unnamed: 0,Period,PKey,WBS1,WBS2,WBS3,LaborCode,Employee,TransDate,Name,RegHrs,...,RateBillingCurrency,OvtRateBillingCurrency,RegAmtEmployeeCurrency,OvtAmtEmployeeCurrency,RateEmployeeCurrency,OvtRateEmployeeCurrency,XferCategory,NonBill,TransferredPeriod,TransferredBillStatus
0,201306,423576814212,R01.2012.007387,1,1,OFFIC,114,2012-10-21,"Gilmer, Laurie",2.0,...,41.48,0.0,82.96,0.0,41.48,0.0,20,N,0,
1,201306,423576822120,R01.2012.007387,1,1,MEETG,114,2012-10-22,"Gilmer, Laurie",-1.0,...,35.5,0.0,-35.5,0.0,35.5,0.0,0,,0,
2,201306,423576822121,R01.2012.007387,1,1,MEETG,114,2012-10-22,"Gilmer, Laurie",1.0,...,35.5,0.0,35.5,0.0,35.5,0.0,20,N,0,
3,201306,423576829156,R01.2012.007387,1,1,MEETG,114,2012-10-25,"Gilmer, Laurie",-3.0,...,35.5,0.0,-106.5,0.0,35.5,0.0,0,,0,
4,201306,423576829157,R01.2012.007387,1,1,MEETG,114,2012-10-25,"Gilmer, Laurie",3.0,...,35.5,0.0,106.5,0.0,35.5,0.0,20,N,0,


In [160]:
# df_LD['TransferredPeriod'].value_counts()
# df_LD['TransferredBillStatus'].value_counts()
# df_LD['RateBillingCurrency'].value_counts()
# not a lot of these relative to the whole but enough that I probably shouldn't ignore it.

TransferredBillStatus
B    12977
H     3619
T        4
W        3
Name: count, dtype: int64

In [30]:
df_LD.columns

Index(['Period', 'PKey', 'WBS1', 'WBS2', 'WBS3', 'LaborCode', 'Employee',
       'TransDate', 'Name', 'RegHrs', 'OvtHrs', 'RegAmt', 'OvtAmt', 'BillExt',
       'Rate', 'OvtPct', 'OvtRate', 'Category', 'EmOrg', 'PrOrg', 'ChargeType',
       'RateType', 'BillStatus', 'BilledInvoice', 'XferWBS1', 'XferWBS2',
       'XferWBS3', 'XferLaborCode', 'RegAmtProjectCurrency',
       'OvtAmtProjectCurrency', 'RateProjectCurrency',
       'OvtRateProjectCurrency', 'RegAmtBillingCurrency',
       'OvtAmtBillingCurrency', 'RateBillingCurrency',
       'OvtRateBillingCurrency', 'RegAmtEmployeeCurrency',
       'OvtAmtEmployeeCurrency', 'RateEmployeeCurrency',
       'OvtRateEmployeeCurrency', 'XferCategory', 'NonBill',
       'TransferredPeriod', 'TransferredBillStatus'],
      dtype='object')

In [31]:
# dropping OVT and similar for the time being
LD = df_LD[['Period', 'PKey', 'WBS1', 'WBS2', 'WBS3', 'LaborCode', 'Employee',
       'TransDate', 'Name', 'RegHrs', 'RegAmt', 'BillExt',
       'Rate', 'ChargeType', 'RateType', 'BillStatus', 'BilledInvoice', 'XferWBS1', 'XferWBS2',
       'XferWBS3', 'XferLaborCode', 'RegAmtProjectCurrency', 'RegAmtBillingCurrency',
       'RateBillingCurrency', 'RegAmtEmployeeCurrency', 'RateEmployeeCurrency', 'NonBill']]

In [32]:
# check nulls again rq
nullcheck(df_LD)

Column 'BilledInvoice' is 59.43849115506693% nulls
Column 'XferWBS1' is 93.61333433789054% nulls
Column 'XferWBS2' is 93.61333433789054% nulls
Column 'XferWBS3' is 93.61333433789054% nulls
Column 'XferLaborCode' is 93.61333433789054% nulls
Column 'NonBill' is 98.06642055014963% nulls
Column 'TransferredBillStatus' is 98.39628236557769% nulls


### Basic feature extraction / engineering
- Did we win the project? Y/N
 - How many hours on the P side?
 - How does that compare to the R?
- Did we win the client?
 - When did we win the client?
  - How many hours did we put in to win them?
- Total number projects associated with client
 - projects per year
 
 Now to identify some potentially interesting features:

In [42]:
### not sure where I was going with PR features, revisit later

Features to engineer from df_PR:

In [33]:
df_PR.columns

Index(['WBS1', 'WBS2', 'WBS3', 'StartDate', 'ClientID', 'Name', 'ContractDate',
       'ProjectName', 'PhaseName', 'EndDate', 'ClientName', 'SpecialtyType',
       'Recommend'],
      dtype='object')

In [34]:
df1 = df_PR[['WBS1', 'StartDate', 'ClientID', 'Name', 'ContractDate',
       'ProjectName', 'EndDate', 'ClientName', 'SpecialtyType',
       'Recommend']]

In [35]:
df1.head()

Unnamed: 0,WBS1,StartDate,ClientID,Name,ContractDate,ProjectName,EndDate,ClientName,SpecialtyType,Recommend
0,R01.2012.007235,Aug 13 2012 12:00AM,CONGBETHEM,Additional Services,Aug 3 2012 12:00AM,Congregation Beth Emeth Reserve Study Update,NaT,Congregation Beth Emeth,,N
4,R01.2013.007643,Mar 9 2013 12:00AM,MLEEMAN1248814353279,AREA-DC Portfolio,Mar 1 2013 12:00AM,(CONFIDENTIAL)TIAA-CREF Master Services Agreement,NaT,TIAA-CREF,,N
8,P01.2013.007752,Aug 1 2013 12:00AM,UNIVOFME,Proposal,,University of Maine Business Process Reenginee...,NaT,University of Maine System,,N
10,P08.2013.000226,Aug 5 2013 12:00AM,61766EBBFF364DD491597C9B2A54F3C3,Proposal,,Portland Hilton Energy Audit,NaT,Portland Hilton,,N
12,P01.2013.007754,Jul 1 2013 12:00AM,TLARSON1231196287172,Proposal,,Christ the King Catholic Church Engineering Co...,NaT,Grosvenor Park III,,N


In [36]:
df1[df1['WBS1'] == 'R01.2012.007235']

Unnamed: 0,WBS1,StartDate,ClientID,Name,ContractDate,ProjectName,EndDate,ClientName,SpecialtyType,Recommend
0,R01.2012.007235,Aug 13 2012 12:00AM,CONGBETHEM,Additional Services,Aug 3 2012 12:00AM,Congregation Beth Emeth Reserve Study Update,NaT,Congregation Beth Emeth,,N
49662,R01.2012.007235,Aug 13 2012 12:00AM,CONGBETHEM,Reserve Study Update - Level III,Aug 3 2012 12:00AM,Congregation Beth Emeth Reserve Study Update,NaT,Congregation Beth Emeth,,N


In [37]:
df1.drop_duplicates(subset=['WBS1'])

Unnamed: 0,WBS1,StartDate,ClientID,Name,ContractDate,ProjectName,EndDate,ClientName,SpecialtyType,Recommend
0,R01.2012.007235,Aug 13 2012 12:00AM,CONGBETHEM,Additional Services,Aug 3 2012 12:00AM,Congregation Beth Emeth Reserve Study Update,NaT,Congregation Beth Emeth,,N
4,R01.2013.007643,Mar 9 2013 12:00AM,MLEEMAN1248814353279,AREA-DC Portfolio,Mar 1 2013 12:00AM,(CONFIDENTIAL)TIAA-CREF Master Services Agreement,NaT,TIAA-CREF,,N
8,P01.2013.007752,Aug 1 2013 12:00AM,UNIVOFME,Proposal,,University of Maine Business Process Reenginee...,NaT,University of Maine System,,N
10,P08.2013.000226,Aug 5 2013 12:00AM,61766EBBFF364DD491597C9B2A54F3C3,Proposal,,Portland Hilton Energy Audit,NaT,Portland Hilton,,N
12,P01.2013.007754,Jul 1 2013 12:00AM,TLARSON1231196287172,Proposal,,Christ the King Catholic Church Engineering Co...,NaT,Grosvenor Park III,,N
...,...,...,...,...,...,...,...,...,...,...
80895,P01.2023.010035,Dec 1 2023 12:00AM,B7B146B5AE9F4548A1C0818F25838FBD,Proposal,,Chesterfield County Facility Condition Assessment,NaT,Chesterfield County,,N
80897,P05.2023.001341,Nov 15 2023 12:00AM,MKOLESAR1204240822804,Proposal,,City of Aurora Public Works Department Organiz...,NaT,City of Aurora,,N
80899,P11.2023.000028,Nov 21 2023 12:00AM,3BDDE70EC40147588B71329E226E2F04,Proposal,,Rock Springs Parks And Recreation Department P...,NaT,City of Rock Springs- Parks & Recreation & Dep...,,N
80902,P05.2023.001339,Jan 31 2024 12:00AM,590D1F6FCD8244699A35BA4BDDD506EF,Proposal,,Wyoming Department of Transportation Statewide...,NaT,Wyoming Department of Transportation (WyDOT),,N


In [38]:
# just the top level for now
df1.drop_duplicates(subset=['WBS1'], inplace=True)
# Drop rows where 'WBS1' contains 'OHD'
df1 = df1[~df1['WBS1'].str.contains('OHD')]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop_duplicates(subset=['WBS1'], inplace=True)


In [39]:
### skip to here

#### CAC
Customer Acquisition Cost -- What we need:
- Did we get the project? (matching R & P codes) == ['GotClient']
- **date R number created is when "acquired" == 'DateAcquired'**
- All P#'s created before that date (are there edge cases where the P and R were created together?)
- all labor under that P# or #'s (are there multiple pre-get P#'s and how often?)
- **Determine acquisition labor == 'GetLaborHrs'**
- acquisition labor is not consistently associated with "prop" and project codes esp. further back.
- All BDev labor not assoc. divided evenly between projects.
 - Get OHD labor total for individual BDEV person for each month
 - Then determine active projects in that month (all proposals, not just 'got' ones)
 - identify which projects that person did work on
 - divide evenly between those projects
 - (limit to past 5 years or less perhaps to avoid messiest data)
- 



In [40]:
## May want to find a way to find when we have same contact with different client (ids or names)
## date of acquisition may make one client actually two

### First, need to determine when client is "acquired"

In [43]:
client_list = df_CL['ClientName'].unique()
client_id_list = df_CL['ClientID'].unique()

In [44]:
# Create 'Column A' and 'Column B' based on 'WBS1' to show proposals vs. projects
df_PR['Column A'] = df_PR['WBS1'].str[0] # P or R
df_PR['Column B'] = df_PR['WBS1'].str[1:] # the rest of the code

In [45]:
df_PR['Column A'].unique()

array(['R', 'P', 'O', '0', 'Z'], dtype=object)

In [176]:
df_PR['Name'][df_PR['Column A'] == '0']

Series([], Name: Name, dtype: object)

In [175]:
df_PR['Name'][df_PR['Column A'] == 'Z'].value_counts()

Series([], Name: count, dtype: int64)

In [51]:
# if there's anything useful that can be extracted here it's most likely from the 
# WBS2 codes, but I have no idea what these mean.
df_PR[['WBS1', 'WBS2', 'WBS3', 'Name', 'ProjectName', 'PhaseName', 'EndDate', 'ClientName']][df_PR['Column A'] == 'Z']

Unnamed: 0,WBS1,WBS2,WBS3,Name,ProjectName,PhaseName,EndDate,ClientName
58604,Z99.0000.000000,00-COR,,Miscellaneous Regular,Miscellaneous Regular,Miscellaneous Regular,NaT,"Facility Engineering Associates, PC"
58605,Z99.0000.000000,01-FFX,,Miscellaneous Regular,Miscellaneous Regular,Miscellaneous Regular,NaT,"Facility Engineering Associates, PC"
58606,Z99.0000.000000,02-ANN,,Miscellaneous Regular,Miscellaneous Regular,Miscellaneous Regular,NaT,"Facility Engineering Associates, PC"
58607,Z99.0000.000000,03-DAL,,Miscellaneous Regular,Miscellaneous Regular,Miscellaneous Regular,NaT,"Facility Engineering Associates, PC"
58608,Z99.0000.000000,04-SDG,,Miscellaneous Regular,Miscellaneous Regular,Miscellaneous Regular,NaT,"Facility Engineering Associates, PC"
58609,Z99.0000.000000,05-DEN,,Miscellaneous Regular,Miscellaneous Regular,Miscellaneous Regular,NaT,"Facility Engineering Associates, PC"
58610,Z99.0000.000000,06-GRN,,Miscellaneous Regular,Miscellaneous Regular,Miscellaneous Regular,NaT,"Facility Engineering Associates, PC"
58611,Z99.0000.000000,07-SEA,,Miscellaneous Regular,Miscellaneous Regular,Miscellaneous Regular,NaT,"Facility Engineering Associates, PC"
58612,Z99.0000.000000,08-SRO,,Miscellaneous Regular,Miscellaneous Regular,Miscellaneous Regular,NaT,"Facility Engineering Associates, PC"
58614,ZZZ.0000.000000,00-COR,,Miscellaneous Overhead,Miscellaneous Overhead,Miscellaneous Overhead,NaT,"Facility Engineering Associates, PC"


In [52]:
# slices for R codes and P codes
p_slice = df_PR[df_PR['Column A'] == 'P']
r_slice = df_PR[df_PR['Column A'] == 'R']
ohd_slice = df_PR[df_PR['Column A'] == 'O']

# Get unique values of 'Column B' for each slice
unique_P_codes = p_slice['Column B'].unique().tolist()
unique_R_codes = r_slice['Column B'].unique().tolist()

# we need to find the intersection of those two lists of unique values.
shared_codes = [value for value in unique_P_codes if value in unique_R_codes]

print(f"Unique P Codes: {len(unique_P_codes)}")
print(f"Unique R Codes: {len(unique_R_codes)}")
print(f"Shared Codes: {len(shared_codes)}")
print(f"OHD Codes: {len(ohd_slice)}")
print(f"Unique WBS1 Codes in original table: {len(df_PR['WBS1'].unique())}")

Unique P Codes: 10435
Unique R Codes: 4930
Shared Codes: 4318
OHD Codes: 3257
Unique WBS1 Codes in original table: 15532


The codes that are shared by both sets are the projects we "got", but what about the others?  

Those that have P codes but no R aren't necessarily not-got -- Check if they were somehow combined with a different WBS1, transferred or billed under a different code; that seems like a likely exception case.  

Also, check if there are R codes with no associated P.  There shouldn't be, but somehow I have a feeling there are.

In [53]:
# this shows there are 164 codes in the og table that aren't P or R codes.
# 15529 - 10435 - 4930 = 164
len(ohd_slice['WBS1'].unique())
# which is the OHD codes, so we're good.

164

Looks like those Z and 000 codes are for "General Administrative", Misc. Regular and Misc. Overhead.

We can drop those because they will have no labor associated with the WBS1.  I'm guessing there's some way other WBS1 codes are associated with this?  Unclear.  Investigate them further later if there is time.

In [54]:
# set dataframe as itself where 0 / Z is not the first char in the Column A string
# in other words, remove all 000. and ZZZ. codes rows from the df
df_PR = df_PR[~df_PR['Column A'].str[0].isin(['0','Z'])]

In [68]:
# some double-checking: something was screwy earlier and now I can't remember what.  
# It seems to be resolved now (or quite possibly I was looking at something wrong and 
# have since sorted it out), but I'll leave this in just in case it comes up again.

In [56]:
df_PR['Column A'].value_counts()

Column A
R    23741
P    15012
O     3257
Name: count, dtype: int64

In [58]:
# reading in the unmodified copy of PR for checking against the active copy df_PR
df_PR_OG = pd.read_pickle('Vision_Data/working_data/PR_WIPCOPY_1220.pkl')

In [63]:
print(f"{len(df_PR_OG[df_PR_OG['WBS1'].str[0].isin(['0','Z'])])} rows with Z or 0 codes.")
# 19 rows with Z / 0 codes
unique_z0s = df_PR_OG['WBS1'][df_PR_OG['WBS1'].str[0].isin(['0','Z'])].unique()
print(f"{len(unique_z0s)} unique codes, which are:\n{unique_z0s}")

19 rows with Z or 0 codes.
3 unique codes, which are:
['000.BMOP.009FES' 'Z99.0000.000000' 'ZZZ.0000.000000']


In [67]:
# len(df_PR['WBS1'].unique()) # 15529 unique WBS1 codes in active PR df
# len(df_PR_OG['WBS1'].unique()) # 15532 unique WBS1 codes in original PR df

# that's 3 codes, as above, the ones we dropped, so that makes sense
# confirming
set(df_PR_OG['WBS1'].unique()) - set(df_PR['WBS1'].unique())
# seems like we're fine

{'000.BMOP.009FES', 'Z99.0000.000000', 'ZZZ.0000.000000'}

There are 4318 WBS1 codes that are repeated with both a "P" and "R" variant.

What I want to do is find only the P's that also have R's, and mark them as "won".

I'll also try to confirm quickly that there are only exactly 2 of each, 1 P, 1 R, and never more, and also for any "orphan" R codes.

In [70]:
# Unique P Codes: 10435
# Unique R Codes: 4930
# Shared Codes: 4318
# OHD Codes: 3257
print(f"Unique P Codes: {len(unique_P_codes)}")
print(f"Unique R Codes: {len(unique_R_codes)}")
print(f"Shared Codes (P&R): {len(shared_codes)}")
print(f"OHD Codes: {len(ohd_slice)}")
print(f"Unique WBS1 Codes in original table: {len(df_PR['WBS1'].unique())}")

Unique P Codes: 10435
Unique R Codes: 4930
Shared Codes (P&R): 4318
OHD Codes: 3257
Unique WBS1 Codes in original table: 15529


In [73]:
# find common vals between slices
common_values = set(unique_P_codes) & set(unique_R_codes)

print(f"{len(common_values)} common values - expected {len(shared_codes)}")
# len(common_values) # 4318 common values, as expected

4318 common values - expected 4318


That would seem to confirm we have no cases with more than 2 copies of the same code.  But with 4930 unique R codes and 4318 shared codes, that leaves 612 "orphaned" R codes.  I'll investigate further.

In [79]:
len(unique_R_codes) - len(shared_codes)

612

In [81]:
## investigate orphan R codes

orphan_Rs = list(set(unique_R_codes) - set(shared_codes))
print(f"Found {len(orphan_Rs)} orphaned R codes")

Found 612 orphaned R codes


In [111]:
# print them as nicely as possible
# from https://stackoverflow.com/questions/1524126/how-to-print-a-list-more-nicely, may wanna use that again
for a,b,c,d,e in zip(orphan_Rs[::5],orphan_Rs[1::5],orphan_Rs[2::5],orphan_Rs[3::5],orphan_Rs[4::5]):
    print('{:<25}{:<25}{:<25}{:<25}{:<}'.format(a,b,c,d,e))

10.2019.000263           01.2002.003174           01.2001.002547           01.2001.002847           01.2015.861505
01.1999.01566B           01.2001.02790E           01.2001.02639V           01.2002.003241           01.2002.003108
01.2001.002750           06.2004.000030           01.2002.003231           01.2001.02790X           01.2001.02722T
01.2000.02166R           01.2001.002526           01.2002.003172           01.1999.01779C           01.2011.715708
05.2015.986001           01.2002.003245           01.2004.04009A           01.2002.003274           01.2000.002063
01.2002.003211           01.2000.02107A           01.2001.002715           01.2001.02750D           01.2001.002858
01.2001.02790C           01.2003.003659           01.2011.715702           10.2018.000225           01.2002.02916E
01.2011.715707           01.2002.003290           01.1999.01874Z           01.2001.002634           10.2018.000217
01.1997.00983C           01.2002.003217           01.1998.01496E           01.19

In [112]:
# Dictionary to store substring counts
substring_counts = {}

# Extract substrings from position 4 to position 8 and count occurrences
for s in orphan_Rs:
    substring = s[3:8]
    substring_counts[substring] = substring_counts.get(substring, 0) + 1

# Print the counts in sorted order by substring
for substring, count in sorted(substring_counts.items()):
    print(f"Substring {substring}: {count} occurrences")

Substring 1997.: 11 occurrences
Substring 1998.: 10 occurrences
Substring 1999.: 28 occurrences
Substring 2000.: 62 occurrences
Substring 2001.: 146 occurrences
Substring 2002.: 163 occurrences
Substring 2003.: 39 occurrences
Substring 2004.: 21 occurrences
Substring 2005.: 19 occurrences
Substring 2006.: 21 occurrences
Substring 2007.: 11 occurrences
Substring 2008.: 4 occurrences
Substring 2009.: 4 occurrences
Substring 2010.: 1 occurrences
Substring 2011.: 14 occurrences
Substring 2015.: 20 occurrences
Substring 2017.: 1 occurrences
Substring 2018.: 30 occurrences
Substring 2019.: 4 occurrences
Substring 2020.: 3 occurrences


In [296]:
# pare down to years and make them ints
substring_list = list(substring_counts.keys())
substring_list = [i[0:4] for i in substring_list]
substring_list = [int(i) for i in substring_list]

# Get the most recent only
recent_orphans = [s for s in orphan_Rs if int(s[3:7]) > 2015]

In [297]:
len(recent_orphans)

38

In [298]:
recent_orphans

['10.2019.000263',
 '10.2018.000225',
 '10.2018.000217',
 '10.2018.000209',
 '01.2018.009287',
 '10.2018.000171',
 '10.2018.000184',
 '10.2018.000192',
 '10.2019.000336',
 '01.2020.959400',
 '10.2018.000212',
 '10.2018.000181',
 '10.2019.000273',
 '10.2018.000175',
 '10.2018.000201',
 '01.2018.935500',
 '10.2018.000179',
 '10.2018.000195',
 '10.2018.000183',
 '10.2018.000188',
 '10.2018.000194',
 '10.2018.000200',
 '10.2018.000178',
 '10.2018.000177',
 '10.2019.000264',
 '01.2017.009137',
 '10.2018.000248',
 '10.2018.000191',
 '10.2018.000196',
 '10.2018.000202',
 '10.2020.000388',
 '10.2018.000210',
 '10.2018.000220',
 '10.2020.000386',
 '10.2018.000199',
 '10.2018.000238',
 '10.2018.000180',
 '10.2018.000215']

In [189]:
df_PR[df_PR['Column B'] == '10.2020.000388']

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend,Column A,Column B,GetDate
64534,R10.2020.000388,1,1,Feb 17 2021 12:00AM,A5BE976836624301B1837828423516C5,Webinar,,Vector Solutions Physical Security Webinar,Webinar,NaT,Vector Solutions,,N,R,10.2020.000388,
66594,R10.2020.000388,2,1,Feb 17 2021 12:00AM,A5BE976836624301B1837828423516C5,"Webinar - May 10, 2022",,Vector Solutions Physical Security Webinar,"Webinar - May 10, 2022",NaT,Vector Solutions,,N,R,10.2020.000388,
79981,R10.2020.000388,3,1,Feb 17 2021 12:00AM,A5BE976836624301B1837828423516C5,Whitepaper/Guide,,Vector Solutions Physical Security Webinar,Whitepaper/Guide,NaT,Vector Solutions,,N,R,10.2020.000388,


In [190]:
# most are old, within a few years of '00, but some are fairly recent.  I'll take a look at 
# samples from each group

df_PR[df_PR['WBS1'].str.contains("01.1999.01566B", na=False)] # Urban Engineering, Columbia Pike Trail
df_PR[df_PR['WBS1'].str.contains("01.2001.002715", na=False)] # Armstrong Management, Monument Place
df_PR[df_PR['WBS1'].str.contains("01.2011.715708", na=False)] # 3T International, GSA Inventory Quality Assurance Services, Region 08
df_PR[df_PR['WBS1'].str.contains("05.2015.986001", na=False)] # NPS, CAC BPA 2015 Call Order 140P2019F0280
df_PR[df_PR['WBS1'].str.contains("10.2019.000263", na=False)] # St. Chrysostom's Day School Physical Security
# ['ProjectName']  .iloc[0]

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend,Column A,Column B,GetDate
3854,R05.2015.986001,001,010,Sep 16 2019 12:00AM,NPS,AMIS Amistad NRA,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3857,R05.2015.986001,001,016,Sep 16 2019 12:00AM,NPS,WHIS Whiskeytown NRA,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3858,R05.2015.986001,001,017,Sep 16 2019 12:00AM,NPS,LAVO Lasson Volcanic NP,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3859,R05.2015.986001,001,018,Sep 16 2019 12:00AM,NPS,NACE/GWMP National Capital Parks East / George...,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3860,R05.2015.986001,001,019,Sep 16 2019 12:00AM,NPS,ORPI Organ Pipe Cactus NM,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70195,R05.2015.986001,012,034,Sep 16 2019 12:00AM,NPS,Park #33,,National Park Service CAC BPA 2015 - Call Orde...,Project Management,NaT,National Park Service,,N,R,05.2015.986001,
70196,R05.2015.986001,012,035,Sep 16 2019 12:00AM,NPS,Park #34,,National Park Service CAC BPA 2015 - Call Orde...,Project Management,NaT,National Park Service,,N,R,05.2015.986001,
70197,R05.2015.986001,012,036,Sep 16 2019 12:00AM,NPS,Park #35,,National Park Service CAC BPA 2015 - Call Orde...,Project Management,NaT,National Park Service,,N,R,05.2015.986001,
70198,R05.2015.986001,013,035,Sep 16 2019 12:00AM,NPS,EDIS Thomas Edison NHP,,National Park Service CAC BPA 2015 - Call Orde...,CHOH (PSA) Chesapeake & Ohio Canal National Hi...,NaT,National Park Service,,N,R,05.2015.986001,


In [219]:
df_PR['WBS1'][df_PR['ProjectName'].str.contains("CAC BPA", na=False)].value_counts()

WBS1
R05.2015.986001    520
R05.2015.000986     87
R05.2015.986002     72
P05.2015.000986     31
P03.2014.000739      1
Name: count, dtype: int64

In [None]:
temp_slice = df_PR[df_PR['ProjectName'].str.contains("CAC BPA", na=False)]
temp_slice['Column B'][temp_slice['Column B'].str.contains("986001", na=False)].iloc[0]

In [214]:
df_PR[df_PR['Column B'].str.contains("986001", na=False)]

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend,Column A,Column B,GetDate
3854,R05.2015.986001,001,010,Sep 16 2019 12:00AM,NPS,AMIS Amistad NRA,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3857,R05.2015.986001,001,016,Sep 16 2019 12:00AM,NPS,WHIS Whiskeytown NRA,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3858,R05.2015.986001,001,017,Sep 16 2019 12:00AM,NPS,LAVO Lasson Volcanic NP,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3859,R05.2015.986001,001,018,Sep 16 2019 12:00AM,NPS,NACE/GWMP National Capital Parks East / George...,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3860,R05.2015.986001,001,019,Sep 16 2019 12:00AM,NPS,ORPI Organ Pipe Cactus NM,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70195,R05.2015.986001,012,034,Sep 16 2019 12:00AM,NPS,Park #33,,National Park Service CAC BPA 2015 - Call Orde...,Project Management,NaT,National Park Service,,N,R,05.2015.986001,
70196,R05.2015.986001,012,035,Sep 16 2019 12:00AM,NPS,Park #34,,National Park Service CAC BPA 2015 - Call Orde...,Project Management,NaT,National Park Service,,N,R,05.2015.986001,
70197,R05.2015.986001,012,036,Sep 16 2019 12:00AM,NPS,Park #35,,National Park Service CAC BPA 2015 - Call Orde...,Project Management,NaT,National Park Service,,N,R,05.2015.986001,
70198,R05.2015.986001,013,035,Sep 16 2019 12:00AM,NPS,EDIS Thomas Edison NHP,,National Park Service CAC BPA 2015 - Call Orde...,CHOH (PSA) Chesapeake & Ohio Canal National Hi...,NaT,National Park Service,,N,R,05.2015.986001,


In [215]:
df_PR[df_PR['WBS1'] == "R05.2015.986001"]

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend,Column A,Column B,GetDate
3854,R05.2015.986001,001,010,Sep 16 2019 12:00AM,NPS,AMIS Amistad NRA,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3857,R05.2015.986001,001,016,Sep 16 2019 12:00AM,NPS,WHIS Whiskeytown NRA,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3858,R05.2015.986001,001,017,Sep 16 2019 12:00AM,NPS,LAVO Lasson Volcanic NP,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3859,R05.2015.986001,001,018,Sep 16 2019 12:00AM,NPS,NACE/GWMP National Capital Parks East / George...,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
3860,R05.2015.986001,001,019,Sep 16 2019 12:00AM,NPS,ORPI Organ Pipe Cactus NM,,National Park Service CAC BPA 2015 - Call Orde...,AMIS Amistad NRA,NaT,National Park Service,,N,R,05.2015.986001,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70195,R05.2015.986001,012,034,Sep 16 2019 12:00AM,NPS,Park #33,,National Park Service CAC BPA 2015 - Call Orde...,Project Management,NaT,National Park Service,,N,R,05.2015.986001,
70196,R05.2015.986001,012,035,Sep 16 2019 12:00AM,NPS,Park #34,,National Park Service CAC BPA 2015 - Call Orde...,Project Management,NaT,National Park Service,,N,R,05.2015.986001,
70197,R05.2015.986001,012,036,Sep 16 2019 12:00AM,NPS,Park #35,,National Park Service CAC BPA 2015 - Call Orde...,Project Management,NaT,National Park Service,,N,R,05.2015.986001,
70198,R05.2015.986001,013,035,Sep 16 2019 12:00AM,NPS,EDIS Thomas Edison NHP,,National Park Service CAC BPA 2015 - Call Orde...,CHOH (PSA) Chesapeake & Ohio Canal National Hi...,NaT,National Park Service,,N,R,05.2015.986001,


In [303]:
df_PR[df_PR['WBS1'] == "P05.2015.000986"]

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend,Column A,Column B,GetDate
2036,P05.2015.000986,PROP04,PROP,Oct 1 2015 12:00AM,NPS,Proposal,,National Park Service CAC BPA 2015 (Original) ...,Proposal,NaT,National Park Service,,N,P,05.2015.000986,2222-01-01 00:00:00
2046,P05.2015.000986,PROP05,PROP,Oct 1 2015 12:00AM,NPS,Proposal,,National Park Service CAC BPA 2015 (Original) ...,Proposal,NaT,National Park Service,,N,P,05.2015.000986,2222-01-01 00:00:00
2048,P05.2015.000986,PROP06,PROP,Oct 1 2015 12:00AM,NPS,Proposal,,National Park Service CAC BPA 2015 (Original) ...,Proposal,NaT,National Park Service,,N,P,05.2015.000986,2222-01-01 00:00:00
2566,P05.2015.000986,PROP12,PROP,Oct 1 2015 12:00AM,NPS,Proposal,,National Park Service CAC BPA 2015 (Original) ...,Proposal,NaT,National Park Service,,N,P,05.2015.000986,2222-01-01 00:00:00
2581,P05.2015.000986,PROP13,PROP,Oct 1 2015 12:00AM,NPS,Proposal,,National Park Service CAC BPA 2015 (Original) ...,Proposal,NaT,National Park Service,,N,P,05.2015.000986,2222-01-01 00:00:00
3696,P05.2015.000986,PROP11,PROP,Oct 1 2015 12:00AM,NPS,Proposal,,National Park Service CAC BPA 2015 (Original) ...,Proposal,NaT,National Park Service,,N,P,05.2015.000986,2222-01-01 00:00:00
4467,P05.2015.000986,PROP,PROP,Oct 1 2015 12:00AM,NPS,Proposal,,National Park Service CAC BPA 2015 (Original) ...,Proposal,NaT,National Park Service,,N,P,05.2015.000986,2222-01-01 00:00:00
59241,P05.2015.000986,PROP31,PROP,Oct 1 2015 12:00AM,NPS,Proposal,,National Park Service CAC BPA 2015 (Original) ...,Proposal,NaT,National Park Service,,N,P,05.2015.000986,2222-01-01 00:00:00
63510,P05.2015.000986,PROP02,PROP,Oct 1 2015 12:00AM,NPS,Proposal,,National Park Service CAC BPA 2015 (Original) ...,Proposal,NaT,National Park Service,,N,P,05.2015.000986,2222-01-01 00:00:00
63565,P05.2015.000986,PROP03,PROP,Oct 1 2015 12:00AM,NPS,Proposal,,National Park Service CAC BPA 2015 (Original) ...,Proposal,NaT,National Park Service,,N,P,05.2015.000986,2222-01-01 00:00:00


In [222]:
# ask Ryan about this ^^
# most likely people just skipped steps
# 
# because it's a BPA, we needed to account for all tasks individually
# so kevin rewrote project number to separate from  original project number 000986
# so, R05.2015.986001 corresponds to P05.2015.000986
# get short list together and then get a bit of Kevin's time to discuss in a call

In [245]:
df_PR.columns

Index(['WBS1', 'WBS2', 'WBS3', 'StartDate', 'ClientID', 'Name', 'ContractDate',
       'ProjectName', 'PhaseName', 'EndDate', 'ClientName', 'SpecialtyType',
       'Recommend', 'Column A', 'Column B', 'GetDate'],
      dtype='object')

In [247]:
r_slice['ContractDate'].isna().sum()

7086

In [249]:
r_slice['WBS1'].unique()[110]

'R01.2014.008233'

In [251]:
r_slice['ContractDate'].value_counts()

ContractDate
Dec  1 2011 12:00AM    963
Aug 27 2009 12:00AM    283
Sep 15 2015 12:00AM    220
Nov 17 2008 12:00AM    115
Oct 11 2001 12:00AM    113
                      ... 
Jun 19 2015 12:00AM      1
Jun 25 2015 12:00AM      1
Jan 18 2010 12:00AM      1
Jun 25 2012 12:00AM      1
Dec 30 2009 12:00AM      1
Name: count, Length: 2419, dtype: int64

In [250]:
r_slice[r_slice['WBS1'] == 'R01.2014.008233']

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend,Column A,Column B
860,R01.2014.008233,1,1,Oct 1 2014 12:00AM,FANNIEMAE,Review Existing contract - services,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233
861,R01.2014.008233,1,2,Oct 1 2014 12:00AM,FANNIEMAE,Develop SOW for RFP,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233
862,R01.2014.008233,1,3,Oct 1 2014 12:00AM,FANNIEMAE,Evaluation template/process,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233
863,R01.2014.008233,1,4,Oct 1 2014 12:00AM,FANNIEMAE,Review all proposals,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233
864,R01.2014.008233,1,5,Oct 1 2014 12:00AM,FANNIEMAE,Participate in Interviews,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233
865,R01.2014.008233,1,6,Oct 1 2014 12:00AM,FANNIEMAE,Transition Management,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233
3981,R01.2014.008233,1,999,Oct 1 2014 12:00AM,FANNIEMAE,Reimbursable Expenses,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233
65118,R01.2014.008233,2,1,Oct 1 2014 12:00AM,FANNIEMAE,Transition Management Support - Phase 2,,"Fannie Mae O&M Service Provider Evaluation, Se...",Transition Management Support - Phase 2,NaT,Fannie Mae,,N,R,01.2014.008233


In [269]:
blank_startdates = (df_PR['WBS1'].unique() - df_PR['WBS1'][~df_PR['StartDate'].isna()].unique())

ValueError: operands could not be broadcast together with shapes (15529,) (13601,) 

In [261]:
df_PR[df_PR['WBS1'] == 'R01.2015.861501']

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend,Column A,Column B,GetDate
2066,R01.2015.861501,1,5,,GENERALSRV,Annual Update,Sep 19 2016 12:00AM,Federal Buildings Personnel Training Act Suppo...,Annual Update,NaT,U.S. General Services Administration,,N,R,01.2015.861501,
2278,R01.2015.861501,1,7,,GENERALSRV,Agency Implementation Outreach Services,Sep 19 2016 12:00AM,Federal Buildings Personnel Training Act Suppo...,Annual Update,NaT,U.S. General Services Administration,,N,R,01.2015.861501,
2285,R01.2015.861501,1,6,,GENERALSRV,Additional PM and Meeting,Sep 19 2016 12:00AM,Federal Buildings Personnel Training Act Suppo...,Annual Update,NaT,U.S. General Services Administration,,N,R,01.2015.861501,
58919,R01.2015.861501,1,1,,GENERALSRV,FBPTA Industry Day,Sep 19 2016 12:00AM,Federal Buildings Personnel Training Act Suppo...,Annual Update,NaT,U.S. General Services Administration,,N,R,01.2015.861501,
58920,R01.2015.861501,1,2,,GENERALSRV,Advisory Council 4 Meetings,Sep 19 2016 12:00AM,Federal Buildings Personnel Training Act Suppo...,Annual Update,NaT,U.S. General Services Administration,,N,R,01.2015.861501,
58937,R01.2015.861501,2,2,,GENERALSRV,Internal Reviews,Sep 19 2016 12:00AM,Federal Buildings Personnel Training Act Suppo...,Internal Reviews,NaT,U.S. General Services Administration,,N,R,01.2015.861501,
58938,R01.2015.861501,2,3,,GENERALSRV,Tier 1,Sep 19 2016 12:00AM,Federal Buildings Personnel Training Act Suppo...,Internal Reviews,NaT,U.S. General Services Administration,,N,R,01.2015.861501,
58939,R01.2015.861501,2,4,,GENERALSRV,Tier 2,Sep 19 2016 12:00AM,Federal Buildings Personnel Training Act Suppo...,Internal Reviews,NaT,U.S. General Services Administration,,N,R,01.2015.861501,
58940,R01.2015.861501,2,5,,GENERALSRV,Tier 3,Sep 19 2016 12:00AM,Federal Buildings Personnel Training Act Suppo...,Internal Reviews,NaT,U.S. General Services Administration,,N,R,01.2015.861501,
58941,R01.2015.861501,2,6,,GENERALSRV,Tier 4,Sep 19 2016 12:00AM,Federal Buildings Personnel Training Act Suppo...,Internal Reviews,NaT,U.S. General Services Administration,,N,R,01.2015.861501,


In [255]:
df_PR[df_PR['WBS1'] == 'R01.2014.008233']

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend,Column A,Column B,GetDate
860,R01.2014.008233,1,1,Oct 1 2014 12:00AM,FANNIEMAE,Review Existing contract - services,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233,
861,R01.2014.008233,1,2,Oct 1 2014 12:00AM,FANNIEMAE,Develop SOW for RFP,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233,
862,R01.2014.008233,1,3,Oct 1 2014 12:00AM,FANNIEMAE,Evaluation template/process,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233,
863,R01.2014.008233,1,4,Oct 1 2014 12:00AM,FANNIEMAE,Review all proposals,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233,
864,R01.2014.008233,1,5,Oct 1 2014 12:00AM,FANNIEMAE,Participate in Interviews,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233,
865,R01.2014.008233,1,6,Oct 1 2014 12:00AM,FANNIEMAE,Transition Management,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233,
3981,R01.2014.008233,1,999,Oct 1 2014 12:00AM,FANNIEMAE,Reimbursable Expenses,,"Fannie Mae O&M Service Provider Evaluation, Se...",Review Existing contract - services,NaT,Fannie Mae,,N,R,01.2014.008233,
65118,R01.2014.008233,2,1,Oct 1 2014 12:00AM,FANNIEMAE,Transition Management Support - Phase 2,,"Fannie Mae O&M Service Provider Evaluation, Se...",Transition Management Support - Phase 2,NaT,Fannie Mae,,N,R,01.2014.008233,


In [113]:
# Filter the original DataFrame based on common values in 'Column B'
common_val_slice = df_PR[df_PR['Column B'].isin(common_values)]
# common_val_slice
# 23498 rows with common values

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend,Column A,Column B
0,R01.2012.007235,002,001,Aug 13 2012 12:00AM,CONGBETHEM,Additional Services,Aug 3 2012 12:00AM,Congregation Beth Emeth Reserve Study Update,Additional Services,NaT,Congregation Beth Emeth,,N,R,01.2012.007235
4,R01.2013.007643,002,001,Mar 9 2013 12:00AM,MLEEMAN1248814353279,AREA-DC Portfolio,Mar 1 2013 12:00AM,(CONFIDENTIAL)TIAA-CREF Master Services Agreement,AREA-DC Portfolio,NaT,TIAA-CREF,,N,R,01.2013.007643
8,P01.2013.007752,PROP,PROP,Aug 1 2013 12:00AM,UNIVOFME,Proposal,,University of Maine Business Process Reenginee...,Proposal,NaT,University of Maine System,,N,P,01.2013.007752
14,R09.2011.000224,002,001,Nov 7 2011 12:00AM,RGUTIERREZ1308341176806,Changes to Study,Nov 30 2011 12:00AM,Kingston Plantation – Margate Tower Reserve St...,Changes to Study,NaT,FelCor Lodging Trust,,N,R,09.2011.000224
20,P08.2013.000231,PROP,PROP,Jun 6 2013 12:00AM,A6F22F37CAB94F7686A4A9CF09C42E1C,Proposal,,Azteca Market Mechanical Systems Design & Ener...,Proposal,NaT,Peter J Collins Architects,,N,P,08.2013.000231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80877,R01.2023.010037,001,001,Nov 6 2023 12:00AM,GWU,Inventory and Report,,George Washington University Automatic Door Op...,Inventory and Report,NaT,The George Washington University,,N,R,01.2023.010037
80879,R01.2018.009240,009,001,Apr 9 2018 12:00AM,CFM MGMT,Fieldwork & Reporting,,Chelsea Tower Condominium Balcony Assessment,Fieldwork & Reporting,NaT,"CFM Management Services, Inc.",,N,R,01.2018.009240
80883,R01.2022.009854,001,004,Mar 15 2022 12:00AM,TOWERVILLA,Construction Contract Administration,,Tower Villas Engineering Consulting Services,Garage Slab Analysis,NaT,Tower Villas Condominium,,N,R,01.2022.009854
80890,P01.2023.010033,PROP,PROP,Nov 1 2023 12:00AM,MWEAVER1279714793151,Proposal,,Salisbury University Staffing Analysis 2023,Proposal,NaT,Salisbury University,,N,P,01.2023.010033


In [115]:
## just making sure we've accounted for all other stuff now
# len(common_val_slice['WBS1'].unique()) # 8636 unique WBS1 codes
# len(common_val_slice['WBS1'][common_val_slice['Column A'] == 'P'].unique()) # 4889 total / 4318 unique
# len(common_val_slice['WBS1'][common_val_slice['Column A'] == 'R'].unique()) # 18609 / 4318 unique

print(f"There are {len(common_val_slice['WBS1'].unique())} unique WBS1 codes in the slice.")
print(f"There are {len(common_val_slice['Column B'].unique())} unique codes (only code) in the slice.")
print(f"8636 / 4318 = {8636 / 4318} -- This is exactly half, which is perfect.\n")
print(f"There are {len(df_PR['WBS1'].unique())} unique WBS1 codes in the full dataframe.")
# 15365 = 10435(R) + 4930(P) + 164(OHD)
print(f"There are {len(df_PR['Column B'].unique())} unique codes (only code) in the full dataframe.")
print(f"Subtract (15529 - 11211) and we get {15529 - 11211} again, the number of repeats.  All is well.")

There are 8636 unique WBS1 codes in the slice.
There are 4318 unique codes (only code) in the slice.
8636 / 4318 = 2.0 -- This is exactly half, which is perfect.

There are 15529 unique WBS1 codes in the full dataframe.
There are 11211 unique codes (only code) in the full dataframe.
Subtract (15529 - 11211) and we get 4318 again, the number of repeats.  All is well.


The 23,498 rows above represent the slice of P-codes and their associated R-codes.  In other words, the jobs we got, from a proposal, and the proposal that led to them.

We see exactly the same number of P and R codes in our common_val_slice, and two occurrences for each unique Column B code -- just what we wanted and expected.

Next I'll want to look at sub-slices for individual clients, and then at dates.  There is no need to mark clients as "got" or not before this stage, because if they weren't, they simply won't show up in the 'common_val' slice.  We can go straight from there to getting the date.

In [119]:
## I'm going to add these directly to df_PR at the beginning.  When I mark the 
## right ones with dates I can use the common_val_slice to select the correct subset.

# Add the column
df_PR['GetDate'] = None
# All P codes will assume not-got until we assign them as got
df_PR['GetDate'][df_PR['Column A'] == 'P'] = False

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_PR['GetDate'][df_PR['Column A'] == 'P'] = False


In [131]:
# df_PR['WBS1'][df_PR['WBS1'].isin(common_val_slice['WBS1'].unique())].unique().shape
# 8636 rows, this is the correct slice

# if the WBS1 is in the list of all unique WBS1's in common_val_slice, mark them for later.
got_jobs_indices = df_PR[df_PR['WBS1'].isin(common_val_slice['WBS1'].unique())].index

# now strip it to only the P#'s'
got_jobs_indices = got_jobs_indices[df_PR.loc[got_jobs_indices, 'Column A'] == 'P']

# finally, assign a placeholder value for getdate
df_PR.loc[got_jobs_indices, 'GetDate'] = pd.to_datetime('2222-01-01')

In [223]:
print(f"{df_PR['GetDate'].value_counts()[0]} False: P's with no associated R;\n\
{df_PR['GetDate'].value_counts()[1]} placeholder datetime for got jobs; and\n\
{df_PR['GetDate'].isna().sum()} nulls: all R#s")

10123 False: P's with no associated R;
4889 placeholder datetime for got jobs; and
26998 nulls: all R#s


  print(f"{df_PR['GetDate'].value_counts()[0]} False: P's with no associated R;\n\
  {df_PR['GetDate'].value_counts()[1]} placeholder datetime for got jobs; and\n\


In [228]:
df_PR['GetDate'].iloc[8]

Timestamp('2222-01-01 00:00:00')

In [232]:
temp_slice = df_PR[df_PR['GetDate'] == df_PR['GetDate'].iloc[8]]
len(temp_slice['WBS1'].unique())

4318

In [235]:
got_jobs_indices[0:10]

Index([8, 20, 22, 24, 32, 41, 47, 120, 172, 189], dtype='int64')

In [236]:
df_PR.loc[20]

WBS1                                               P08.2013.000231
WBS2                                                          PROP
WBS3                                                          PROP
StartDate                                      Jun  6 2013 12:00AM
ClientID                          A6F22F37CAB94F7686A4A9CF09C42E1C
Name                                                      Proposal
ContractDate                                                   NaN
ProjectName      Azteca Market Mechanical Systems Design & Ener...
PhaseName                                                 Proposal
EndDate                                                        NaT
ClientName                              Peter J Collins Architects
SpecialtyType                                                  NaN
Recommend                                                        N
Column A                                                         P
Column B                                            08.2013.00

In [243]:
new_index_list = [i for i in range(0, len(got_jobs_indices)) if len(df_PR[df_PR['WBS1'] == df_PR['WBS1'].loc[got_jobs_indices[i]]]) > 1]
len(new_index_list)

805

In [244]:
new_index_list[0]

8

In [239]:
df_PR[df_PR['WBS1'] == df_PR['WBS1'].loc[got_jobs_indices[2]]]

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend,Column A,Column B,GetDate
22,P01.2013.007753,PROP,PROP,Jun 10 2013 12:00AM,GDOWREY1204373827790,Proposal,,Columbia St. Mary's Hospital Fall Protection C...,Proposal,NaT,Duke Realty Corporation,,N,P,01.2013.007753,2222-01-01 00:00:00


In [123]:
# filtered_indexes_p = 


Index([    8,    20,    22,    24,    32,    41,    47,   120,   172,   189,
       ...
       80707, 80712, 80724, 80748, 80754, 80761, 80775, 80831, 80890, 80902],
      dtype='int64', length=4889)

In [129]:
df_PR.loc[22]

WBS1                                               P01.2013.007753
WBS2                                                          PROP
WBS3                                                          PROP
StartDate                                      Jun 10 2013 12:00AM
ClientID                                      GDOWREY1204373827790
Name                                                      Proposal
ContractDate                                                   NaN
ProjectName      Columbia St. Mary's Hospital Fall Protection C...
PhaseName                                                 Proposal
EndDate                                                        NaT
ClientName                                 Duke Realty Corporation
SpecialtyType                                                  NaN
Recommend                                                        N
Column A                                                         P
Column B                                            01.2013.00

In [None]:
# df_PR['GetDate'].value_counts()

# df_PR['GetDate'][df_PR['Column A'] == 'P'] = False

In [244]:
df_PR['Column A'][df_PR['GotClient'] == True].value_counts()

Column A
R    18609
P     4889
Name: count, dtype: int64

In [245]:
# now we have "True" values in our R columns, which we want to just be null, obviously we got it if there's an R code
df_PR['GotClient'][df_PR['Column A'] == 'R'] = None

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_PR['GotClient'][df_PR['Column A'] == 'R'] = None


In [246]:
df_PR['Column A'][df_PR['GotClient'] == True].value_counts()
# and that's fixed

Column A
P    4889
Name: count, dtype: int64

#### CHECKPOINT: Jan 10
Now we can move on to getting the dates

In [111]:
repeated_codes

4318

In [91]:
filtered_rows = df_PR['Column B'].unique()
# out of these 11211, which ones are R and which are P
# slim to just the R's
df_PR[df_PR]
df_PR[df_PR['Column B'] ]

array(['01.2012.007235', '01.2013.007643', '01.2013.007752', ...,
       '05.2023.001341', '11.2023.000028', '11.2023.000031'], dtype=object)

In [94]:
print(len(df_PR['WBS1'][df_PR['Column A'] == 'R'].unique()))
print(len(df_PR['WBS1'][df_PR['Column A'] == 'R'].unique()) - )

4930

This is a little weird, why are there more unique R codes than there are repeated R & P codes?  This means 

In [16]:
# Create a list to store the data
temp_list = []

# Iterate over unique values in 'Column B' -- the WBS code minus first
for column_b_value in df_PR['Column B'].unique():
    # Filter rows in the original DataFrame based on 'Column B' value
    filtered_rows = df_PR[df_PR['Column B'] == column_b_value]

    # Check if any row has 'Column A' value of 'R'
    bid_won = 'Y' if 'R' in filtered_rows['Column A'].values else 'N'

    # Get the 'ClientID' from the first row
    client_id = filtered_rows.iloc[0]['ClientID']

    # Append the data to the list
    temp_list.append([column_b_value, bid_won, client_id])

# Create df from the collected data
# df = pd.DataFrame(temp_list, columns=['Column B', 'BidWon', 'ClientID'])

In [17]:
df2.head()

Unnamed: 0,Column B,BidWon,ClientID
0,01.2012.007235,Y,CONGBETHEM
1,01.2013.007643,Y,MLEEMAN1248814353279
2,01.2013.007752,Y,UNIVOFME
3,08.2013.000226,N,61766EBBFF364DD491597C9B2A54F3C3
4,01.2013.007754,N,TLARSON1231196287172


In [18]:
df1['StartDate'].isna().sum()

1946

In [19]:
df_CL.columns

Index(['ClientID', 'ClientName', 'SpecialtyType', 'Recommend',
       'ClientCreated'],
      dtype='object')

Now to get P and R counts seperately as their own columns, added to DF_CL

In [20]:
# getting the count of unique projects for a client
len(df1[df1['ClientID'] == 'UNIVOFME']['WBS1'].unique())

11

In [21]:
df2['R_Count'] = None
df2['P_Count'] = None

In [22]:
# now get separate P and R counts
df_temp = df1[df1['ClientID'] == 'UNIVOFME']
temp_counts = df_temp['Column A'].value_counts()
df2['P_Count'][df2['ClientID'] == 'UNIVOFME'] = temp_counts[0]
df2['R_Count'][df2['ClientID'] == 'UNIVOFME'] = temp_counts[1]

  df2['P_Count'][df2['ClientID'] == 'UNIVOFME'] = temp_counts[0]
  df2['R_Count'][df2['ClientID'] == 'UNIVOFME'] = temp_counts[1]


In [23]:
df2[df2['ClientID'] == 'UNIVOFME']

Unnamed: 0,Column B,BidWon,ClientID,R_Count,P_Count
2,01.2013.007752,Y,UNIVOFME,5,6
536,01.2015.008619,N,UNIVOFME,5,6
1017,01.2014.008308,Y,UNIVOFME,5,6
3209,01.2006.004953,Y,UNIVOFME,5,6
9330,01.2014.008055,Y,UNIVOFME,5,6
9736,01.2023.010031,Y,UNIVOFME,5,6


Now I need to bring in the data from CL, mapped accordingly to ClientIDs

In [24]:
client_list = df1['ClientName'].unique()
client_id_list = df1['ClientID'].unique()

In [25]:
# suppress annoying warning message spam
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('mode.chained_assignment', None)

In [26]:
# define a column for acquisition dates
df2['DateAcquired'] = None

for client_id in client_id_list:
    CL_rows = df2[df2['ClientID'] == client_id]
    # print(len(CL_rows))
    PR_slice = df1[df1['ClientID'] == client_id]
    client_projects = PR_slice['WBS1'].value_counts().index
    # print(client_projects)
    
    R_codes = [project for project in client_projects if "R" in project]
    P_codes = [project for project in client_projects if "P" in project]
    
    result_dict = {}
    for code in R_codes:
        # Filter DataFrame for the current R_code
        one_project = df1[df1['WBS1'] == code]

        # Find the minimum StartDate for the current group
        one_project['StartDate'] = one_project['StartDate'].astype(str)        
        min_start_date = one_project['StartDate'].min()
        # print(f"Client: {client_id}, Code: {code}:\n{min_start_date}")
        ## looks like that's working correctly now
                
        # Store the result in the dictionary
        result_dict[code] = min_start_date
        
        # Get the earliest R code date.
        if not result_dict:
            pass
        else:
            acquire_date = min(result_dict.items(), key=operator.itemgetter(1))
            # next steps?

        # set that value in df_CL as DateAcquired
        df2['DateAcquired'][df2['ClientID'] == client_id] = acquire_date[1]

In [21]:
df1.head()

Unnamed: 0,WBS1,StartDate,ClientID,Name,ContractDate,ProjectName,EndDate,ClientName,SpecialtyType,Recommend,Column A,Column B
0,R01.2012.007235,Aug 13 2012 12:00AM,CONGBETHEM,Additional Services,Aug 3 2012 12:00AM,Congregation Beth Emeth Reserve Study Update,NaT,Congregation Beth Emeth,,N,R,01.2012.007235
4,R01.2013.007643,Mar 9 2013 12:00AM,MLEEMAN1248814353279,AREA-DC Portfolio,Mar 1 2013 12:00AM,(CONFIDENTIAL)TIAA-CREF Master Services Agreement,NaT,TIAA-CREF,,N,R,01.2013.007643
8,P01.2013.007752,Aug 1 2013 12:00AM,UNIVOFME,Proposal,,University of Maine Business Process Reenginee...,NaT,University of Maine System,,N,P,01.2013.007752
10,P08.2013.000226,Aug 5 2013 12:00AM,61766EBBFF364DD491597C9B2A54F3C3,Proposal,,Portland Hilton Energy Audit,NaT,Portland Hilton,,N,P,08.2013.000226
12,P01.2013.007754,Jul 1 2013 12:00AM,TLARSON1231196287172,Proposal,,Christ the King Catholic Church Engineering Co...,NaT,Grosvenor Park III,,N,P,01.2013.007754


In [26]:
### some version of this will give me my 'BidWonDate'
for client in client_list:
    CL_row = df2[df2['ClientID'] == client]
    PR_slice = df_PR[df_PR['ClientID'] == client]
    client_projects = PR_slice['WBS1'].value_counts().index
    
    R_codes = [project for project in client_projects if "R" in project]
    P_codes = [project for project in client_projects if "P" in project]
    
    result_dict = {}
    for code in R_codes:
        # Filter DataFrame for the current R_code
        one_project = df1[df1['WBS1'] == code]

        # Find the minimum StartDate for the current group
        min_start_date = one_project['StartDate'].min()

        # Store the result in the dictionary
        result_dict[code] = min_start_date
        
    # Get the earliest R code date.
    if not result_dict:
        pass
    else:
        acquire_date = min(result_dict.items(), key=lambda x: x[1])
        # rest of your code

    # set that value in df2 as DateAcquired
    df2['DateAcquired'][df2['ClientID'] == client] = acquire_date[1]

TypeError: '<' not supported between instances of 'str' and 'float'

In [28]:
result_dict.items()

dict_items([('R01.2006.004739', nan), ('R01.2008.005932', 'Nov  6 2008 12:00AM')])

The above was my original code to do this, unsure why it doesn't work now.  Working on troubleshooting.

(one-off attempts below)

In [26]:
## Can use these as test cases
# JLL / SSI
# 3TI
# CBRE
# NPS
client_test_list = ['JLL / SSI', '3TI', 'CBRE', 'NPS']

In [28]:
df_CL['DateAcquired'] = None

for client in client_test_list:
    CL_rows = df_CL[df_CL['ClientID'] == client]
    # print(len(CL_rows))
    PR_slice = df_PR[df_PR['ClientID'] == client]
    client_projects = PR_slice['WBS1'].value_counts().index
    # print(client_projects)
    
    R_codes = [project for project in client_projects if "R" in project]
    P_codes = [project for project in client_projects if "P" in project]
    
    result_dict = {}
    for code in R_codes:
        # Filter DataFrame for the current R_code
        one_project = df_PR[df_PR['WBS1'] == code]

        # Find the minimum StartDate for the current group
        one_project['StartDate'] = one_project['StartDate'].astype(str)        
        min_start_date = one_project['StartDate'].min()
        # print(f"Client: {client}, Code: {code}:\n{min_start_date}")
        ## looks like that's working correctly now
                
        # Store the result in the dictionary
        result_dict[code] = min_start_date
        
        # Get the earliest R code date.
        if not result_dict:
            pass
        else:
            acquire_date = min(result_dict.items(), key=operator.itemgetter(1))
            # next steps?

        # set that value in df_CL as DateAcquired
        df_CL['DateAcquired'][df_CL['ClientID'] == client] = acquire_date[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

In [33]:
# df_CL[df_CL['ClientID'] == 'JLL / SSI']
# df_CL[df_CL['ClientID'] == '3TI']
# df_CL[df_CL['ClientID'] == 'CBRE']
# df_CL[df_CL['ClientID'] == 'NPS']

# df_PR[df_PR['ClientID'] == 'JLL / SSI']
# df_PR[df_PR['ClientID'] == '3TI']
# df_PR[df_PR['ClientID'] == 'CBRE']
# df_PR[df_PR['ClientID'] == 'NPS']

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend
125,R05.2012.000820,004,001,Dec 27 2012 12:00AM,JLL / SSI,Design Level Evaluation,Jan 14 2013 12:00AM,200 S Biscayne Southeast Financial Center Engi...,Design Level Evaluation,NaT,Jones Lang LaSalle Incorporated,,
126,R05.2012.000820,004,002,Dec 27 2012 12:00AM,JLL / SSI,Construction Documents,Jan 14 2013 12:00AM,200 S Biscayne Southeast Financial Center Engi...,Design Level Evaluation,NaT,Jones Lang LaSalle Incorporated,,
127,R05.2012.000820,004,003,Dec 27 2012 12:00AM,JLL / SSI,Bidding Services,Jan 14 2013 12:00AM,200 S Biscayne Southeast Financial Center Engi...,Design Level Evaluation,NaT,Jones Lang LaSalle Incorporated,,
128,R05.2012.000820,004,004,Dec 27 2012 12:00AM,JLL / SSI,Contract Administration Services,Jan 14 2013 12:00AM,200 S Biscayne Southeast Financial Center Engi...,Design Level Evaluation,NaT,Jones Lang LaSalle Incorporated,,
129,R05.2012.000820,004,005,Dec 27 2012 12:00AM,JLL / SSI,Fall Protection Load Testing,Jan 14 2013 12:00AM,200 S Biscayne Southeast Financial Center Engi...,Design Level Evaluation,NaT,Jones Lang LaSalle Incorporated,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
78783,R01.2014.008101,001,001,May 5 2014 12:00AM,JLL / SSI,Presentation,,JLL 2014 Engineering Operations Conference,Presentation,NaT,Jones Lang LaSalle Incorporated,,
78871,P08.2015.000340,PROP,PROP,Aug 5 2015 12:00AM,JLL / SSI,Proposal,,JLL 1515 Poydras St. Chiller No.2 Repair Evalu...,Proposal,NaT,Jones Lang LaSalle Incorporated,,
79143,R01.2018.009401,001,001,Nov 26 2018 12:00AM,JLL / SSI,3rd Floor Water Infiltration,Dec 7 2018 12:00AM,9755 Patuxent Woods Drive Engineering Consulti...,3rd Floor Water Infiltration,NaT,Jones Lang LaSalle Incorporated,,
80267,R01.2018.009311,001,001,Jul 9 2018 12:00AM,JLL / SSI,Fieldwork & Reporting,,1940 and 2000 Duke Street Roof Consulting,Fieldwork & Reporting,NaT,Jones Lang LaSalle Incorporated,,


In [83]:
# df_CL[df_CL['ClientID'] == 'CBRE']
# df_PR[df_PR['ClientID'] == '3TI']
## these values are earliest project not earliest R# so they don't match above, just a more general check
# df_PR['StartDate'][df_PR['ClientID'] == "JLL / SSI"].unique() # Oct 2 2001 12:00AM
# df_PR['StartDate'][df_PR['ClientID'] == "3TI"].value_counts() # Nov 14 2011 12:00AM
# df_PR['StartDate'][df_PR['ClientID'] == "CBRE"].unique() # May 24 2002 12:00AM
# that's way earlier, I should double check this one
# df_PR['StartDate'][df_PR['ClientID'] == "NPS"].unique() # Aug 9 2004 12:00AM'

array(['Apr  1 2010 12:00AM', 'Oct  1 2015 12:00AM',
       'Aug 16 2010 12:00AM', 'Sep 16 2019 12:00AM',
       'Nov  1 2019 12:00AM', 'Dec 31 2020 12:00AM', nan,
       'Dec 30 2004 12:00AM', 'Aug  9 2004 12:00AM',
       'Apr  1 2007 12:00AM', 'Dec  2 2009 12:00AM',
       'Oct  1 2007 12:00AM', 'Jan 25 2010 12:00AM',
       'Oct  6 2004 12:00AM', 'Jan  3 2005 12:00AM',
       'Oct  9 2006 12:00AM', 'Oct 12 2004 12:00AM',
       'Feb 24 2010 12:00AM', 'Sep  4 2020 12:00AM',
       'Aug  4 2020 12:00AM', 'Oct  1 2021 12:00AM',
       'Sep  1 2020 12:00AM', 'Mar 15 2020 12:00AM',
       'Oct  1 2020 12:00AM', 'Sep  1 2021 12:00AM',
       'Apr  6 2020 12:00AM', 'Sep  7 2020 12:00AM',
       'Sep 26 2014 12:00AM', 'Mar 20 2020 12:00AM',
       'Oct  1 2022 12:00AM', 'Sep  1 2019 12:00AM',
       'Oct  1 2023 12:00AM', 'Jun  1 2021 12:00AM',
       'Sep 15 2017 12:00AM', 'Aug 15 2022 12:00AM',
       'Jun  1 2014 12:00AM', 'Sep  1 2015 12:00AM',
       'Sep  1 2022 12:00AM', 'Jan  1 202

Looks like it passed the test, so let's do this for real.

array(['MTOOMEY1252935719500', 'MPORTALATIN1209569252051', 'HILTONMINN',
       ..., '5D8B5763E1154ECD8AE7F0D539B0DBA2',
       '0F61DB021B04413AB1C3C0648AA5F66D', 'EPARRENT1248969111483'],
      dtype=object)

In [10]:
client_list
df_CL['DateAcquired'] = None

for client in client_test_list:
    CL_rows = df_CL[df_CL['ClientID'] == client]
    # print(len(CL_rows))
    PR_slice = df_PR[df_PR['ClientID'] == client]
    client_projects = PR_slice['WBS1'].value_counts().index
    # print(client_projects)
    
    R_codes = [project for project in client_projects if "R" in project]
    P_codes = [project for project in client_projects if "P" in project]
    
    result_dict = {}
    for code in R_codes:
        # Filter DataFrame for the current R_code
        one_project = df_PR[df_PR['WBS1'] == code]

        # Find the minimum StartDate for the current group
        one_project['StartDate'] = one_project['StartDate'].astype(str)        
        min_start_date = one_project['StartDate'].min()
        # print(f"Client: {client}, Code: {code}:\n{min_start_date}")
        ## looks like that's working correctly now
                
        # Store the result in the dictionary
        result_dict[code] = min_start_date
        
        # Get the earliest R code date.
        if not result_dict:
            pass
        else:
            acquire_date = min(result_dict.items(), key=operator.itemgetter(1))
            # next steps?

        # set that value in df_CL as DateAcquired
        df_CL['DateAcquired'][df_CL['ClientID'] == client] = acquire_date[1]

NameError: name 'client_list' is not defined

In [41]:
df_PR['StartDate'][df_PR['WBS1'] == 'R01.2004.004058'].value_counts()

Oct 11 2004 12:00AM    5
Aug  4 2004 12:00AM    2
Sep  7 2004 12:00AM    2
Sep 17 2004 12:00AM    2
Name: StartDate, dtype: int64

In [34]:
df_PR[df_PR['ClientID'] == 'CBRE']['WBS1'].value_counts().index

Index(['R01.2003.003510', 'R05.2007.000289', 'R01.2001.002608',
       'R07.2005.000005', 'R06.2004.000022', 'P05.2008.000438',
       'R01.2005.004544', 'R05.2013.000850', 'R05.2008.000436',
       'R01.2005.004583',
       ...
       'P05.2008.000436', 'P01.2014.008180', 'P01.2009.006091',
       'P01.2009.006133', 'P01.2009.006159', 'P01.2009.006183',
       'P05.2006.000231', 'P05.2005.000109', 'P03.2011.000469',
       'R08.2015.000363'],
      dtype='object', length=179)

In [26]:
CL_rows

Unnamed: 0,ClientID,ClientName,SpecialtyType,Recommend,ClientCreated
33,3TI,"3T International, Inc.",FM,N,Oct 24 2011 12:51PM


In [None]:

client_projects = PR_slice['WBS1'].value_counts().index

R_codes = [project for project in client_projects if "R" in project]
P_codes = [project for project in client_projects if "P" in project]

result_dict = {}
for code in R_codes:

### Then, what labor happened before the acquisition date


In [None]:
## LD should have dates as part of it's info
# ((later, add looped steps for phase and task, we'll need those totals too, but they have to happen later.))
# slicing down to the project and the pre-acquisition date is the first step

# zero in: first, LD entries for a given WBS1


#### Include OHD labor


In [None]:
### then, calculate at 

## CLTV
Client Lifetime Value
- Break down by customer segment, associate like with like
- Project profitability?  (Ask Angie)
- Look at distribution of project cycles - how many years between projects to estimate when life cycle "ends"
- Clients with at least one R number are "acquired"
- May want to break down P by phases?
- for any task what is the period between asks for money, based on billing dates.
- periods between bills in a given project?

### Client Lifetimes
- Calendar length of lifetime
- labor hours on client post-acquisition
 - further divided by P/R
- \# of projects during lifetime
- calculate per-client lifetime values, then get avg
- further subdivide by segment or similar

### ARPC
- Maybe profit per, not revenue
- also break out by segment / or type of work

In [38]:
# do I need to calculate gross revenue then get profit from that?
# or can I assess profit directly somehow?
## Go looking through AP/AR
## Also ask Angie
## Vision should have gross profit in some form

#### Revenue/Profit per Project, also

## Misc Metrics

### Conversions

### Churn
- look at this year by year

Linreg model - can we attribute any predictive power to details we know about a client in the early stages?

Segment, location, hours in acquisition, particular people putting in acquisition labor perhaps?

Month of the year in which we do the labor is likely also a factor.

##### Miscellaneous one-off code cells
Below: checking addresses of certain clients/projects to determine whether they were same client or no

In [5]:
df = pd.read_excel('Vision_Data/Vision Data Master Copy.xlsx', sheet_name="PR")

In [20]:
df.columns[40:50]

Index(['DefaultTaskType', 'VersionID', 'ContactID', 'CLBillingAddr',
       'LongName', 'Address1', 'Address2', 'Address3', 'City', 'State'],
      dtype='object')

In [33]:
# df['Address1'][df['ClientID'].str.contains('EPARRENT1178828594744', na=False)]
# df['Address1'][df['ClientID'].str.contains('830F1297FCE644CCA2088FB79F07BD25', na=False)]
# df['Address2'][df['ClientID'].str.contains('EPARRENT1178828594744', na=False)]
# df['Address2'][df['ClientID'].str.contains('830F1297FCE644CCA2088FB79F07BD25', na=False)]
# df['Address3'][df['ClientID'].str.contains('EPARRENT1178828594744', na=False)]
# df['Address3'][df['ClientID'].str.contains('830F1297FCE644CCA2088FB79F07BD25', na=False)]
# df['City'][df['ClientID'].str.contains('EPARRENT1178828594744', na=False)]
# df['City'][df['ClientID'].str.contains('830F1297FCE644CCA2088FB79F07BD25', na=False)]
# df['State'][df['ClientID'].str.contains('EPARRENT1178828594744', na=False)]
# df['State'][df['ClientID'].str.contains('830F1297FCE644CCA2088FB79F07BD25', na=False)]

56549    MD
56550    MD
56551    MD
56552    MD
59937    MD
62154    MD
62155    MD
65020    MD
68905    MD
68906    MD
72517    MD
Name: State, dtype: object

In [13]:
df1 = pd.read_csv('Vision_Data/SQL/Vision Contacts.csv')

  df1 = pd.read_csv('Vision_Data/SQL/Vision Contacts.csv')


In [14]:
df1.columns

Index(['ContactID', 'ClientID', 'CLAddress', 'Vendor', 'VEAddress', 'Type',
       'LastName', 'FirstName', 'MiddleName', 'Salutation', 'Suffix', 'Title',
       'Addressee', 'Address1', 'Address2', 'Address3', 'Address4', 'City',
       'State', 'ZIP', 'Country', 'Phone', 'Fax', 'Pager', 'CellPhone',
       'HomePhone', 'EMail', 'MailingAddress', 'Billing', 'PrimaryInd',
       'ContactStatus', 'PreferredName', 'CustomCurrencyCode', 'Source',
       'CreateUser', 'CreateDate', 'ModUser', 'ModDate', 'PhoneFormat',
       'FaxFormat', 'PagerFormat', 'CellPhoneFormat', 'HomePhoneFormat',
       'AjeraSync', 'TLInternalKey', 'TLSyncModDate'],
      dtype='object')

In [15]:
df1 = df[df['Vendor'].isna()]

In [16]:
df1.shape

(25392, 46)

In [17]:
len(df1['ClientID'].unique())

5739

In [18]:
df2

NameError: name 'df2' is not defined