In [1]:
import operator
import pandas as pd, numpy as np, matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
df_CL = pd.read_pickle('Vision_Data/working_data/CL_WIPCOPY_1220.pkl')
df_PR = pd.read_pickle('Vision_Data/working_data/PR_WIPCOPY_1220.pkl')

Now to identify some potentially interesting features

In [3]:
df_PR.columns

Index(['WBS1', 'WBS2', 'WBS3', 'StartDate', 'ClientID', 'Name', 'ContractDate',
       'ProjectName', 'PhaseName', 'EndDate', 'ClientName', 'SpecialtyType',
       'Recommend'],
      dtype='object')

In [4]:
df1 = df_PR[['WBS1', 'StartDate', 'ClientID', 'Name', 'ContractDate',
       'ProjectName', 'EndDate', 'ClientName', 'SpecialtyType',
       'Recommend']]

In [5]:
# just the top level for now
df1.drop_duplicates(subset=['WBS1'], inplace=True)
# Drop rows where 'WBS1' contains 'OHD'
df1 = df1[~df1['WBS1'].str.contains('OHD')]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop_duplicates(subset=['WBS1'], inplace=True)


#### Basic feature engineering
- Did we win the project? Y/N
 - How many hours on the P side?
 - How does that compare to the R?
- Did we win the client?
 - When did we win the client?
  - How many hours did we put in to win them?
- Total number projects associated with client
 - projects per year

## CAC
Customer Acquisition Cost
- date R number created is when "acquired"
- acquisition labor is not consistently associated with "prop" and project codes esp. further back.
- All BDev labor not assoc. divided evenly between projects?
- identify individuals who worked on what project and divide based on that.
- limit to past 5 years or less

In [39]:
## May want to find a way to find when we have same contact with different client (ids or names)
## date of acquisition may make one client actually two

### First, need to determine when client is "acquired"

In [6]:
# Create 'Column A' and 'Column B' based on 'WBS1' to show proposals vs. projects
df1['Column A'] = df1['WBS1'].str[0] # P or R
df1['Column B'] = df1['WBS1'].str[1:] # the rest of the code

In [7]:
# Create a list to store the data for df2
temp_list = []

# Iterate over unique values in 'Column B'
for column_b_value in df1['Column B'].unique():
    # Filter rows in the original DataFrame based on 'Column B' value
    filtered_rows = df1[df1['Column B'] == column_b_value]

    # Check if any row has 'Column A' value of 'R'
    bid_won = 'Y' if 'R' in filtered_rows['Column A'].values else 'N'

    # Get the 'ClientID' from the first row
    client_id = filtered_rows.iloc[0]['ClientID']

    # Append the data to the list
    temp_list.append([column_b_value, bid_won, client_id])

# Create df2 from the collected data
df2 = pd.DataFrame(temp_list, columns=['Column B', 'BidWon', 'ClientID'])

In [8]:
df2.head()

Unnamed: 0,Column B,BidWon,ClientID
0,01.2012.007235,Y,CONGBETHEM
1,01.2013.007643,Y,MLEEMAN1248814353279
2,01.2013.007752,Y,UNIVOFME
3,08.2013.000226,N,61766EBBFF364DD491597C9B2A54F3C3
4,01.2013.007754,N,TLARSON1231196287172


In [9]:
df1['StartDate'].isna().sum()

1946

In [10]:
df_CL.columns

Index(['ClientID', 'ClientName', 'SpecialtyType', 'Recommend',
       'ClientCreated'],
      dtype='object')

Now to get P and R counts seperately as their own columns, added to DF_CL

In [18]:
# getting the count of unique projects for a client
len(df1[df1['ClientID'] == 'UNIVOFME']['WBS1'].unique())

11

In [21]:
df2['R_Count'] = None
df2['P_Count'] = None

In [23]:
# now get separate P and R counts
df_temp = df1[df1['ClientID'] == 'UNIVOFME']
temp_counts = df_temp['Column A'].value_counts()
df2['P_Count'][df2['ClientID'] == 'UNIVOFME'] = temp_counts[0]
df2['R_Count'][df2['ClientID'] == 'UNIVOFME'] = temp_counts[1]

In [24]:
df2[df2['ClientID'] == 'UNIVOFME']

Unnamed: 0,Column B,BidWon,ClientID,R_Count,P_Count
2,01.2013.007752,Y,UNIVOFME,5,6
536,01.2015.008619,N,UNIVOFME,5,6
1017,01.2014.008308,Y,UNIVOFME,5,6
3209,01.2006.004953,Y,UNIVOFME,5,6
9330,01.2014.008055,Y,UNIVOFME,5,6
9736,01.2023.010031,Y,UNIVOFME,5,6


Now I need to bring in the data from CL, mapped accordingly to ClientIDs

In [25]:
client_list = df1['ClientName'].unique()
client_id_list = df1['ClientID'].unique()

In [37]:
# suppress annoying warning message spam
pd.options.mode.chained_assignment = None  # default='warn'

df2['DateAcquired'] = None

for client_id in client_id_list:
    CL_rows = df2[df2['ClientID'] == client_id]
    # print(len(CL_rows))
    PR_slice = df1[df1['ClientID'] == client_id]
    client_projects = PR_slice['WBS1'].value_counts().index
    # print(client_projects)
    
    R_codes = [project for project in client_projects if "R" in project]
    P_codes = [project for project in client_projects if "P" in project]
    
    result_dict = {}
    for code in R_codes:
        # Filter DataFrame for the current R_code
        one_project = df1[df1['WBS1'] == code]

        # Find the minimum StartDate for the current group
        one_project['StartDate'] = one_project['StartDate'].astype(str)        
        min_start_date = one_project['StartDate'].min()
        # print(f"Client: {client_id}, Code: {code}:\n{min_start_date}")
        ## looks like that's working correctly now
                
        # Store the result in the dictionary
        result_dict[code] = min_start_date
        
        # Get the earliest R code date.
        if not result_dict:
            pass
        else:
            acquire_date = min(result_dict.items(), key=operator.itemgetter(1))
            # next steps?

        # set that value in df_CL as DateAcquired
        df2['DateAcquired'][df2['ClientID'] == client_id] = acquire_date[1]

In [43]:
### some version of this will give me my 'BidWonDate'
for client in client_list:
    CL_row = df_CL[df_CL['ClientID'] == client]
    PR_slice = df_PR[df_PR['ClientID'] == client]
    client_projects = PR_slice['WBS1'].value_counts().index
    
    R_codes = [project for project in client_projects if "R" in project]
    P_codes = [project for project in client_projects if "P" in project]
    
    result_dict = {}
    for code in R_codes:
        # Filter DataFrame for the current R_code
        one_project = df[df['WBS1'] == code]

        # Find the minimum StartDate for the current group
        min_start_date = one_project['StartDate'].min()

        # Store the result in the dictionary
        result_dict[code] = min_start_date
        
    # Get the earliest R code date.
    if not result_dict:
        pass
    else:
        acquire_date = min(result_dict.items(), key=lambda x: x[1])
        # rest of your code

    # set that value in df_CL as DateAcquired
    df_CL['DateAcquired'][df_CL['ClientID'] == client] = acquire_date[1]

TypeError: '<=' not supported between instances of 'str' and 'float'

The above was my original code to do this, unsure why it doesn't work now.  Working on troubleshooting.

(one-off attempts below)

In [26]:
## Can use these as test cases
# JLL / SSI
# 3TI
# CBRE
# NPS
client_test_list = ['JLL / SSI', '3TI', 'CBRE', 'NPS']

In [28]:
df_CL['DateAcquired'] = None

for client in client_test_list:
    CL_rows = df_CL[df_CL['ClientID'] == client]
    # print(len(CL_rows))
    PR_slice = df_PR[df_PR['ClientID'] == client]
    client_projects = PR_slice['WBS1'].value_counts().index
    # print(client_projects)
    
    R_codes = [project for project in client_projects if "R" in project]
    P_codes = [project for project in client_projects if "P" in project]
    
    result_dict = {}
    for code in R_codes:
        # Filter DataFrame for the current R_code
        one_project = df_PR[df_PR['WBS1'] == code]

        # Find the minimum StartDate for the current group
        one_project['StartDate'] = one_project['StartDate'].astype(str)        
        min_start_date = one_project['StartDate'].min()
        # print(f"Client: {client}, Code: {code}:\n{min_start_date}")
        ## looks like that's working correctly now
                
        # Store the result in the dictionary
        result_dict[code] = min_start_date
        
        # Get the earliest R code date.
        if not result_dict:
            pass
        else:
            acquire_date = min(result_dict.items(), key=operator.itemgetter(1))
            # next steps?

        # set that value in df_CL as DateAcquired
        df_CL['DateAcquired'][df_CL['ClientID'] == client] = acquire_date[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_project['StartDate'] = one_project['StartDate'].astype(str)
A value is tr

In [33]:
# df_CL[df_CL['ClientID'] == 'JLL / SSI']
# df_CL[df_CL['ClientID'] == '3TI']
# df_CL[df_CL['ClientID'] == 'CBRE']
# df_CL[df_CL['ClientID'] == 'NPS']

# df_PR[df_PR['ClientID'] == 'JLL / SSI']
# df_PR[df_PR['ClientID'] == '3TI']
# df_PR[df_PR['ClientID'] == 'CBRE']
# df_PR[df_PR['ClientID'] == 'NPS']

Unnamed: 0,WBS1,WBS2,WBS3,StartDate,ClientID,Name,ContractDate,ProjectName,PhaseName,EndDate,ClientName,SpecialtyType,Recommend
125,R05.2012.000820,004,001,Dec 27 2012 12:00AM,JLL / SSI,Design Level Evaluation,Jan 14 2013 12:00AM,200 S Biscayne Southeast Financial Center Engi...,Design Level Evaluation,NaT,Jones Lang LaSalle Incorporated,,
126,R05.2012.000820,004,002,Dec 27 2012 12:00AM,JLL / SSI,Construction Documents,Jan 14 2013 12:00AM,200 S Biscayne Southeast Financial Center Engi...,Design Level Evaluation,NaT,Jones Lang LaSalle Incorporated,,
127,R05.2012.000820,004,003,Dec 27 2012 12:00AM,JLL / SSI,Bidding Services,Jan 14 2013 12:00AM,200 S Biscayne Southeast Financial Center Engi...,Design Level Evaluation,NaT,Jones Lang LaSalle Incorporated,,
128,R05.2012.000820,004,004,Dec 27 2012 12:00AM,JLL / SSI,Contract Administration Services,Jan 14 2013 12:00AM,200 S Biscayne Southeast Financial Center Engi...,Design Level Evaluation,NaT,Jones Lang LaSalle Incorporated,,
129,R05.2012.000820,004,005,Dec 27 2012 12:00AM,JLL / SSI,Fall Protection Load Testing,Jan 14 2013 12:00AM,200 S Biscayne Southeast Financial Center Engi...,Design Level Evaluation,NaT,Jones Lang LaSalle Incorporated,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
78783,R01.2014.008101,001,001,May 5 2014 12:00AM,JLL / SSI,Presentation,,JLL 2014 Engineering Operations Conference,Presentation,NaT,Jones Lang LaSalle Incorporated,,
78871,P08.2015.000340,PROP,PROP,Aug 5 2015 12:00AM,JLL / SSI,Proposal,,JLL 1515 Poydras St. Chiller No.2 Repair Evalu...,Proposal,NaT,Jones Lang LaSalle Incorporated,,
79143,R01.2018.009401,001,001,Nov 26 2018 12:00AM,JLL / SSI,3rd Floor Water Infiltration,Dec 7 2018 12:00AM,9755 Patuxent Woods Drive Engineering Consulti...,3rd Floor Water Infiltration,NaT,Jones Lang LaSalle Incorporated,,
80267,R01.2018.009311,001,001,Jul 9 2018 12:00AM,JLL / SSI,Fieldwork & Reporting,,1940 and 2000 Duke Street Roof Consulting,Fieldwork & Reporting,NaT,Jones Lang LaSalle Incorporated,,


In [83]:
# df_CL[df_CL['ClientID'] == 'CBRE']
# df_PR[df_PR['ClientID'] == '3TI']
## these values are earliest project not earliest R# so they don't match above, just a more general check
# df_PR['StartDate'][df_PR['ClientID'] == "JLL / SSI"].unique() # Oct 2 2001 12:00AM
# df_PR['StartDate'][df_PR['ClientID'] == "3TI"].value_counts() # Nov 14 2011 12:00AM
# df_PR['StartDate'][df_PR['ClientID'] == "CBRE"].unique() # May 24 2002 12:00AM
# that's way earlier, I should double check this one
# df_PR['StartDate'][df_PR['ClientID'] == "NPS"].unique() # Aug 9 2004 12:00AM'

array(['Apr  1 2010 12:00AM', 'Oct  1 2015 12:00AM',
       'Aug 16 2010 12:00AM', 'Sep 16 2019 12:00AM',
       'Nov  1 2019 12:00AM', 'Dec 31 2020 12:00AM', nan,
       'Dec 30 2004 12:00AM', 'Aug  9 2004 12:00AM',
       'Apr  1 2007 12:00AM', 'Dec  2 2009 12:00AM',
       'Oct  1 2007 12:00AM', 'Jan 25 2010 12:00AM',
       'Oct  6 2004 12:00AM', 'Jan  3 2005 12:00AM',
       'Oct  9 2006 12:00AM', 'Oct 12 2004 12:00AM',
       'Feb 24 2010 12:00AM', 'Sep  4 2020 12:00AM',
       'Aug  4 2020 12:00AM', 'Oct  1 2021 12:00AM',
       'Sep  1 2020 12:00AM', 'Mar 15 2020 12:00AM',
       'Oct  1 2020 12:00AM', 'Sep  1 2021 12:00AM',
       'Apr  6 2020 12:00AM', 'Sep  7 2020 12:00AM',
       'Sep 26 2014 12:00AM', 'Mar 20 2020 12:00AM',
       'Oct  1 2022 12:00AM', 'Sep  1 2019 12:00AM',
       'Oct  1 2023 12:00AM', 'Jun  1 2021 12:00AM',
       'Sep 15 2017 12:00AM', 'Aug 15 2022 12:00AM',
       'Jun  1 2014 12:00AM', 'Sep  1 2015 12:00AM',
       'Sep  1 2022 12:00AM', 'Jan  1 202

Looks like it passed the test, so let's do this for real.

array(['MTOOMEY1252935719500', 'MPORTALATIN1209569252051', 'HILTONMINN',
       ..., '5D8B5763E1154ECD8AE7F0D539B0DBA2',
       '0F61DB021B04413AB1C3C0648AA5F66D', 'EPARRENT1248969111483'],
      dtype=object)

In [None]:
client_list
df_CL['DateAcquired'] = None

for client in client_test_list:
    CL_rows = df_CL[df_CL['ClientID'] == client]
    # print(len(CL_rows))
    PR_slice = df_PR[df_PR['ClientID'] == client]
    client_projects = PR_slice['WBS1'].value_counts().index
    # print(client_projects)
    
    R_codes = [project for project in client_projects if "R" in project]
    P_codes = [project for project in client_projects if "P" in project]
    
    result_dict = {}
    for code in R_codes:
        # Filter DataFrame for the current R_code
        one_project = df_PR[df_PR['WBS1'] == code]

        # Find the minimum StartDate for the current group
        one_project['StartDate'] = one_project['StartDate'].astype(str)        
        min_start_date = one_project['StartDate'].min()
        # print(f"Client: {client}, Code: {code}:\n{min_start_date}")
        ## looks like that's working correctly now
                
        # Store the result in the dictionary
        result_dict[code] = min_start_date
        
        # Get the earliest R code date.
        if not result_dict:
            pass
        else:
            acquire_date = min(result_dict.items(), key=operator.itemgetter(1))
            # next steps?

        # set that value in df_CL as DateAcquired
        df_CL['DateAcquired'][df_CL['ClientID'] == client] = acquire_date[1]

In [41]:
df_PR['StartDate'][df_PR['WBS1'] == 'R01.2004.004058'].value_counts()

Oct 11 2004 12:00AM    5
Aug  4 2004 12:00AM    2
Sep  7 2004 12:00AM    2
Sep 17 2004 12:00AM    2
Name: StartDate, dtype: int64

In [34]:
df_PR[df_PR['ClientID'] == 'CBRE']['WBS1'].value_counts().index

Index(['R01.2003.003510', 'R05.2007.000289', 'R01.2001.002608',
       'R07.2005.000005', 'R06.2004.000022', 'P05.2008.000438',
       'R01.2005.004544', 'R05.2013.000850', 'R05.2008.000436',
       'R01.2005.004583',
       ...
       'P05.2008.000436', 'P01.2014.008180', 'P01.2009.006091',
       'P01.2009.006133', 'P01.2009.006159', 'P01.2009.006183',
       'P05.2006.000231', 'P05.2005.000109', 'P03.2011.000469',
       'R08.2015.000363'],
      dtype='object', length=179)

In [26]:
CL_rows

Unnamed: 0,ClientID,ClientName,SpecialtyType,Recommend,ClientCreated
33,3TI,"3T International, Inc.",FM,N,Oct 24 2011 12:51PM


In [None]:

client_projects = PR_slice['WBS1'].value_counts().index

R_codes = [project for project in client_projects if "R" in project]
P_codes = [project for project in client_projects if "P" in project]

result_dict = {}
for code in R_codes:

### Then, what labor happened before the acquisition date

#### Include OHD labor


In [None]:
### then, calculate at 

## CLTV
Client Lifetime Value
- Break down by customer segment, associate like with like
- Project profitability?  (Ask Angie)
- Look at distribution of project cycles - how many years between projects to estimate when life cycle "ends"
- Clients with at least one R number are "acquired"
- May want to break down P by phases?
- for any task what is the period between asks for money, based on billing dates.
- periods between bills in a given project?

### Client Lifetimes
- Calendar length of lifetime
- labor hours on client post-acquisition
 - further divided by P/R
- \# of projects during lifetime
- calculate per-client lifetime values, then get avg
- further subdivide by segment or similar

### ARPC
- Maybe profit per, not revenue
- also break out by segment / or type of work

In [38]:
# do I need to calculate gross revenue then get profit from that?
# or can I assess profit directly somehow?
## Go looking through AP/AR
## Also ask Angie
## Vision should have gross profit in some form

#### Revenue/Profit per Project, also

## Misc Metrics

### Conversions

### Churn
- look at this year by year

Linreg model - can we attribute any predictive power to details we know about a client in the early stages?

Segment, location, hours in acquisition, particular people putting in acquisition labor perhaps?

Month of the year in which we do the labor is likely also a factor.

##### Miscellaneous one-off code cells
Below: checking addresses of certain clients/projects to determine whether they were same client or no

In [5]:
df = pd.read_excel('Vision_Data/Vision Data Master Copy.xlsx', sheet_name="PR")

In [20]:
df.columns[40:50]

Index(['DefaultTaskType', 'VersionID', 'ContactID', 'CLBillingAddr',
       'LongName', 'Address1', 'Address2', 'Address3', 'City', 'State'],
      dtype='object')

In [33]:
# df['Address1'][df['ClientID'].str.contains('EPARRENT1178828594744', na=False)]
# df['Address1'][df['ClientID'].str.contains('830F1297FCE644CCA2088FB79F07BD25', na=False)]
# df['Address2'][df['ClientID'].str.contains('EPARRENT1178828594744', na=False)]
# df['Address2'][df['ClientID'].str.contains('830F1297FCE644CCA2088FB79F07BD25', na=False)]
# df['Address3'][df['ClientID'].str.contains('EPARRENT1178828594744', na=False)]
# df['Address3'][df['ClientID'].str.contains('830F1297FCE644CCA2088FB79F07BD25', na=False)]
# df['City'][df['ClientID'].str.contains('EPARRENT1178828594744', na=False)]
# df['City'][df['ClientID'].str.contains('830F1297FCE644CCA2088FB79F07BD25', na=False)]
# df['State'][df['ClientID'].str.contains('EPARRENT1178828594744', na=False)]
# df['State'][df['ClientID'].str.contains('830F1297FCE644CCA2088FB79F07BD25', na=False)]

56549    MD
56550    MD
56551    MD
56552    MD
59937    MD
62154    MD
62155    MD
65020    MD
68905    MD
68906    MD
72517    MD
Name: State, dtype: object

In [13]:
df1 = pd.read_csv('Vision_Data/SQL/Vision Contacts.csv')

  df1 = pd.read_csv('Vision_Data/SQL/Vision Contacts.csv')


In [14]:
df1.columns

Index(['ContactID', 'ClientID', 'CLAddress', 'Vendor', 'VEAddress', 'Type',
       'LastName', 'FirstName', 'MiddleName', 'Salutation', 'Suffix', 'Title',
       'Addressee', 'Address1', 'Address2', 'Address3', 'Address4', 'City',
       'State', 'ZIP', 'Country', 'Phone', 'Fax', 'Pager', 'CellPhone',
       'HomePhone', 'EMail', 'MailingAddress', 'Billing', 'PrimaryInd',
       'ContactStatus', 'PreferredName', 'CustomCurrencyCode', 'Source',
       'CreateUser', 'CreateDate', 'ModUser', 'ModDate', 'PhoneFormat',
       'FaxFormat', 'PagerFormat', 'CellPhoneFormat', 'HomePhoneFormat',
       'AjeraSync', 'TLInternalKey', 'TLSyncModDate'],
      dtype='object')

In [15]:
df1 = df[df['Vendor'].isna()]

In [16]:
df1.shape

(25392, 46)

In [17]:
len(df1['ClientID'].unique())

5739

In [18]:
df2

NameError: name 'df2' is not defined