<img width="8%" alt="Google Sheets.png" src="https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/.github/assets/logos/Google%20Sheets.png" style="border-radius: 15%">

# Google Sheets - Update leads

**Tags:** #googlesheets #gsheet #data #naas_drivers #growth #leads #openai #linkedin

**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel/)

**Description:** This notebook updates your leads database with new people who interacted with your content, enriches each lead with its ICP and company, and checks whether the lead already exists in your CRM.

## Input

### Import libraries

In [None]:
import naas_data_product
import naas
from naas_drivers import gsheet, linkedin
import pandas as pd
import os
from datetime import date
import openai
import time
import re

### Setup variables
**Inputs**
- `input_dir`: Input directory to retrieve file from.
- `file_interactions`: Name of the file to be retrieved.
- `openai_api_key`: OpenAI API Key.
- `li_at`: Cookie used to authenticate Members and API clients.
- `JSESSIONID`: Cookie used for Cross Site Request Forgery (CSRF) protection and URL signature validation.
- `prompt_icp`: Prompt to be used to categorize profile by ICP.
- `spreadsheet_url`: Google Sheets spreadsheet URL.
- `ref_contacts_name`: Google Sheets sheet name storing contacts from your CRM.
- `leads_profiles_name`: Google Sheets sheet name storing leads profiles.

**Outputs**
- `output_dir`: Output directory to save file to.
- `file_leads`: Output file name to save as pickle.

In [None]:
# Inputs
# Interactions are read from today's dated "growth-engine" folder under the data product outputs path.
input_dir = os.path.join(naas_data_product.OUTPUTS_PATH, "growth-engine", date.today().isoformat())
file_interactions = "linkedin_interactions"
openai_api_key = naas.secret.get("OPENAI_API_KEY")
# LinkedIn session cookies: the placeholder string is used when the secret lookup returns a falsy value.
li_at = naas.secret.get("LINKEDIN_LI_AT") or "YOUR_LINKEDIN_LI_AT" #example: AQFAzQN_PLPR4wAAAXc-FCKmgiMit5FLdY1af3-2
JSESSIONID = naas.secret.get("LINKEDIN_JSESSIONID") or "YOUR_LINKEDIN_JSESSIONID" #example: ajax:8379907400220387585
# ICP prompt is loaded from the "entity" outputs folder; the placeholder is used when nothing is loaded.
prompt_icp = pload(os.path.join(naas_data_product.OUTPUTS_PATH, "entity"), "prompt_icp") or "YOUR_PROMPT_ICP"
spreadsheet_url = naas.secret.get("ABI_SPREADSHEET") or "YOUR_GOOGLE_SPREADSHEET_URL"
# Sheet names inside the spreadsheet: CRM contacts (read only here) and leads (read, then overwritten at the end).
ref_contacts_name = "REF_CONTACTS"
leads_profiles_name = "LEADS"

# Outputs
# Same dated folder as input_dir: intermediate caches and the final leads file are written next to the inputs.
output_dir = os.path.join(naas_data_product.OUTPUTS_PATH, "growth-engine", date.today().isoformat())
file_leads = "leads"

## Model

### Get existing leads

In [None]:
# Read the current content of the leads sheet.
df_leads = gsheet.connect(spreadsheet_url).get(sheet_name=leads_profiles_name)
# Anything that is not a DataFrame is replaced by an empty DataFrame
# (presumably the driver returns something else when the sheet is empty or unreadable - TODO confirm).
if not isinstance(df_leads, pd.DataFrame):
    df_leads = pd.DataFrame()
print("- Existing Leads:", len(df_leads))
# df_leads.head(3)

### Get profiles from interactions

In [None]:
def get_unique_profile(
    df_init
):
    """Collapse raw interactions into one row per LinkedIn person profile.

    Args:
        df_init: Interactions DataFrame. Must contain the columns FIRSTNAME,
            LASTNAME, FULLNAME, OCCUPATION, PROFILE_URL, PUBLIC_ID,
            INTERACTION_SCORE, PUBLISHED_DATE, CONTENT_URL and CONTENT_TITLE.
            PUBLISHED_DATE must hold strings; their last five characters are
            dropped before parsing (presumably a UTC offset such as "+0200").

    Returns:
        A new DataFrame (df_init is not modified) with one row per profile
        whose PROFILE_URL matches "https://www.linkedin.com/in/...", holding
        the summed INTERACTION_SCORE and the LAST_* columns describing the
        first interaction row found for that profile in df_init. Rows are
        ordered by INTERACTION_SCORE (descending), then by the actual
        interaction date (most recent first). LAST_INTERACTION_DATE is
        formatted as "%a. %d %b.".
    """
    # Sum the interaction score per profile
    to_group = [
        "FIRSTNAME",
        "LASTNAME",
        "FULLNAME",
        "OCCUPATION",
        "PROFILE_URL",
        "PUBLIC_ID"
    ]
    df_profiles = (
        df_init.groupby(to_group, as_index=False)
        .agg({"INTERACTION_SCORE": "sum"})
        .drop_duplicates("PROFILE_URL")
    )

    # Keep the first interaction row seen for each profile.
    # NOTE(review): this is the *last* interaction only if df_init is ordered
    # most recent first - confirm against the producer of the file.
    to_keep = [
        "PROFILE_URL",
        "PUBLISHED_DATE",
        "CONTENT_URL",
        "CONTENT_TITLE"
    ]
    df_last_interaction = df_init[to_keep].drop_duplicates(["PROFILE_URL"])

    # Merge dfs (PROFILE_URL is the only column shared by both frames)
    df = pd.merge(df_profiles, df_last_interaction, how="left", on="PROFILE_URL")
    to_rename = {
        "PUBLISHED_DATE": "LAST_INTERACTION_DATE",
        "CONTENT_URL": "LAST_CONTENT_URL_INTERACTION",
        "CONTENT_TITLE": "LAST_CONTENT_TITLE_INTERACTION"
    }
    df = df.rename(columns=to_rename)

    # Parse the date first so that sorting is chronological; sorting the
    # formatted text would order rows alphabetically by weekday name.
    df["LAST_INTERACTION_DATE"] = pd.to_datetime(df["LAST_INTERACTION_DATE"].str[:-5])

    # Filter on people profiles (company pages and other URLs are dropped)
    is_person = df["PROFILE_URL"].str.contains(r"https://www\.linkedin\.com/in/.+", na=False)
    df = df[is_person]

    # Single sort on both keys (multi-column sorts are stable in pandas)
    df = df.sort_values(
        by=["INTERACTION_SCORE", "LAST_INTERACTION_DATE"],
        ascending=[False, False],
    ).reset_index(drop=True)

    # Format the date for display only once ordering is settled
    df["LAST_INTERACTION_DATE"] = df["LAST_INTERACTION_DATE"].dt.strftime("%a. %d %b.")
    return df

# Load today's interactions file from the input directory.
# NOTE(review): pload is checked against None elsewhere in this notebook; if it returns None here, len() below raises.
df_interactions = pload(input_dir, file_interactions)
print('- Interactions:', len(df_interactions))

# One row per person profile, with summed score and last interaction details.
df_interactions_d = get_unique_profile(df_interactions)
print("- Profiles interactions:", len(df_interactions_d))
# df_interactions_d.head(3)

### Create leads database

In [None]:
def get_interactions_by_profile(
    df_init,
    contacts
):
    """Build a human-readable summary of interactions for each profile.

    Args:
        df_init: Interactions DataFrame with the columns PROFILE_URL,
            CONTENT_TITLE, CONTENT_URL, INTERACTION and INTERACTION_CONTENT.
        contacts: Iterable of profile URLs to summarise.

    Returns:
        dict mapping every entry of `contacts` to a string listing its
        reactions ("POST_REACTION") and comments ("POST_COMMENT"), separated
        by ", " in the order they appear in df_init. Profiles without any
        such interaction map to an empty string. Other interaction types are
        ignored.
    """
    # Cleaning
    to_select = [
        "PROFILE_URL",
        "CONTENT_TITLE",
        "CONTENT_URL",
        "INTERACTION",
        "INTERACTION_CONTENT"
    ]
    df = df_init[to_select].reset_index(drop=True)

    # Missing values become empty strings so they never show up as "nan"
    content = df["INTERACTION_CONTENT"].fillna("").astype(str)
    title = df["CONTENT_TITLE"].fillna("").astype(str).str.strip()
    url = df["CONTENT_URL"].fillna("").astype(str)

    reaction_text = "Sent '" + content.str.lower() + "' reaction to '" + title + "' (" + url + ")"
    comment_text = "Comment '" + content.str.capitalize() + "' on '" + title + "' (" + url + ")"

    is_reaction = df["INTERACTION"] == "POST_REACTION"
    is_comment = df["INTERACTION"] == "POST_COMMENT"
    df["INTERACTION_TEXT"] = ""
    df.loc[is_reaction, "INTERACTION_TEXT"] = reaction_text[is_reaction]
    df.loc[is_comment, "INTERACTION_TEXT"] = comment_text[is_comment]

    # Group the texts by profile in a single pass instead of filtering the
    # whole frame once per contact.
    texts_by_profile = {}
    for profile_url, text in zip(df["PROFILE_URL"], df["INTERACTION_TEXT"]):
        if text:
            texts_by_profile.setdefault(profile_url, []).append(text)

    # join() avoids the dangling ", " that manual concatenation leaves behind
    return {
        contact: ", ".join(texts_by_profile.get(contact, []))
        for contact in contacts
    }

# Compiled once at import time instead of on every call.
# Only symbol/pictograph ranges are listed: letters of any script (CJK, Hangul,
# kana, ...) are deliberately left untouched.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F000-\U0001FAFF"  # emoji & pictograph blocks (emoticons, transport, flags, ...)
    "\U000E0020-\U000E007F"  # tag characters used by subdivision flags
    "\u2500-\u2BFF"  # box drawing, geometric shapes, misc symbols, dingbats, arrows
    "\u231A-\u231B"  # watch, hourglass
    "\u2328"  # keyboard
    "\u23CF"  # eject symbol
    "\u23E9-\u23FA"  # media control symbols
    "\u24C2"  # circled M
    "\u200D"  # zero width joiner (glues emoji sequences)
    "\u20E3"  # combining enclosing keycap
    "\uFE0F"  # variation selector-16 (emoji presentation)
    "\u3030\u303D\u3297\u3299"  # wavy dash, part alternation mark, circled ideographs
    "]+"
)


def remove_emojis(text):
    """Strip emojis and pictographic symbols from a string.

    Args:
        text: The string to clean. Non-string values (e.g. None or NaN) are
            returned unchanged.

    Returns:
        `text` without emoji characters and with leading/trailing whitespace
        removed. Whitespace left between words by a removed emoji is kept.
    """
    if not isinstance(text, str):
        return text
    return _EMOJI_PATTERN.sub("", text).strip()

def create_db_leads(
    df_leads,
    df_direct,
    output_dir,
):
    """Merge freshly computed profiles with the existing leads database.

    Args:
        df_leads: Existing leads (may be an empty DataFrame on the first run).
            It is not modified.
        df_direct: One row per profile computed from the latest interactions.
            Takes precedence over df_leads when a PROFILE_URL is in both.
        output_dir: Directory passed to pdump to persist the result as
            "db_leads".

    Returns:
        DataFrame with one row per PROFILE_URL. The NOTES, COMPANY, ICP and
        CRM_CONTACT values of existing leads are carried over, missing values
        are filled with "TBD", names and occupations are stripped of emojis
        and NOTES is rebuilt from the module-level `df_interactions`.
    """
    meta_columns = [
        "NOTES",
        "COMPANY",
        "ICP",
        "CRM_CONTACT",
    ]
    columns = ["PROFILE_URL"] + meta_columns

    # Get meta data from leads (on a copy, so the caller's frame is untouched)
    df_existing = df_leads.copy()
    if len(df_existing) > 0:
        for c in meta_columns:
            if c not in df_existing.columns:
                df_existing[c] = "TBD"
        # One meta row per profile, otherwise the merge below multiplies rows
        ref = df_existing[columns].drop_duplicates("PROFILE_URL")
    else:
        # First run: no existing lead, so every meta value ends up as "TBD"
        ref = pd.DataFrame(columns=columns)

    # Concat with leads init: df_direct comes first so its rows win on duplicates
    df = pd.concat([df_direct, df_existing]).drop_duplicates("PROFILE_URL")

    # Merge to get meta data (the meta columns may be absent when df_leads is empty)
    df = df.drop(columns=meta_columns, errors="ignore")
    df = pd.merge(df, ref, how="left", on="PROFILE_URL").fillna("TBD")

    # Cleaning: Remove emojis from name and occupation
    for column in ("FIRSTNAME", "LASTNAME", "OCCUPATION"):
        df[column] = df[column].astype(str).map(remove_emojis)
    df["FULLNAME"] = df["FIRSTNAME"] + " " + df["LASTNAME"]

    # Create notes from interactions
    # NOTE(review): this overwrites the NOTES carried over from df_leads, and
    # reads the module-level df_interactions rather than a parameter.
    leads = df["PROFILE_URL"].unique()
    df["NOTES"] = df["PROFILE_URL"].map(get_interactions_by_profile(df_interactions, leads))

    # Save database
    pdump(output_dir, df, "db_leads")
    return df.reset_index(drop=True)
    
# Combine existing leads with the profiles computed from the latest interactions.
db_leads = create_db_leads(df_leads, df_interactions_d, output_dir)
print("- New database leads:", len(db_leads))
db_leads.head(3)

### Get contacts from CRM

In [None]:
# Read the CRM contacts reference sheet.
df_contacts = gsheet.connect(spreadsheet_url).get(sheet_name=ref_contacts_name)
# Anything that is not a DataFrame is replaced by an empty DataFrame, which disables CRM matching downstream.
if not isinstance(df_contacts, pd.DataFrame):
    df_contacts = pd.DataFrame()
print("CRM Contacts:", len(df_contacts))
# df_contacts.head(3)

### Enrich leads with OpenAI, LinkedIn and CRM matching

In [None]:
def get_icp(profile_url, occupation, icps):
    """Return the ICP of a profile, querying OpenAI only when it is not cached.

    Relies on the module-level `openai_api_key`, `prompt_icp` and
    `output_dir`, and on `create_chat_completion` / `pdump` being defined
    elsewhere.

    Args:
        profile_url: Profile URL used as cache key.
        occupation: Occupation text sent along with the ICP prompt.
        icps: dict {profile_url: icp} used as cache; updated in place.

    Returns:
        The cached ICP, or the newly generated one with single and double
        quotes removed.
    """
    cached = icps.get(profile_url)
    # "TBD" is the placeholder for "not computed yet": treat it as a miss,
    # consistently with get_company.
    if cached is not None and cached != "TBD":
        return cached

    icp = create_chat_completion(openai_api_key, prompt_icp, occupation).replace("'", "").replace('"', '')
    icps[profile_url] = icp
    # Persist only when the cache actually changed (not on every lookup)
    pdump(output_dir, icps, "icps")
    return icp

def get_company(profile_url, occupation, companies):
    """Return the company of a profile, querying OpenAI only when it is not cached.

    Relies on the module-level `openai_api_key` and `output_dir`, and on
    `create_chat_completion` / `pdump` being defined elsewhere.

    Args:
        profile_url: Profile URL used as cache key.
        occupation: Occupation text the company is extracted from.
        companies: dict {profile_url: company} used as cache; updated in place.

    Returns:
        The cached company, or the newly extracted one with single and double
        quotes removed. The prompt asks the model to answer "NA" when no
        company can be found.
    """
    prompt_company = """
    I will give you the occupation from a profile I get from LinkedIn, you will return the company you can extract from by checking the word after 'at' or '@'.
    If you don't find it return "NA"
    Don't put the results into quotes.
    """
    cached = companies.get(profile_url)
    # "TBD" is the placeholder for "not computed yet": treat it as a miss
    if cached is not None and cached != "TBD":
        return cached

    company = create_chat_completion(openai_api_key, prompt_company, occupation).replace("'", "").replace('"', '')
    companies[profile_url] = company
    # Persist only when the cache actually changed (not on every lookup)
    pdump(output_dir, companies, "companies")
    return company

def _load_enrichment_cache(df, column, cache_name, output_dir):
    """Make sure `column` exists in df and return its {PROFILE_URL: value} cache.

    When the column is missing it is created with "TBD" and an empty cache is
    returned. Otherwise the pickled cache is loaded; if none exists it is
    rebuilt from the rows whose value is not "TBD" and persisted.
    """
    if column not in df.columns:
        df[column] = "TBD"
        return {}
    cache = pload(output_dir, cache_name)
    if cache is None:
        cache = df[df[column] != "TBD"].set_index('PROFILE_URL')[column].to_dict()
        pdump(output_dir, cache, cache_name)
    return cache


def _normalize_company_name(company_name):
    """Return the stripped company name, mapping the placeholders "None"/"NA" to "UNKNOWN"."""
    cleaned = str(company_name).strip()
    # Exact match only: a substring replace would corrupt names such as "NASA"
    if cleaned in ("None", "NA"):
        return "UNKNOWN"
    return cleaned


def enrich_leads(
    df_init,
    ref,
    output_dir,
):
    """Enrich leads with ICP, company and CRM match.

    For each lead:
      1. when both ICP and COMPANY are "TBD", ask OpenAI for them;
      2. when the company is "NA" and the interaction score is at least 3,
         look the profile up on LinkedIn (at most 30 lookups per run);
      3. when CRM contacts are available and CRM_CONTACT is "TBD", look for a
         matching FULLNAME with `find_crm_match`.

    Relies on the module-level `input_dir`, `li_at` and `JSESSIONID`, and on
    `pload`, `pdump` and `find_crm_match` being defined elsewhere.

    Args:
        df_init: Leads DataFrame (not modified). Must contain the columns
            listed in the final ordering below; ICP, COMPANY and CRM_CONTACT
            are created with "TBD" when missing.
        ref: CRM contacts DataFrame; CRM matching is skipped when it is empty.
        output_dir: Directory where the enrichment caches are pickled.

    Returns:
        The enriched DataFrame restricted to the ordered output columns, with
        a fresh index.
    """
    # Init
    df = df_init.copy()

    # Add missing enrichment columns and get the existing caches
    icps = _load_enrichment_cache(df, "ICP", "icps", output_dir)
    companies = _load_enrichment_cache(df, "COMPANY", "companies", output_dir)
    # NOTE(review): the CRM cache is created and persisted for parity with the
    # other columns, but matches found below are never written back to it.
    _load_enrichment_cache(df, "CRM_CONTACT", "crm_contacts", output_dir)

    # Loop on profile
    call_linkedin = 0
    limit_linkedin = 30
    df_lk = pload(input_dir, "linkedin_top_card_profiles")
    if df_lk is None:
        df_lk = pd.DataFrame()

    for row in df.itertuples():
        index = row.Index
        fullname = row.FULLNAME
        occupation = row.OCCUPATION
        profile_url = row.PROFILE_URL
        icp = row.ICP
        company = row.COMPANY
        interaction_score = row.INTERACTION_SCORE
        crm_contact = row.CRM_CONTACT

        # Update ICP and Company from OpenAI
        # NOTE(review): a lead with only one of the two values still "TBD" is skipped here.
        if icp == "TBD" and company == "TBD":
            print(f"🤖 OpenAI - Finding ICP & Company for '{fullname}': {occupation} ...")
            print(profile_url)
            icp = get_icp(profile_url, occupation, icps)
            company = get_company(profile_url, occupation, companies)
            df.loc[index, "ICP"] = icp.strip()
            df.loc[index, "COMPANY"] = company.strip()
            print("- ICP:", icp)
            print("- Company:", company)
            print()

        # Update Company info
        # Scores may be text (e.g. the "TBD" filler): a non-numeric score
        # becomes NaN, which simply fails the threshold test.
        numeric_score = pd.to_numeric(interaction_score, errors="coerce")
        if company == "NA" and numeric_score >= 3 and call_linkedin < limit_linkedin:
            print(f"🕸️LinkedIn - Finding company for '{fullname}' (interaction score: {interaction_score}) ...")
            company_name = "UNKNOWN"

            # Get Top Card
            try:
                tmp_df = linkedin.connect(li_at, JSESSIONID).profile.get_top_card(profile_url)
                time.sleep(2)
            except Exception as e:
                # Best effort: a failed lookup is recorded and the loop goes on
                print(e)
                company_name = "ERROR_LINKEDIN_ENRICHMENT"
                tmp_df = pd.DataFrame()

            # Update company
            if len(tmp_df) > 0:
                df_lk = pd.concat([df_lk, tmp_df])
                pdump(input_dir, df_lk, "linkedin_top_card_profiles")
                if "COMPANY_NAME" in tmp_df.columns:
                    # Positional access: does not depend on the index labels
                    company_name = tmp_df["COMPANY_NAME"].iloc[0]

            print("- Company:", company_name)
            df.loc[index, "COMPANY"] = _normalize_company_name(company_name)
            call_linkedin += 1
            if call_linkedin >= limit_linkedin:
                print("🛑 Call LinkedIn reached:", limit_linkedin)
            else:
                print("- ⚠️ LinkedIn call:", call_linkedin)
            print()

        # Find profile in CRM
        if len(ref) > 0 and crm_contact == "TBD":
            print(f"💛 Sequence Matcher - Finding if '{fullname}' is in CRM ...")
            crm_contact = find_crm_match(
                ref,
                "FULLNAME",
                fullname
            )
            df.loc[index, "CRM_CONTACT"] = crm_contact
            print("- CRM contact:", crm_contact)
            print()

    # Cleaning
    to_order = [
        "FULLNAME",
        "OCCUPATION",
        "COMPANY",
        "ICP",
        "CRM_CONTACT",
        "INTERACTION_SCORE",
        "NOTES",
        "LAST_INTERACTION_DATE",
        "LAST_CONTENT_TITLE_INTERACTION",
        "LAST_CONTENT_URL_INTERACTION",
        "FIRSTNAME",
        "LASTNAME",
        "PROFILE_URL",
        "PUBLIC_ID",
    ]
    df = df[to_order]
    return df.reset_index(drop=True)
    
# Enrich the leads database using OpenAI, LinkedIn and the CRM contacts sheet.
df_leads_update = enrich_leads(
    db_leads,
    df_contacts,
    output_dir,
)
print("- Leads enriched:", len(df_leads_update))
df_leads_update.head(5)

## Output

### Save data

In [None]:
# Persist the enriched leads in the output directory under the name given by file_leads.
pdump(output_dir, df_leads_update, file_leads)

### Send "Leads" to spreadsheet

In [None]:
# Send the enriched leads to the leads sheet; append=False is passed, so rows are not appended to existing content.
gsheet.connect(spreadsheet_url).send(data=df_leads_update, sheet_name=leads_profiles_name, append=False)