<img width="8%" alt="Google Sheets.png" src="https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/.github/assets/logos/Google%20Sheets.png" style="border-radius: 15%">

# Google Sheets - Update leads companies

**Tags:** #googlesheets #gsheet #data #naas_drivers #growth #leads #companies #openai #linkedin

**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel/)

**Description:** This notebook updates your leads companies database, enriches it with data from LinkedIn and checks whether each company already exists in your CRM.

## Input

### Import libraries

In [None]:
from naas_drivers import gsheet, linkedin
import pandas as pd
import os
from datetime import date
import naas_data_product
import openai
import time
from googlesearch import search
import re

### Setup variables
**Inputs**
- `input_dir`: Input directory to retrieve file from.
- `file_interactions`: Name of the file storing the LinkedIn interactions to be retrieved.
- `file_leads`: Name of the file storing the leads to be retrieved.
- `openai_api_key`: OpenAI API Key.
- `li_at`: Cookie used to authenticate Members and API clients.
- `JSESSIONID`: Cookie used for Cross Site Request Forgery (CSRF) protection and URL signature validation.
- `spreadsheet_url`: Google Sheets spreadsheet URL.
- `ref_companies_name`: Google Sheets sheet name storing companies from your CRM.
- `leads_companies_name`: Google Sheets sheet name storing leads companies.
- `leads_profiles_name`: Google Sheets sheet name storing leads profiles.

**Outputs**
- `output_dir`: Output directory to save file to.
- `file_leads_companies`: Output file name to save as pickle.

In [None]:
# Inputs
# Working directory of the day: <OUTPUTS_PATH>/growth-engine/<YYYY-MM-DD>
input_dir = os.path.join(naas_data_product.OUTPUTS_PATH, "growth-engine", date.today().isoformat())
file_interactions = "linkedin_interactions"
file_leads = "leads"
# NOTE(review): `naas` is used below but is not imported in the visible import cell — confirm it is in scope.
openai_api_key = naas.secret.get("OPENAI_API_KEY")
# LinkedIn session cookies are read from the secret store only.
# Never hard-code real cookie values here: they grant access to the account
# and would be committed with the notebook.
li_at = naas.secret.get("LINKEDIN_LI_AT") or "YOUR_LINKEDIN_LI_AT" #example: AQFAzQN_PLPR4wAAAXc-FCKmgiMit5FLdY1af3-2
JSESSIONID = naas.secret.get("LINKEDIN_JSESSIONID") or "YOUR_LINKEDIN_JSESSIONID" #example: ajax:8379907400220387585
spreadsheet_url = naas.secret.get("ABI_SPREADSHEET") or "YOUR_GOOGLE_SPREADSHEET_URL"
ref_companies_name = "REF_COMPANIES"
leads_companies_name = "LEADS_COMPANIES"
leads_profiles_name = "LEADS"

# Outputs
# Same dated directory as the inputs: results are written next to the files they were built from.
output_dir = os.path.join(naas_data_product.OUTPUTS_PATH, "growth-engine", date.today().isoformat())
file_leads_companies = "leads_companies"

## Model

### Get existing leads companies

In [None]:
# Read the current leads companies sheet; any result that is not a DataFrame
# is treated as "no existing data".
leads_companies_sheet = gsheet.connect(spreadsheet_url).get(sheet_name=leads_companies_name)
df_leads_companies = (
    leads_companies_sheet
    if isinstance(leads_companies_sheet, pd.DataFrame)
    else pd.DataFrame()
)
print("Leads Companies:", len(df_leads_companies))
# df_leads_companies.head(3)

### Get profiles from interactions

In [None]:
def get_unique_profile(
    df_init
):
    """Aggregate raw interactions into one row per LinkedIn company page.

    Interactions are grouped per profile and their ``INTERACTION_SCORE`` is
    summed. The first row found for each ``PROFILE_URL`` provides the
    "last interaction" columns, and only company page URLs
    (``https://www.linkedin.com/company/...``) are kept.

    Args:
        df_init: Interactions DataFrame with the columns FIRSTNAME, LASTNAME,
            FULLNAME, OCCUPATION, PROFILE_URL, PUBLIC_ID, INTERACTION_SCORE,
            PUBLISHED_DATE, CONTENT_URL and CONTENT_TITLE.

    Returns:
        DataFrame with one row per company, sorted by ``INTERACTION_SCORE``
        (highest first) and, for equal scores, by most recent interaction.
        ``LAST_INTERACTION_DATE`` is formatted as ``"%a. %d %b."``.
    """
    # Groupby profile
    to_group = [
        "FIRSTNAME",
        "LASTNAME",
        "FULLNAME",
        "OCCUPATION",
        "PROFILE_URL",
        "PUBLIC_ID"
    ]
    df_profiles = (
        df_init.groupby(to_group, as_index=False)
        .agg({"INTERACTION_SCORE": "sum"})
        .drop_duplicates("PROFILE_URL")
    )

    # Add last interactions data: keep the first row seen for each profile.
    # NOTE(review): this is the *last* interaction only if df_init is ordered
    # most recent first — confirm against the upstream notebook.
    to_keep = [
        "PROFILE_URL",
        "PUBLISHED_DATE",
        "CONTENT_URL",
        "CONTENT_TITLE"
    ]
    df_last_interaction = df_init[to_keep].drop_duplicates(["PROFILE_URL"])

    # Merge dfs (PROFILE_URL is the only column both frames share)
    df = pd.merge(df_profiles, df_last_interaction, how="left", on="PROFILE_URL")
    to_rename = {
        "PUBLISHED_DATE": "LAST_INTERACTION_DATE",
        "CONTENT_URL": "LAST_CONTENT_URL_INTERACTION",
        "CONTENT_TITLE": "LAST_CONTENT_TITLE_INTERACTION"
    }
    df = df.rename(columns=to_rename)

    # The last 5 characters of the published date are dropped before parsing
    # (presumably a UTC offset such as "+0100" — TODO confirm).
    # The parsed timestamp is kept aside so the tie-break sort below is
    # chronological; sorting the formatted string would order by weekday name.
    last_interaction_ts = pd.to_datetime(df["LAST_INTERACTION_DATE"].str[:-5])
    df["LAST_INTERACTION_DATE"] = last_interaction_ts.dt.strftime("%a. %d %b.")
    df["_LAST_INTERACTION_TS"] = last_interaction_ts

    # Filter on company pages (dots escaped so they only match a literal ".")
    is_company = df["PROFILE_URL"].str.contains(
        r"https://www\.linkedin\.com/company/.+", na=False
    )
    df = df[is_company].sort_values(
        by=["INTERACTION_SCORE", "_LAST_INTERACTION_TS"],
        ascending=[False, False]
    )
    return df.drop(columns="_LAST_INTERACTION_TS").reset_index(drop=True)

# Load the interactions stored under `file_interactions` in `input_dir`
# (pload is defined outside this file).
df_interactions = pload(input_dir, file_interactions)
interactions_count = len(df_interactions)
print('- Interactions:', interactions_count)

# Reduce the interactions to one row per company page.
df_interactions_d = get_unique_profile(df_interactions)
companies_interactions_count = len(df_interactions_d)
print("- Companies interactions:", companies_interactions_count)
# df_interactions_d.head(3)

### Get leads

In [None]:
# Load the leads stored under `file_leads` in `input_dir`
# (pload is defined outside this file).
df_leads = pload(input_dir, file_leads)
leads_count = len(df_leads)
print("- Leads:", leads_count)
# df_leads.head(3)

### Create leads database

In [None]:
def create_db_leads(
    df_leads_companies,
    df_direct,
    df_indirect,
    output_dir
):
    """Build the companies database from interactions and the existing sheet.

    Companies come from three sources, concatenated in this order and
    de-duplicated on ``COMPANY`` (first occurrence wins): direct company
    interactions, indirect interactions aggregated from leads, and the rows
    already stored in the leads companies sheet. Metadata columns previously
    stored in the sheet are re-attached by company name; anything unknown is
    set to "TBD".

    ``INTERACTION_SCORE`` is ``DIRECT_INTERACTIONS * 5 + INDIRECT_INTERACTIONS``.

    Args:
        df_leads_companies: Existing leads companies DataFrame; may be empty
            (first run), in which case no metadata is re-attached.
        df_direct: Company interactions with the columns FULLNAME,
            PROFILE_URL and INTERACTION_SCORE.
        df_indirect: Leads with the columns COMPANY and INTERACTION_SCORE.
        output_dir: Directory passed to ``pdump`` (defined outside this
            file) to store the result as "db_companies".

    Returns:
        The companies DataFrame sorted by ``INTERACTION_SCORE`` (highest first).
    """
    # Metadata columns owned by the enrichment step; defined up-front so they
    # are available even when the sheet is empty.
    meta_columns = [
        "LINKEDIN_URL",
        "INDUSTRY",
        "CITY",
        "COUNTRY",
        "STAFF_RANGE",
        "STAFF_COUNT",
        "FOLLOWER_COUNT",
        "WEBSITE",
        "TAGLINE",
        "DESCRIPTION",
        "COMPANY_ID",
        "COMPANY_NAME",
        "COMPANY_URL",
        "CRM_COMPANY"
    ]
    ref_columns = ["COMPANY"] + meta_columns

    # Get meta data from leads companies (on a copy: the caller's frame must
    # not receive the "TBD" columns added here)
    if len(df_leads_companies) > 0:
        existing = df_leads_companies.copy()
        for c in meta_columns:
            if c not in existing.columns:
                existing[c] = "TBD"
        ref = existing[ref_columns]
        existing = existing.drop(meta_columns, axis=1)
    else:
        # Nothing stored yet: empty reference, every metadata value becomes "TBD"
        ref = pd.DataFrame(columns=ref_columns)
        existing = None

    # Get companies from direct interactions
    to_keep = [
        "FULLNAME",
        "PROFILE_URL",
        "INTERACTION_SCORE"
    ]
    to_rename = {
        "FULLNAME": "COMPANY",
        "PROFILE_URL": "LINKEDIN_URL",
        "INTERACTION_SCORE": "DIRECT_INTERACTIONS"
    }
    df_d = df_direct[to_keep].rename(columns=to_rename)

    # Get companies from indirect interactions
    df_i = (
        df_indirect.groupby(["COMPANY"], as_index=False)
        .agg({"INTERACTION_SCORE": "sum"})
        .rename(columns={"INTERACTION_SCORE": "INDIRECT_INTERACTIONS"})
    )

    # Concat dfs
    # NOTE(review): a company present in both df_d and df_i keeps only its
    # direct row after drop_duplicates, so its indirect score is counted as 0 —
    # confirm whether the two scores should be combined instead.
    fillna = {
        "LINKEDIN_URL": "TBD",
        "DIRECT_INTERACTIONS": 0,
        "INDIRECT_INTERACTIONS": 0
    }
    frames = [df_d, df_i]
    if existing is not None:
        frames.append(existing)
    df = pd.concat(frames).fillna(fillna).drop_duplicates("COMPANY")
    df["INTERACTION_SCORE"] = df["DIRECT_INTERACTIONS"] * 5 + df["INDIRECT_INTERACTIONS"]
    df = df.sort_values(by=["INTERACTION_SCORE"], ascending=[False])
    df = df[~df["COMPANY"].isin(["NA", "TBD", "None", "n/a", 'UNKNOWN'])].reset_index(drop=True)

    # Merge to get meta data (LINKEDIN_URL is taken from the stored reference)
    df = pd.merge(df.drop("LINKEDIN_URL", axis=1), ref, how="left", on="COMPANY").fillna("TBD")

    # Save database
    pdump(output_dir, df, "db_companies")
    return df.reset_index(drop=True)

# Build the companies database from the existing sheet, the company
# interactions (direct) and the leads (indirect).
db_companies = create_db_leads(
    df_leads_companies=df_leads_companies,
    df_direct=df_interactions_d,
    df_indirect=df_leads,
    output_dir=output_dir,
)
db_companies_count = len(db_companies)
print("- New database leads companies:", db_companies_count)
db_companies.head(3)

### Get companies from CRM

In [None]:
# Read the CRM companies sheet; any result that is not a DataFrame is
# treated as an empty CRM.
crm_companies_sheet = gsheet.connect(spreadsheet_url).get(sheet_name=ref_companies_name)
df_companies = (
    crm_companies_sheet
    if isinstance(crm_companies_sheet, pd.DataFrame)
    else pd.DataFrame()
)
print("CRM Companies:", len(df_companies))
# df_companies.head(3)

### Enrich leads companies with OpenAI and LinkedIn

In [None]:
# LinkedIn company page URL on any (or no) sub-domain. The match stops at the
# end of the company slug, so query strings, fragments and trailing paths such
# as "/about" are left out.
_LINKEDIN_COMPANY_URL = re.compile(
    r"https://(?:[\w-]+\.)*linkedin\.com/company/[^/?#\s]+"
)


def get_linkedin_url(company, companies_urls):
    """Return the LinkedIn company page URL for ``company``.

    The URL is taken from ``companies_urls`` when the company is already
    known. Otherwise Google is searched and the first result containing a
    LinkedIn company page URL is stored in ``companies_urls`` and persisted
    with ``pdump`` as "companies_urls".

    Relies on names defined outside this function: ``search``, ``pdump`` and
    the module-level ``output_dir``.

    Args:
        company: Company name to look up.
        companies_urls: Dict mapping company name to LinkedIn URL; updated
            in place when a new URL is found.

    Returns:
        The LinkedIn company URL, or "NA" when no search result matches.
    """
    # Known company: nothing changed, so there is nothing to persist
    if company in companies_urls:
        return companies_urls.get(company)

    # Create query (str() guards against non-string names such as NaN)
    query = f"{str(company).replace(' ', '+')}+LinkedIn+company"
    print("Google query: ", query)

    # Search in Google and keep the first LinkedIn company page found
    for result_url in search(query, tld="com", num=10, stop=10, pause=2):
        match = _LINKEDIN_COMPANY_URL.search(result_url)
        if match is None:
            continue
        linkedin_url = match.group(0)
        companies_urls[company] = linkedin_url
        pdump(output_dir, companies_urls, "companies_urls")
        # Short pause after a hit, on top of the per-request pause of `search`
        time.sleep(2)
        return linkedin_url
    return "NA"

def enrich_leads_companies(
    df_init,
    ref,
    output_dir,
):
    """Enrich the companies database with LinkedIn data and CRM matches.

    For each company row:

    1. When ``LINKEDIN_URL`` is "TBD", the company page is looked up with
       ``get_linkedin_url`` (Google search).
    2. When the URL contains "company", the interaction score is at least 3
       and ``COMPANY_ID`` is still "TBD", the company info is read from the
       pickle cached under ``output_dir`` or fetched from LinkedIn. At most
       30 such enrichments are attempted per call.
    3. When ``ref`` has rows and ``CRM_COMPANY`` is "TBD", the company name is
       matched against the CRM with ``find_crm_match``.

    Relies on names defined outside this function: ``pload``, ``pdump``,
    ``find_crm_match``, ``get_linkedin_url``, ``linkedin``, ``li_at`` and
    ``JSESSIONID``.

    Args:
        df_init: Companies DataFrame; must provide COMPANY and
            INTERACTION_SCORE. LINKEDIN_URL, CRM_COMPANY and COMPANY_ID are
            created with "TBD" when missing.
        ref: CRM companies DataFrame with a COMPANY column; may be empty, in
            which case CRM matching is skipped.
        output_dir: Directory used for the pickle caches.

    Returns:
        The enriched DataFrame with its columns in the sheet order (columns
        that do not exist are left out), index reset.
    """
    # Init: copy both frames so the caller's objects are never modified
    df = df_init.copy()
    ref = ref.copy()

    # Add LinkedIn URL column and get existing urls
    if "LINKEDIN_URL" not in df.columns:
        df["LINKEDIN_URL"] = "TBD"
        companies_urls = {}
    else:
        companies_urls = pload(output_dir, "companies_urls")
        if companies_urls is None:
            companies_urls = df[df["LINKEDIN_URL"] != "TBD"].set_index('COMPANY')['LINKEDIN_URL'].to_dict()
            pdump(output_dir, companies_urls, "companies_urls")

    # Add CRM company column and match if company name exists
    if "CRM_COMPANY" not in df.columns:
        df["CRM_COMPANY"] = "TBD"
        crm_companies = {}
    else:
        crm_companies = pload(output_dir, "crm_companies")
        if crm_companies is None:
            crm_companies = df[df["CRM_COMPANY"] != "TBD"].set_index('COMPANY')['CRM_COMPANY'].to_dict()
            pdump(output_dir, crm_companies, "crm_companies")

    # Add company ID column
    if "COMPANY_ID" not in df.columns:
        df["COMPANY_ID"] = "TBD"

    # Clean ref: only when the CRM has rows. A row-wise apply on an empty,
    # column-less frame returns a frame that cannot be assigned to a column.
    if len(ref) > 0:
        ref["COMPANY_CLEAN"] = ref.apply(lambda row: str(row["COMPANY"]).lower().split(",")[0].replace("inc", "").replace("l.l.c", ""), axis=1)

    # Columns copied from the LinkedIn company info onto the company row
    enriched_columns = [
        "COMPANY_ID",
        "COMPANY_NAME",
        "COMPANY_URL",
        "INDUSTRY",
        "STAFF_COUNT",
        "STAFF_RANGE",
        "FOLLOWER_COUNT",
        "COUNTRY",
        "CITY",
        "WEBSITE",
        "TAGLINE",
        "DESCRIPTION",
    ]

    # Loop on companies
    call_linkedin = 0
    limit_linkedin = 30
    for row in df.itertuples():
        index = row.Index
        company = row.COMPANY
        linkedin_url = row.LINKEDIN_URL
        interaction_score = row.INTERACTION_SCORE
        company_id = row.COMPANY_ID
        crm_company = row.CRM_COMPANY

        # Find the LinkedIn company URL with a Google search
        if linkedin_url == "TBD":
            print(f"🤖 Google Search - Finding LinkedIn URL for '{company}' ...")
            linkedin_url = get_linkedin_url(company, companies_urls)
            df.loc[index, "LINKEDIN_URL"] = linkedin_url
            print("- LinkedIn URL:", linkedin_url)
            print()

        if "company" in linkedin_url and interaction_score >= 3 and company_id == "TBD" and call_linkedin < limit_linkedin:
            print(f"🕸️ LinkedIn - Enrich data for '{company}' (interaction score: {interaction_score})")
            print(linkedin_url)
            # Use the slug right after "/company/" as cache key. Taking the
            # last path segment would give "" for a trailing slash (or "about"
            # for a sub-page), making different companies share a cache file.
            slug = re.search(r"/company/([^/?#]+)", linkedin_url)
            public_id = slug.group(1) if slug else linkedin_url.rstrip("/").split("/")[-1]
            tmp_df = pload(output_dir, f"{public_id}_linkedin_company")
            if tmp_df is None:
                try:
                    tmp_df = linkedin.connect(li_at, JSESSIONID).company.get_info(linkedin_url)
                    pdump(output_dir, tmp_df, f"{public_id}_linkedin_company")
                    time.sleep(2)
                except Exception as e:
                    # Best effort: a failed lookup marks the company as UNKNOWN below
                    print(e)
                    tmp_df = pd.DataFrame()

            if len(tmp_df) > 0:
                for column in enriched_columns:
                    df.loc[index, column] = tmp_df.loc[0, column]
            else:
                df.loc[index, "COMPANY_ID"] = "UNKNOWN"
            # Cached and failed lookups count towards the limit too
            call_linkedin += 1
            if call_linkedin >= limit_linkedin:
                print("🛑 Call LinkedIn reached:", limit_linkedin)
            else:
                print("- ⚠️ LinkedIn call:", call_linkedin)
            print()

        # Find companies in CRM
        if len(ref) > 0 and crm_company == "TBD":
            company = company.replace(".", "")
            print(f"💛 Sequence Matcher - Finding if '{company}' is in CRM ...")
            crm_company = find_crm_match(
                ref,
                "COMPANY_CLEAN",
                company
            )
            df.loc[index, "CRM_COMPANY"] = crm_company
            print("- CRM company:", crm_company)
            crm_companies[company] = crm_company
            pdump(output_dir, crm_companies, "crm_companies")
            print()

    # Normalise values; STAFF_RANGE is only touched when the column exists
    has_staff_range = "STAFF_RANGE" in df
    if has_staff_range:
        # An open-ended range ends with "-None": "10001-None" becomes "10001>"
        df["STAFF_RANGE"] = df["STAFF_RANGE"].str.replace("-None", ">", regex=False)
    df = df.replace("nan", "NA").replace("na", "NA").replace("None", "NA")
    if has_staff_range:
        staff_maping = {
            "NA": "NA",
            "UNKNOWN": "NA",
            "TBD": "NA",
            "0-1": "Solopreneur",
            "2-10": "Micro Team",
            "11-50": "Small Company",
            "51-200": "Medium Company",
            "201-500": "Large Company",
            "501-1000": "Enterprise Level",
            "1001-5000": "Major Corporation",
            "5001-10000": "Global Corporation",
            "10001>": "Mega Corporation",
        }
        df["STAFF_RANGE_NAME"] = df["STAFF_RANGE"].map(staff_maping)
    to_order = [
        'COMPANY',
        'INDUSTRY',
        'CITY',
        'COUNTRY',
        'STAFF_RANGE_NAME',
        'CRM_COMPANY',
        'INTERACTION_SCORE',
        'DIRECT_INTERACTIONS',
        'INDIRECT_INTERACTIONS',
        'STAFF_RANGE',
        'STAFF_COUNT',
        'FOLLOWER_COUNT',
        'WEBSITE',
        'TAGLINE',
        'DESCRIPTION',
        'COMPANY_ID',
        'COMPANY_NAME',
        'COMPANY_URL',
        'LINKEDIN_URL',
    ]
    # Keep the sheet order, skipping columns that were never created
    df = df[[column for column in to_order if column in df.columns]]
    return df.reset_index(drop=True)

# Enrich the companies database with LinkedIn data and CRM matches.
df_leads_companies_update = enrich_leads_companies(
    df_init=db_companies,
    ref=df_companies,
    output_dir=output_dir,
)
companies_count = len(df_leads_companies_update)
print("Companies:", companies_count)
df_leads_companies_update.head(5)

## Output

### Save data

In [None]:
# Store the enriched companies under the name `file_leads_companies` in `output_dir`
# (pdump is defined outside this file — presumably a pickle dump helper; confirm).
pdump(output_dir, df_leads_companies_update, file_leads_companies)

### Send "Companies" to spreadsheet

In [None]:
# Send the enriched companies to the leads companies sheet (append=False).
leads_companies_spreadsheet = gsheet.connect(spreadsheet_url)
leads_companies_spreadsheet.send(
    data=df_leads_companies_update,
    sheet_name=leads_companies_name,
    append=False,
)