<img width="8%" alt="Google Sheets.png" src="https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/.github/assets/logos/Google%20Sheets.png" style="border-radius: 15%">

# Google Sheets - Update Referentials

**Tags:** #googlesheets #gsheet #data #naas_drivers #operations #snippet

**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel/)

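**Description:** This notebook updates the contact and company referentials (`REF_CONTACT` and `REF_COMPANY`) stored in a Google Sheets spreadsheet, using the profiles found in LinkedIn interactions.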

## Input

### Import libraries

In [None]:
from naas_drivers import gsheet
import pandas as pd
import os
from datetime import date
import naas_data_product
try:
    import openai
except:
    !pip install openai --user
    import openai
import time
try:
    from googlesearch import search
except:
    !pip install google
    from googlesearch import search
import re

### Setup variables
**Inputs**
- `input_dir`: Input directory to retrieve the interactions file from.
- `file_name`: Name of the file with the LinkedIn interactions to be retrieved.
- `openai_api_key`: OpenAI API key, read from the `OPENAI_API_KEY` secret.

**Outputs**
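- `spreadsheet_url`: Google Sheets spreadsheet URL.
- `ref_contact_name`: Name of the sheet storing the contact referential.
- `ref_company_name`: Name of the sheet storing the company referential.
- `append`: If False, the existing data in the sheet is overwritten instead of being appended to.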

In [None]:
# Inputs: today's "growth-engine" output folder, the interactions file name and the OpenAI key.
input_dir = os.path.join(
    naas_data_product.OUTPUTS_PATH,
    "growth-engine",
    date.today().isoformat(),
)
file_name = "linkedin_interactions"
# NOTE(review): `naas` is not imported in the visible import cell — confirm the environment provides it.
openai_api_key = naas.secret.get("OPENAI_API_KEY")

# Outputs: target spreadsheet (secret first, placeholder otherwise) and referential sheet names.
spreadsheet_url = naas.secret.get("ABI_SPREADSHEET") or "YOUR_GOOGLE_SPREADSHEET_URL"
ref_contact_name = "REF_CONTACT"
ref_company_name = "REF_COMPANY"
append = False

## Model

### Get BDD interactions

In [None]:
# Load the interactions saved under `input_dir`.
# NOTE(review): `pload` is not defined in the visible cells — confirm where it comes from.
df_interactions = pload(input_dir, file_name)
print('🗂️ Interactions:', len(df_interactions))
df_interactions.head(1)

### Get data "REF_CONTACT" from Google Sheets spreadsheet

In [None]:
# Read the current contact referential from the spreadsheet.
contact_source_sheet = gsheet.connect(spreadsheet_url)
ref_contact = contact_source_sheet.get(sheet_name=ref_contact_name)
print("Ref. contact:", len(ref_contact))
ref_contact.head(5)

### Get data "REF_COMPANY" from Google Sheets spreadsheet

In [None]:
# Read the current company referential from the spreadsheet.
company_source_sheet = gsheet.connect(spreadsheet_url)
ref_company = company_source_sheet.get(sheet_name=ref_company_name)
print("Ref. company:", len(ref_company))
ref_company.head(1)

### Get interactions unique profiles

In [None]:
# One row per profile (name, occupation, URL) with the interaction scores summed.
to_group = ["FULLNAME", "OCCUPATION", "PROFILE_URL"]
to_agg = {"INTERACTION_SCORE": "sum"}
df_profiles = (
    df_interactions
    .copy()
    .groupby(to_group, as_index=False)
    .agg(to_agg)
)
print("Profiles:", len(df_profiles))
df_profiles.head(1)

### Update Ref Contact

In [None]:
def create_chat_completion(
    openai_api_key,
    prompt,
    message
):
    """Ask the "gpt-4" chat model a single question and return its answer.

    Works with both generations of the `openai` package: the client-based API
    (openai >= 1.0, where `openai.ChatCompletion` no longer exists) and the
    legacy module-level `openai.ChatCompletion` API.

    Args:
        openai_api_key: API key used to authenticate against OpenAI.
        prompt: Content of the "system" message.
        message: Content of the "user" message.

    Returns:
        The text of the first choice with every single quote (') removed.
        An empty string is returned when the model sends no content.
    """
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": message},
    ]

    if hasattr(openai, "OpenAI"):
        # openai >= 1.0: requests go through a client object
        client = openai.OpenAI(api_key=openai_api_key)
        response = client.chat.completions.create(
            model="gpt-4",
            temperature=0,
            messages=messages,
        )
        content = response.choices[0].message.content
    else:
        # Legacy openai < 1.0: module-level key and ChatCompletion endpoint
        openai.api_key = openai_api_key
        response = openai.ChatCompletion.create(
            model="gpt-4",
            temperature=0,
            messages=messages,
        )
        content = response['choices'][0]['message']['content']

    # Quotes are stripped as in the original implementation
    return (content or "").replace("'", "")

In [None]:
# System prompt used to extract the current employer from a LinkedIn occupation.
# The model is told to answer "TBD" when no company can be found.
prompt_company = """
I will give you the occupation from a profile I get from LinkedIn, you will return the current company he is working for.
If you don't find it in the occupation return "TBD"
Don't put the results into quotes.
"""

# System prompt used to classify a LinkedIn occupation against the ideal customer profiles.
# The wording (including spelling) is kept as is: it is runtime text sent to the model.
prompt_icp = """
I have 2 ideal customer profile, one is a 'data producer' with basic knowledge of Python that could use our Notebook templates to create plugins. 
These plugions are then distributed data via our NaasAI Chat interface.
The other one is a 'data consummer' that will enjoy using NaasAI Chat for its basic LLMs integration but also interested in having its own data available, hence work with the data producer. 
I will give you the occupation from a profile I get from LinkedIn, you will return stricly and only one of the following values inside the simple quotes based on the best match 'DataProducer', 'DataConsummer', 'NotICP' or 'Don't know' if you don't find a plausible match with the first 3 values.
Don't put the results into quotes.
"""

def update_ref_contact(
    df_init,
    ref_contact,
    input_dir
):
    """Add new LinkedIn contacts to the contact referential and enrich them.

    Profiles of `df_init` whose PROFILE_URL is a personal LinkedIn URL
    ("https://www.linkedin.com/in/...") are appended to `ref_contact`. When a
    PROFILE_URL is present in both, the row of `ref_contact` is kept. Every
    contact whose ICP and/or COMPANY is still "UNKNOWN" is then sent to
    `create_chat_completion` to fill in the missing value(s). The frame is
    saved with `pdump` under the name "contacts" after each enriched contact.

    Args:
        df_init: DataFrame with FULLNAME, OCCUPATION, PROFILE_URL and
            INTERACTION_SCORE columns.
        ref_contact: DataFrame holding the existing contact referential.
        input_dir: Directory given to `pdump` for the intermediate saves.

    Returns:
        DataFrame of contacts with a fresh 0..n-1 index.
    """
    unknown = "UNKNOWN"

    # Init
    df = df_init.copy()

    # Filter on contact (na=False: a missing URL is simply not a contact)
    df = df[df["PROFILE_URL"].str.contains("https://www.linkedin.com/in/.+", na=False)]
    df = df.sort_values(by=["INTERACTION_SCORE"], ascending=[False]).reset_index(drop=True)

    # Referential rows come first so they win on duplicated URLs. The index is
    # rebuilt because both frames are numbered from 0: with duplicated labels
    # `df.loc[index, ...]` below would write into several rows at once.
    df = pd.concat([ref_contact, df]).drop_duplicates("PROFILE_URL").reset_index(drop=True)

    # New rows have no ICP/COMPANY yet: they are NaN after the concat when the
    # referential already owns these columns, so flag them as "UNKNOWN" too.
    for column in ("ICP", "COMPANY"):
        if column not in df.columns:
            df[column] = unknown
        else:
            df[column] = df[column].fillna(unknown)

    # Find ICP and Company
    for row in df.itertuples():
        index = row.Index
        occupation = row.OCCUPATION
        needs_icp = row.ICP == unknown
        needs_company = row.COMPANY == unknown
        if not (needs_icp or needs_company):
            continue
        print(f"{index} - Starting with '{row.FULLNAME}': {occupation}")

        # Only the missing value(s) are requested; known values are left untouched
        if needs_icp:
            icp = create_chat_completion(openai_api_key, prompt_icp, occupation)
            print("ICP:", icp)
            df.loc[index, "ICP"] = icp
            time.sleep(2)  # pause between API calls
        if needs_company:
            company = create_chat_completion(openai_api_key, prompt_company, occupation)
            print("Current company:", company)
            df.loc[index, "COMPANY"] = company
            time.sleep(2)  # pause between API calls

        # Save after each contact so progress is kept if the loop is interrupted
        pdump(input_dir, df, "contacts")
    return df.reset_index(drop=True)
    
# Build the updated contact referential from the aggregated profiles.
df_contact = update_ref_contact(df_init=df_profiles, ref_contact=ref_contact, input_dir=input_dir)
print("Contacts:", len(df_contact))
df_contact.head(5)

In [None]:
def get_linkedin_url(company):
    """Look up the LinkedIn company page of `company` through a Google search.

    The first search result containing a LinkedIn company URL wins; anything
    from the first "?" onwards (query string) is left out of the returned URL.

    Args:
        company: Company name to search for.

    Returns:
        The LinkedIn company URL found, or "UNKNOWN" when none of the search
        results matches.
    """
    # Init linkedinbio
    linkedinbio = "UNKNOWN"

    # Create query
    query = f"{str(company).replace(' ', '+')}+LinkedIn+company"
    print("Google query: ", query)

    # Raw string: "\/" in a regular literal is an invalid escape sequence, and
    # the dots of the domain are escaped so they only match a literal ".".
    pattern = re.compile(r"https://.+\.linkedin\.com/company/.[^?]+")

    # Search in Google
    for url in search(query, tld="com", num=10, stop=10, pause=2):
        result = pattern.search(url)

        # Return value if result is not None
        if result is not None:
            linkedinbio = result.group(0).replace(" ", "")
            time.sleep(2)
            return linkedinbio
    return linkedinbio

def update_ref_company(
    df_init,
    df_contact,
    ref_company,
    input_dir
):
    """Add new companies to the company referential and find their LinkedIn URL.

    Companies come from two sources:
    - direct interactions: rows of `df_init` whose PROFILE_URL is a LinkedIn
      company page ("https://www.linkedin.com/company/...");
    - indirect interactions: INTERACTION_SCORE of `df_contact` summed by COMPANY.

    Both sources are combined per company name before being appended to
    `ref_company`, so a company found in both keeps its direct and its
    indirect score. When a COMPANY_NAME is present in `ref_company` too, the
    row of `ref_company` is kept. Companies named "TBD" are dropped. Companies
    whose LINKEDIN_URL is "TBD" are looked up with `get_linkedin_url`, and the
    frame is saved with `pdump` under the name "companies" after each lookup.

    Args:
        df_init: DataFrame with FULLNAME, PROFILE_URL and INTERACTION_SCORE columns.
        df_contact: DataFrame of contacts with COMPANY and INTERACTION_SCORE columns.
        ref_company: DataFrame holding the existing company referential.
        input_dir: Directory given to `pdump` for the intermediate saves.

    Returns:
        DataFrame of companies sorted by descending INTERACTION_SCORE, with a
        fresh 0..n-1 index.
    """
    # Get companies from direct interactions (na=False: a missing URL is not a company page)
    is_company_page = df_init["PROFILE_URL"].str.contains(
        "https://www.linkedin.com/company/.+", na=False
    )
    to_keep = [
        "FULLNAME",
        "PROFILE_URL",
        "INTERACTION_SCORE"
    ]
    to_rename = {
        "FULLNAME": "COMPANY_NAME",
        "PROFILE_URL": "LINKEDIN_URL",
        "INTERACTION_SCORE": "DIRECT_INTERACTIONS"
    }
    df_company_d = df_init.loc[is_company_page, to_keep].rename(columns=to_rename)

    # Get companies from indirect interactions
    to_group = [
        "COMPANY",
    ]
    to_agg = {
        "INTERACTION_SCORE": "sum"
    }
    to_rename = {
        "COMPANY": "COMPANY_NAME",
        "INTERACTION_SCORE": "INDIRECT_INTERACTIONS"
    }
    df_company_i = (
        df_contact
        .groupby(to_group, as_index=False)
        .agg(to_agg)
        .rename(columns=to_rename)
    )

    # Put direct and indirect scores of a same company on a single row. Stacking
    # the two frames and dropping duplicates would discard the indirect score
    # of every company that also interacted directly.
    df_company_new = df_company_d.merge(df_company_i, on="COMPANY_NAME", how="outer")

    # Concat (referential rows first so they win on duplicated names)
    fillna = {
        "LINKEDIN_URL": "TBD",
        "DIRECT_INTERACTIONS": 0,
        "INDIRECT_INTERACTIONS": 0
    }
    df_company = (
        pd.concat([ref_company, df_company_new])
        .drop_duplicates("COMPANY_NAME")
        .fillna(fillna)
    )
    df_company["INTERACTION_SCORE"] = df_company["DIRECT_INTERACTIONS"] + df_company["INDIRECT_INTERACTIONS"]
    df_company = df_company.sort_values(by=["INTERACTION_SCORE"], ascending=[False])
    # "TBD" is the value asked from the model when no company is found in an occupation
    df_company = df_company[df_company["COMPANY_NAME"] != "TBD"].reset_index(drop=True)

    # Find the missing LinkedIn URLs
    for row in df_company.itertuples():
        index = row.Index
        company_name = row.COMPANY_NAME
        linkedin_url = row.LINKEDIN_URL
        if linkedin_url == "TBD":
            print(f"{index} - Starting with '{company_name}':")
            linkedin_url = get_linkedin_url(company_name)
            print("LinkedIn URL:", linkedin_url)
            df_company.loc[index, "LINKEDIN_URL"] = linkedin_url
            # Save after each lookup so progress is kept if the loop is interrupted
            pdump(input_dir, df_company, "companies")
    return df_company.reset_index(drop=True)

# Build the updated company referential from the profiles and the enriched contacts.
df_company = update_ref_company(
    df_init=df_profiles,
    df_contact=df_contact,
    ref_company=ref_company,
    input_dir=input_dir,
)
print("Companies:", len(df_company))
df_company.head(5)

## Output

### Send "Contacts" to spreadsheet

In [None]:
# The sheet is replaced rather than appended to: df_contact already contains
# the rows that were read from this sheet.
contact_target_sheet = gsheet.connect(spreadsheet_url)
contact_target_sheet.send(
    data=df_contact,
    sheet_name=ref_contact_name,
    append=False,
)

### Send "Companies" to spreadsheet

In [None]:
# The sheet is replaced rather than appended to: df_company already contains
# the rows that were read from this sheet.
company_target_sheet = gsheet.connect(spreadsheet_url)
company_target_sheet.send(
    data=df_company,
    sheet_name=ref_company_name,
    append=False,
)