<img width="8%" alt="Naas.png" src="https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/.github/assets/logos/Naas.png" style="border-radius: 15%">

# Pipeline

**Tags:** #naas #pipeline #jupyter #notebook #dataanalysis #workflow #streamline

**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel)

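**Description:** This notebook builds and runs a pipeline of notebooks (content, growth and sales engines) for each entity listed in your Google Sheets spreadsheet.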

## Input

### Import libraries

In [1]:
from naas.pipeline import (
    Pipeline,
    NotebookStep,
    End,
    ParallelStep,
)
import naas_data_product
import naas
from naas_drivers import gsheet, linkedin
import glob
import os
from unidecode import unidecode

✅ utils file '/home/ftp/abi/utils/data.ipynb' successfully loaded.
✅ utils file '/home/ftp/abi/utils/llm.ipynb' successfully loaded.
✅ utils file '/home/ftp/abi/utils/naas_chat_plugin.ipynb' successfully loaded.
✅ utils file '/home/ftp/abi/utils/naas_lab.ipynb' successfully loaded.


### Setup variables
**Inputs**
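- `abi_spreadsheet`: Google Sheets spreadsheet URL (read from the `ABI_SPREADSHEET` naas secret)
- `sheet_entity`: Name of the entity sheet that stores all your personal data
- `long_lived_token`: Naas API token (read from the `NAAS_API_TOKEN` naas secret)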

**Outputs**
- `datalake_dir`: Datalake directory

In [2]:
# Inputs
# The spreadsheet URL and the API token are both read from naas secrets,
# so neither value is hardcoded in the notebook.
long_lived_token = naas.secret.get("NAAS_API_TOKEN")
abi_spreadsheet = naas.secret.get("ABI_SPREADSHEET")
sheet_entity = "ENTITY"

# Outputs
# Root directory under which every entity's files are written.
datalake_dir = os.path.join("/", *("home", "ftp", "abi", "outputs"))

## Model

### Get data from Google Sheet spreadsheet

In [3]:
# Read the entity sheet; missing cells become the string "NA", which the
# pipeline loop later uses to detect credentials that were not provided.
entity_sheet = gsheet.connect(abi_spreadsheet)
df_gsheet = entity_sheet.get(sheet_name=sheet_entity).fillna("NA")
df_gsheet.head(1)

Unnamed: 0,ENTITY,LINKEDIN_URL,LINKEDIN_LI_AT,LINKEDIN_JSESSIONID,EMAILS,PROMPT_SALES_MESSAGINGS,ICP_SENIORITY,ICP_DEPARTMENT,ICP_ORG_STAFF_RANGE,ICP_ORG_INDUSTRY
0,Jérémy Ravenel,https://www.linkedin.com/in/jeremyravenel/,,,florent@naas.ai,Create 3 messaging options to engage conversat...,"Professional/Staff, Senior Professional/Staff,...","Human Resources (HR), Finance, Marketing, Sale...","Solopreneur, Micro Team, Small Company, Medium...",


### Run pipeline

In [7]:
def _mask_secret(value):
    """Return "NA" unchanged and hide any other value.

    Used when printing credentials so that the real LinkedIn cookies are never
    written to the notebook output (which is saved and shared with the file).
    """
    return value if value == "NA" else "********"


def _engine_step(step_name, engine_name, parameters):
    """Build the NotebookStep that runs `__pipeline__.ipynb` of one engine.

    Args:
        step_name: Display name of the step.
        engine_name: Engine folder name under `naas_data_product.MODELS_PATH`.
        parameters: Parameters passed to the notebook; copied so that the
            steps never share a single dict object.
    """
    return NotebookStep(
        name=step_name,
        notebook_path=os.path.join(naas_data_product.MODELS_PATH, engine_name, "__pipeline__.ipynb"),
        parameters=dict(parameters),
    )


# Run the content, growth and sales pipelines once per row of the entity sheet.
for row in df_gsheet.itertuples():
    index = row.Index
    entity_name = row.ENTITY
    emails = row.EMAILS
    linkedin_url = row.LINKEDIN_URL
    li_at = row.LINKEDIN_LI_AT
    JSESSIONID = row.LINKEDIN_JSESSIONID
    print("- Entity:", entity_name)
    print("- Emails:", emails)
    print("- LinkedIn URL:", linkedin_url)
    # Credentials are masked: printing them would store them in the cell output.
    print("- LinkedIn li_at:", _mask_secret(li_at))
    print("- LinkedIn JSESSIONID:", _mask_secret(JSESSIONID))
    # ASCII, lower-case, underscore-separated code used for the directory and secret names.
    entity_code = unidecode(entity_name.lower().replace(" ", "_"))
    entity_dir = os.path.join(datalake_dir, entity_code)
    print("- Directory:", entity_dir)
    print()

    # Save entity data: each value is pickled and the pickle is registered as a naas dependency.
    print("- Saving dependencies:")
    output_dir = os.path.join(datalake_dir, "entities", str(index))
    entity_data = {
        "abi_spreadsheet": abi_spreadsheet,
        "entity_name": entity_name,
        "emails": emails,
        "linkedin_url": linkedin_url,
        "entity_dir": entity_dir,
    }
    for file_name, file_value in entity_data.items():
        pdump(output_dir, file_value, file_name)
        naas.dependency.add(os.path.join(output_dir, f"{file_name}.pickle"))

    # Set timezone: it is derived from the region and country of the first profile,
    # and only computed when no timezone has been saved yet.
    if index == 0:
        timezone = pload(output_dir, "timezone")
        if timezone is None:
            linkedin_dir = os.path.join(datalake_dir, "datalake", "linkedin", "profiles")
            df = get_linkedin_data(linkedin_url, linkedin_dir, "identity")
            region = df.loc[0, "REGION"]
            country = df.loc[0, "COUNTRY"]
            print("- Region:", region)
            print("- Country:", country)
            # NOTE(review): the completion result is passed to naas as-is;
            # confirm it is always a bare 'Region/City' string.
            timezone = create_chat_completion(
                long_lived_token,
                prompt="Find timezone in the format 'Region/City'. If there is no exact match, please return a subjective answer based on the data you received",
                message=f"Region: {region}, Country: {country}",
            )
            print("- Timezone:", timezone)
            print()
            naas.set_remote_timezone(timezone)
            pdump(output_dir, timezone, "timezone")
            naas.dependency.add(os.path.join(output_dir, "timezone.pickle"))

    # Save secrets: one secret per entity, plus the un-suffixed default secret for the first entity.
    for secret_name, secret_value in (
        ("LINKEDIN_LI_AT", li_at),
        ("LINKEDIN_JSESSIONID", JSESSIONID),
    ):
        if secret_value != "NA":
            print(f"Secret '{secret_name}' to be added:")
            naas.secret.add(f"{secret_name}_{entity_code.upper()}", secret_value)
            if index == 0:
                naas.secret.add(secret_name, secret_value)
    # Entities without their own credentials fall back to the default secrets.
    if li_at == "NA":
        li_at = naas.secret.get("LINKEDIN_LI_AT")
    if JSESSIONID == "NA":
        JSESSIONID = naas.secret.get("LINKEDIN_JSESSIONID")
    print()

    # Create notebook steps: the three engines receive the same parameters.
    step_parameters = {
        "datalake_dir": datalake_dir,
        "spreadsheet_url": abi_spreadsheet,
        "entity_name": entity_name,
        "emails": emails,
        "linkedin_url": linkedin_url,
        "li_at": li_at,
        "JSESSIONID": JSESSIONID,
        "entity_dir": entity_dir,
    }
    content = _engine_step("📲 Content", "content-engine", step_parameters)
    growth = _engine_step("Growth", "growth-engine", step_parameters)
    sales = _engine_step("Sales", "sales-engine", step_parameters)

    # Run Pipeline: content, then growth, then sales.
    pipeline = Pipeline()
    pipeline >> content >> growth >> sales >> End()
    pipeline.run(outputs_path=os.path.join(entity_dir, "pipeline_executions"))

## Output

### Schedule pipeline

In [5]:
# Disabled cell: every statement below is commented out, so nothing is scheduled when the notebook runs.
# NOTE(review): `naas.scheduler.add(cron=cron)` is called without `file_path`, although `file_path` is
# printed and used by the delete example — confirm which notebook is meant to be scheduled before enabling.
# # Init
# engine_name = "content-engine"
# engine_model_dir = os.path.join(naas_data_product.MODELS_PATH, engine_name)

# # Schedule pipeline
# file_path = os.path.join(engine_model_dir, "__pipeline__.ipynb")
# cron = "0 8 * * *"
# print("⏰ Scheduler:", file_path)
# naas.scheduler.add(cron=cron)
# print()

# # Uncomment the line below to delete your automation
# # naas.scheduler.delete(file_path)

# # Add dependencies
# files = glob.glob(os.path.join(engine_model_dir, "pipeline_templates", "*.ipynb")) + [os.path.join(engine_model_dir, "__plugin__.ipynb")]
# for file in files:
#     print("🔗 Dependency:", file)
#     naas.dependency.add(file)
#     print()
                  
#     # Uncomment the line below to delete your dependencies
#     # naas.dependency.delete(file)

### Add utils to dependencies

In [6]:
# Disabled cell: every statement below is commented out. When enabled, it registers each
# notebook found in `naas_data_product.UTILS_PATH` as a naas dependency.
# import naas
# import glob
# import naas_data_product

# files = glob.glob(f"{naas_data_product.UTILS_PATH}/*.ipynb")

# for file in files: 
#     naas.dependency.add(file)
    
# #     # Uncomment the line below to delete your dependencies
# #     naas.dependency.delete(file)