<img width="8%" alt="Naas.png" src="https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/.github/assets/logos/Naas.png" style="border-radius: 15%">

# Pipeline

**Tags:** #naas #pipeline #jupyter #notebook #dataanalysis #workflow #streamline

**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel)

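**Description:** This notebook builds and runs a pipeline of notebooks: it gets LinkedIn interactions, updates the interactions, growth, leads and leads-companies sheets in Google Sheets, creates analytics, generates a plugin and sends an email notification. It then schedules itself to run every day at 08:30.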

## Input

### Import libraries

In [None]:
from naas.pipeline import (
    Pipeline,
    NotebookStep,
    End,
    ParallelStep,
)
from datetime import date
import os
import glob
import naas
import naas_data_product

### Setup variables

In [None]:
# Datalake: root directory under which this pipeline stores its outputs
datalake_dir = os.path.join("/", "home", "ftp", "abi", "outputs")

# Entity settings directory, read with `pload` further down.
# NOTE(review): `pload` is not imported in this notebook; presumably it is made
# available by `import naas_data_product` - confirm before running elsewhere.
entity_dir = os.path.join(naas_data_product.OUTPUTS_PATH, "entity")

# LinkedIn: cookies are read from Naas secrets, falling back to placeholders
li_at = (
    naas.secret.get("LINKEDIN_LI_AT")
    or "YOUR_LINKEDIN_LI_AT"  # example: AQFAzQN_PLPR4wAAAXc-FCKmgiMit5FLdY1af3-2
)
JSESSIONID = (
    naas.secret.get("LINKEDIN_JSESSIONID")
    or "YOUR_LINKEDIN_JSESSIONID"  # example: ajax:8379907400220387585
)
linkedin_url = pload(entity_dir, "linkedin_url") or "YOUR_LINKEDIN_URL"

# Google Sheets: spreadsheet URL (from Naas secrets) and the sheet names used by the steps
spreadsheet_url = (
    naas.secret.get("ABI_SPREADSHEET")
    or "YOUR_GOOGLE_SPREADSHEET_URL"
)
sheet_content = "CONTENT"
sheet_interaction = "INTERACTIONS"
sheet_growth = "GROWTH"
sheet_leads = "LEADS"
sheet_leads_companies = "LEADS_COMPANIES"

# OpenAI API key (from Naas secrets, placeholder as fallback)
openai_api_key = (
    naas.secret.get("OPENAI_API_KEY")
    or "YOUR_OPENAI_API_KEY"
)

# Notification: recipients loaded from the entity settings, empty list when none are stored
email_to = pload(entity_dir, "emails") or []

# Scheduler: cron expression "minute hour day month weekday" -> every day at 08:30
cron = "30 8 * * *"

## Model

### Define output directory by date

In [None]:
# One output folder per run day: <datalake_dir>/growth-engine/<YYYY-MM-DD>
run_date = date.today().isoformat()
growth_dir = os.path.join(datalake_dir, "growth-engine", run_date)
print("✅ Output directory:", growth_dir)

### Setup notebooks
This section declares every notebook of the pipeline as a step and creates the unique IDs that will be used in the pipeline.

In [None]:
# LinkedIn cookies passed to every step that receives them; unpacked in place
# so each step gets its own parameters dict with the same keys as before.
linkedin_cookies = {"li_at": li_at, "JSESSIONID": JSESSIONID}

# Step "Get LinkedIn interactions": receives the content sheet and the LinkedIn cookies
extraction = NotebookStep(
    name="Get LinkedIn interactions",
    notebook_path="pipeline_templates/LinkedIn_Get_interactions_from_posts.ipynb",
    parameters={
        "spreadsheet_url": spreadsheet_url,
        "sheet_name": sheet_content,
        **linkedin_cookies,
        "output_dir": growth_dir,
    },
)

# Step "Interactions db": reads from and writes to the dated folder, targets the interactions sheet
cleaning = NotebookStep(
    name="Interactions db",
    notebook_path="pipeline_templates/Google_Sheets_Update_interactions_db.ipynb",
    parameters={
        "input_dir": growth_dir,
        "output_dir": growth_dir,
        "spreadsheet_url": spreadsheet_url,
        "sheet_name": sheet_interaction,
    },
)

# Step "Growth db": same directories, targets the growth sheet
growth = NotebookStep(
    name="Growth db",
    notebook_path="pipeline_templates/Google_Sheets_Update_growth_db.ipynb",
    parameters={
        "input_dir": growth_dir,
        "output_dir": growth_dir,
        "spreadsheet_url": spreadsheet_url,
        "sheet_name": sheet_growth,
    },
)

# Step "Leads": receives the OpenAI key, the LinkedIn cookies and the leads sheet name
leads = NotebookStep(
    name="Leads",
    notebook_path="pipeline_templates/Google_Sheets_Update_leads.ipynb",
    parameters={
        "input_dir": growth_dir,
        "openai_api_key": openai_api_key,
        **linkedin_cookies,
        "spreadsheet_url": spreadsheet_url,
        "leads_profiles_name": sheet_leads,
        "output_dir": growth_dir,
    },
)

# Step "Leads companies": like "Leads", plus the leads-companies sheet name
leads_companies = NotebookStep(
    name="Leads companies",
    notebook_path="pipeline_templates/Google_Sheets_Update_leads_companies.ipynb",
    parameters={
        "input_dir": growth_dir,
        "openai_api_key": openai_api_key,
        **linkedin_cookies,
        "spreadsheet_url": spreadsheet_url,
        "leads_companies_name": sheet_leads_companies,
        "leads_profiles_name": sheet_leads,
        "output_dir": growth_dir,
    },
)

# Step "Create analytics": receives the dated folder and the LinkedIn URL
analytics = NotebookStep(
    name="Create analytics",
    notebook_path="pipeline_templates/Plotly_Create_analytics.ipynb",
    parameters={
        "input_dir": growth_dir,
        "linkedin_url": linkedin_url,
        "output_dir": growth_dir,
    },
)

# Step "Generate plugin": the only notebook that lives next to this one
# instead of in pipeline_templates/
plugin = NotebookStep(
    name="Generate plugin",
    notebook_path="__plugin__.ipynb",
    parameters={
        "input_dir": growth_dir,
        "spreadsheet_url": spreadsheet_url,
        "sheet_name": sheet_content,
    },
)

# Step "Send Email notification": receives the dated folder, the datalake root and the recipients
email = NotebookStep(
    name="Send Email notification",
    notebook_path="pipeline_templates/Naas_Send_notification.ipynb",
    parameters={
        "input_dir": growth_dir,
        "datalake_dir": datalake_dir,
        "email_to": email_to,
    },
)

## Output

### Run pipeline

In [None]:
pipeline = Pipeline()

# Link the steps one after another with `>>`, closing the chain with End()
(
    pipeline
    >> extraction
    >> cleaning
    >> growth
    >> leads
    >> leads_companies
    >> analytics
    >> plugin
    >> email
    >> End()
)

# Run the pipeline, passing a sub-folder of the dated output directory as outputs_path
executions_dir = os.path.join(growth_dir, "pipeline_executions")
pipeline.run(outputs_path=executions_dir)

### Add dependencies

In [None]:
# Notebooks registered as Naas dependencies: all pipeline templates,
# all utils notebooks and the plugin notebook.
files = [
    *glob.glob("pipeline_templates/*.ipynb"),
    *glob.glob(f"{naas_data_product.UTILS_PATH}/*.ipynb"),
    "__plugin__.ipynb",
]

for notebook_file in files:
    naas.dependency.add(notebook_file)

### Add scheduler

In [None]:
# Register the schedule with Naas using the cron expression set in "Setup variables".
# NOTE(review): the timezone the cron is evaluated in is not set in this notebook - confirm.
naas.scheduler.add(cron=cron)

# Uncomment the line below to delete your automation
# naas.scheduler.delete()