<img width="8%" alt="Content" src="https://naasai-public.s3.eu-west-3.amazonaws.com/abi-demo/content_creation.png" style="border-radius: 15%">

# Content - Create POSTS database

**Tags:** #content #posts #database

**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel/)

**Description:** This notebook generates OBG POSTS using the data extracted by the configured connections. Currently, it only supports LinkedIn posts.

## Input

### Import libraries

In [None]:
from naas_drivers import gsheet
import pandas as pd
import os
from datetime import date
import naas_data_product
import glob

### Setup variables
**Inputs**
- `entity_index`: Entity index.
- `entity_dir`: Entity directory.
- `entity_name`: Entity name.
- `input_dir`: Input directory to retrieve file from.
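- `file_name`: Name of the file to be retrieved.
- `force_update`: When `True`, rebuild the posts from every matching file found under the entity directory instead of loading only the file stored in `input_dir`.
- `api_key`: Naas API token (read from the `NAAS_API_TOKEN` secret) used to call the Naas Chat API.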

**Outputs**
- `spreadsheet_url`: Google Sheets spreadsheet URL.
- `sheet_name`: Google Sheets sheet name.
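- `output_dir`: Output directory.
- `file_content`: Name of the file saved locally in the output directory.
- `datalake_dir`: Datalake directory (read from the `ABI_DATALAKE_DIR` secret) passed to `save_data`.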

In [None]:
# Inputs
# NOTE(review): `pload` and `naas` are not imported in this notebook; presumably
# they are made available by `import naas_data_product` — confirm.
# Index of the entity whose settings are read from OUTPUTS_PATH/entities/<entity_index>.
entity_index = "0"
# Entity settings; each falls back to "" when `pload` returns a falsy value.
entity_dir = pload(os.path.join(naas_data_product.OUTPUTS_PATH, "entities", entity_index), "entity_dir") or ""
entity_name = pload(os.path.join(naas_data_product.OUTPUTS_PATH, "entities", entity_index), "entity_name") or ""
# Today's content-engine folder of the entity (same folder as `output_dir` below).
input_dir = os.path.join(entity_dir, "content-engine", date.today().isoformat())
file_name = "linkedin_posts"
# When True, `get_posts_data` rebuilds the posts from all files found under `entity_dir`.
force_update = False
# Token passed to `create_naas_chat_completion` by `enrich_content`.
api_key = naas.secret.get('NAAS_API_TOKEN')

# Outputs
# URL of the Google Sheets spreadsheet holding the POSTS sheet ("" when not configured).
spreadsheet_url = pload(os.path.join(naas_data_product.OUTPUTS_PATH, "entities", entity_index), "abi_spreadsheet") or ""
sheet_name = "POSTS"
output_dir = os.path.join(entity_dir, "content-engine", date.today().isoformat())
file_content = "posts"
# Directory handed to `save_data` in the last cell.
datalake_dir = naas.secret.get("ABI_DATALAKE_DIR")

## Model

### Get content

In [None]:
# Read the current POSTS sheet; anything that is not a DataFrame (e.g. an error
# payload returned by the driver) is replaced by an empty DataFrame.
df_init = gsheet.connect(spreadsheet_url).get(sheet_name=sheet_name)
df_init = df_init if isinstance(df_init, pd.DataFrame) else pd.DataFrame()
print(f"- Posts db (init): {len(df_init)}")
df_init.head(1)

### Get posts

In [None]:
def get_posts_data(
    input_dir,
    file_name,
    df_init,
    entity_dir,
    force_update
):
    """Load the raw LinkedIn posts that must be added to the POSTS database.

    Two modes:
    - Incremental (``df_init`` already has an "ID" column and ``force_update``
      is falsy): only the file ``file_name`` stored in ``input_dir`` is loaded.
    - Rebuild (otherwise): every file whose name starts with ``file_name``
      anywhere under ``entity_dir`` is loaded, most recent path first, and the
      posts are de-duplicated on "ACTIVITY_ID" keeping the first occurrence.
      Files that already contain an "ENTITY" column are ignored.

    ``pload`` is defined outside this notebook. It is treated here as returning
    either a DataFrame or a falsy value when nothing could be loaded (the setup
    cell relies on ``pload(...) or ""``).

    Args:
        input_dir: Directory holding today's extraction.
        file_name: Base name of the file(s) to load.
        df_init: Current content of the POSTS sheet.
        entity_dir: Root directory of the entity, searched recursively.
        force_update: Force the rebuild mode.

    Returns:
        A DataFrame with a fresh 0..n-1 index (empty when nothing was found).
    """
    # Incremental mode: a missing file yields an empty frame instead of a crash.
    if 'ID' in df_init.columns and not force_update:
        df = pload(input_dir, file_name)
        if not isinstance(df, pd.DataFrame):
            df = pd.DataFrame()
        return df.reset_index(drop=True)

    # Rebuild mode: collect every historical extraction.
    pattern = os.path.join(entity_dir, "**", f"{file_name}*")
    files = sorted(glob.glob(pattern, recursive=True), reverse=True)
    frames = []
    seen_dirs = set()
    for file in files:
        # dirname is safe even when `file_name` also appears earlier in the path.
        file_dir = os.path.dirname(file)
        if file_dir in seen_dirs:
            # Several matches in one directory all resolve to the same load.
            continue
        seen_dirs.add(file_dir)
        tmp_df = pload(file_dir, file_name)
        if not isinstance(tmp_df, pd.DataFrame):
            continue
        if "ENTITY" not in tmp_df.columns:
            frames.append(tmp_df)

    if not frames:
        return pd.DataFrame()
    df = pd.concat(frames)
    if "ACTIVITY_ID" in df.columns:
        df = df.drop_duplicates("ACTIVITY_ID", keep='first')
    return df.reset_index(drop=True)

# Load the posts to process and display all of them in the cell output.
df_posts = get_posts_data(
    input_dir=input_dir,
    file_name=file_name,
    df_init=df_init,
    entity_dir=entity_dir,
    force_update=force_update,
)
print(f"- New posts published: {len(df_posts)}")
df_posts.head(len(df_posts))

### Cleaning data

In [None]:
# Metadata columns carried over from the existing POSTS sheet ("ID" is the join key).
_META_COLUMNS = ["ID", "CONCEPT", "SENTIMENT", "TARGET", "OBJECTIVE"]

# Timestamp layout of the first 19 characters of the date columns.
_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

# Column order of the POSTS database ("SCENARIO_ORDER" is appended afterwards).
_POSTS_COLUMNS = [
    "ENTITY",
    "SCENARIO",
    "SOURCE",
    "PUBLISHED_DATE",
    "DATE",
    "TIME",
    "ID",
    "TITLE",
    "TEXT",
    "CONCEPT",
    "SENTIMENT",
    "TARGET",
    "OBJECTIVE",
    "VIEWS",
    "LIKES",
    "COMMENTS",
    "SHARES",
    "ENGAGEMENTS",
    "ENGAGEMENT_SCORE",
    "TYPE",
    'AUTHOR_NAME',
    'AUTHOR_URL',
    "LENGTH",
    "PEOPLE_MENTIONED",
    "ORGANIZATION_MENTIONED",
    "CONTENT_TITLE_SHARED",
    "CONTENT_URL_SHARED",
    "LINKEDIN_LINKS",
    "IMAGE_SHARED",
    "TAGS",
    "URL",
    "DATE_EXTRACT"
]


def _localize_timestamp(series):
    """Read timestamps as Europe/Paris local time and re-express them in TIMEZONE."""
    # `pytz` and `TIMEZONE` are expected to be provided by the notebook environment.
    naive = pd.to_datetime(series.str[:19], format=_DATE_FORMAT)
    localized = naive.dt.tz_localize(pytz.timezone("Europe/Paris"))
    return localized.dt.tz_convert(TIMEZONE).dt.strftime("%Y-%m-%d %H:%M:%S%z")


def _clean_title(title, text):
    """Return a usable title, falling back to the post text when the title is blank."""
    if not isinstance(title, str) or title == "":
        title = text
    if isinstance(title, str) and title.startswith("\n"):
        # The title starts with a blank line: use the line that follows it.
        title = title.split("\n")[1]
    if not isinstance(title, str) or not title.strip():
        return text
    return title


def _hash_activity_url(url, fallback):
    """Hash the activity id embedded in a post URL, or return `fallback` if there is none."""
    marker = ":activity:"
    if isinstance(url, str) and marker in url:
        return create_sha_256_hash(str(url.split(marker)[1].split("/")[0]))
    return fallback


def _prepare_new_posts(df_new, entity_name):
    """Clean the raw extraction and map it to the POSTS column names."""
    df = df_new.copy()
    if len(df) == 0:
        return df

    # Format dates
    df["PUBLISHED_DATE"] = _localize_timestamp(df["PUBLISHED_DATE"])
    df["DATE_EXTRACT"] = _localize_timestamp(df["DATE_EXTRACT"])

    # Add "ENGAGEMENTS"
    df["ENGAGEMENTS"] = df["LIKES"] + df["COMMENTS"] + df["SHARES"]

    # Native videos without a title are lives
    df.loc[(df["TITLE"].astype(str) == 'None') & (df["CONTENT"] == 'Video (native)'), "TITLE"] = "Live"
    df.loc[df["TITLE"].astype(str) == 'Live', "TEXT"] = "Live"

    # Articles: reference the article URL in the text/title. The masks are
    # computed before any assignment so an article without text gets the
    # reference once, not twice.
    is_article = df["CONTENT"] == 'Article'
    no_text = df["TEXT"].astype(str) == 'None'
    no_title = df["TITLE"].astype(str) == 'None'
    article_ref = "Article: " + df["CONTENT_URL"]
    df.loc[is_article & ~no_text, "TEXT"] = df["TEXT"].astype(str) + "\n" + article_ref
    df.loc[is_article & no_text, "TEXT"] = article_ref
    df.loc[is_article & no_title, "TITLE"] = article_ref

    # Rename columns
    to_rename = {
        "POST_URL": "URL",
        "CHARACTER_COUNT": "LENGTH",
        "ACTIVITY_ID": "ID",
        "PROFILE_MENTION": "PEOPLE_MENTIONED",
        "COMPANY_MENTION": "ORGANIZATION_MENTIONED",
        "LINKS": "LINKEDIN_LINKS",
        "IMAGE_URL": "IMAGE_SHARED",
        "CONTENT": "TYPE",
        "CONTENT_TITLE": "CONTENT_TITLE_SHARED",
        "CONTENT_URL": "CONTENT_URL_SHARED",
    }
    df = df.rename(columns=to_rename)
    df = df.dropna(subset=["ID"])
    # These columns are rebuilt below.
    df = df.drop(columns=["ENTITY", "SCENARIO", "SOURCE", "DATE", "TIME"], errors="ignore")

    # Column-wise transforms (they also work when `dropna` removed every row).
    df["ID"] = df["ID"].map(create_sha_256_hash)
    for column in ["ORGANIZATION_MENTIONED", "LINKEDIN_LINKS", "TAGS"]:
        df[column] = df[column].map(lambda value: str(value).replace("[]", "NA"))
    df["ENTITY"] = entity_name

    # Select
    to_select = [
        "ENTITY",
        "ID",
        "PUBLISHED_DATE",
        "TITLE",
        "TEXT",
        "VIEWS",
        "LIKES",
        "COMMENTS",
        "SHARES",
        "ENGAGEMENTS",
        "ENGAGEMENT_SCORE",
        "TYPE",
        'AUTHOR_NAME',
        'AUTHOR_URL',
        "LENGTH",
        "PEOPLE_MENTIONED",
        "ORGANIZATION_MENTIONED",
        "CONTENT_TITLE_SHARED",
        "CONTENT_URL_SHARED",
        "LINKEDIN_LINKS",
        "IMAGE_SHARED",
        "TAGS",
        "URL",
        "DATE_EXTRACT"
    ]
    df = df[to_select].copy()

    # Add new data (the final column order is enforced by `create_db`)
    published = pd.to_datetime(df['PUBLISHED_DATE'].str[:19], format=_DATE_FORMAT)
    df["SCENARIO"] = published.dt.strftime("W%W-%Y")
    df["SOURCE"] = "LinkedIn"
    df["DATE"] = published.dt.strftime("%a. %d %b.")
    df["TIME"] = published.dt.strftime('%HH%M')

    # Manage empty title
    df["TITLE"] = [_clean_title(title, text) for title, text in zip(df["TITLE"], df["TEXT"])]
    return df


def _prepare_init_posts(df_init, df_new):
    """Align the existing POSTS sheet with the current schema."""
    to_rename = {
        "CONTENT": "TEXT",
        "CONTENT_LENGTH": "LENGTH",
        "CONTENT_URL": "URL",
        "KEYWORDS": "TAGS",
    }
    # `rename` returns a new frame, so the caller's DataFrame is never mutated.
    df_init = df_init.rename(columns=to_rename)
    if len(df_init) == 0:
        return df_init

    # Sheets without "ID" lack the columns listed below: create them.
    if "ID" not in df_init.columns and "URL" in df_init.columns:
        for column in [
            "PEOPLE_MENTIONED",
            "ORGANIZATION_MENTIONED",
            "CONTENT_TITLE_SHARED",
            "CONTENT_URL_SHARED",
            "LINKEDIN_LINKS",
            "IMAGE_SHARED",
        ]:
            df_init[column] = "NA"
        df_init["TYPE"] = "Text"

        # The author is taken from the first new post showing any activity;
        # "NA" is used when no such post is available.
        author_name, author_url = "NA", "NA"
        needed = {"ENGAGEMENTS", "VIEWS", "AUTHOR_NAME", "AUTHOR_URL"}
        if len(df_new) > 0 and needed <= set(df_new.columns):
            activity = (df_new["ENGAGEMENTS"] + df_new["VIEWS"]).fillna(0).astype(int)
            active = df_new[activity > 0].reset_index(drop=True)
            if len(active) > 0:
                author_name = active.loc[0, "AUTHOR_NAME"]
                author_url = active.loc[0, "AUTHOR_URL"]
        df_init["AUTHOR_NAME"] = author_name
        df_init["AUTHOR_URL"] = author_url
        df_init["ENGAGEMENTS"] = df_init["LIKES"] + df_init["COMMENTS"] + df_init["SHARES"]

    # IDs are (re)computed from the activity id found in the URL so they match
    # the IDs of the new posts; a row without such a URL keeps its current ID.
    if "URL" in df_init.columns:
        current_ids = df_init["ID"] if "ID" in df_init.columns else [None] * len(df_init)
        df_init["ID"] = [
            _hash_activity_url(url, fallback)
            for url, fallback in zip(df_init["URL"], current_ids)
        ]

    # If a column does not exist, init value to be determined (TBD)
    for column in _META_COLUMNS:
        if column not in df_init.columns:
            df_init[column] = "TBD"
    return df_init


def create_db(
    df_new,
    df_init,
    entity_name
):
    """Build the POSTS database from the new posts and the existing sheet.

    New posts are cleaned and mapped to the POSTS schema, enriched with the
    metadata ("CONCEPT", "SENTIMENT", "TARGET", "OBJECTIVE") already stored for
    the same "ID" in ``df_init`` ("TBD" when unknown), stacked on top of the
    existing rows and de-duplicated on "URL", new posts taking precedence.

    Relies on ``pytz``, ``TIMEZONE`` and ``create_sha_256_hash`` being
    available in the notebook namespace.

    Args:
        df_new: Raw posts as returned by ``get_posts_data`` (may be empty).
        df_init: Current content of the POSTS sheet (may be empty).
        entity_name: Value written in the "ENTITY" column of the new posts.

    Returns:
        A DataFrame sorted by "PUBLISHED_DATE" (descending) then "ENTITY",
        holding the POSTS columns plus "SCENARIO_ORDER"; an empty DataFrame
        when there are neither new nor existing posts.
    """
    # Prep data new posts
    df = _prepare_new_posts(df_new, entity_name)

    # Prep data init
    df_init = _prepare_init_posts(df_init, df)

    # Merge to get meta data (only when there is something on both sides)
    if len(df) > 0:
        if len(df_init) > 0:
            df = pd.merge(df, df_init[_META_COLUMNS], on="ID", how="left")
        for column in _META_COLUMNS:
            if column in df.columns:
                df[column] = df[column].fillna("TBD")
            else:
                df[column] = "TBD"

    # Concat new posts with init
    frames = [frame for frame in (df, df_init) if len(frame) > 0]
    if not frames:
        return pd.DataFrame()
    df = pd.concat(frames, axis=0).reset_index(drop=True)

    # Drop duplicates: new posts come first, so they win over existing rows
    df = df.drop_duplicates("URL", keep='first')

    # Clean str columns
    to_clean = [
        "PEOPLE_MENTIONED",
        "ORGANIZATION_MENTIONED",
        "CONTENT_TITLE_SHARED",
        "CONTENT_URL_SHARED",
        "LINKEDIN_LINKS",
        "IMAGE_SHARED",
        "TAGS",
    ]
    for column in to_clean:
        df[column] = df[column].astype(str).str.replace("None", "NA")

    # Order
    df = df[_POSTS_COLUMNS].copy()

    # Sort values ("SCENARIO_ORDER" is part of the output but not a sort key)
    df["SCENARIO_ORDER"] = pd.to_datetime(df['PUBLISHED_DATE'].str[:19], format=_DATE_FORMAT).dt.strftime("%Y%W")
    df = df.sort_values(by=["PUBLISHED_DATE", "ENTITY"], ascending=[False, True])
    return df.reset_index(drop=True)
    
# Build the POSTS database and display as many rows as there are new posts.
db_content = create_db(
    df_new=df_posts,
    df_init=df_init,
    entity_name=entity_name,
)
print(f"- Post db: {len(db_content)}")
db_content.head(len(df_posts))

### Enrich Content with "Concept", "Sentiment", "Objective" and "Target"

In [None]:
def enrich_content(
    df_init,
    api_key,
    output_dir,
):
    """Fill the "CONCEPT", "SENTIMENT", "TARGET" and "OBJECTIVE" columns.

    Every row having at least one of these columns equal to "TBD" is sent to
    the Naas Chat API; the entities found in the answer are written as
    "name: summary" items joined with "|". Only the fields that are still
    "TBD" are written, so values already present on a row are preserved.
    A field for which nothing was found, or whose API call/parsing failed,
    is set to "NA" (failures are printed, not raised).

    Relies on ``get_dict_from_df``, ``create_naas_chat_completion`` and
    ``pdump`` being available in the notebook namespace.

    Args:
        df_init: POSTS database as returned by ``create_db``.
        api_key: Token passed to ``create_naas_chat_completion``.
        output_dir: Directory passed to ``get_dict_from_df`` and ``pdump``.

    Returns:
        A copy of ``df_init`` with the metadata columns filled and a fresh index.
    """
    # Local import: `json` is not imported at the top of this notebook.
    import json

    # Init
    df = df_init.copy()
    if len(df) == 0:
        # Nothing to enrich (e.g. first run without any post).
        print("-> Content to be updated:", 0)
        return df.reset_index(drop=True)

    # Entity label returned by the model -> column to fill
    label_to_column = {
        "Concept": "CONCEPT",
        "Sentiment": "SENTIMENT",
        "Target": "TARGET",
        "Objective": "OBJECTIVE",
    }
    
    # Filter data
    filter_df = df[df[list(label_to_column.values())].isin(["TBD"]).any(axis=1)]
    print("-> Content to be updated:", len(filter_df))

    # Get Concept, Sentiment, Objective and Target dicts.
    # NOTE(review): the returned dicts are not used here; the calls are kept in
    # case `get_dict_from_df` has side effects (it receives `output_dir`) — confirm.
    for column in ("CONCEPT", "SENTIMENT", "OBJECTIVE", "TARGET"):
        get_dict_from_df(df, column, "ID", f"content_{column.lower()}", output_dir)
    
    # Prompts
    concept_definition = """
    Concept refers to abstract or general ideas derived from specific instances or occurrences.
    Concept are not People, Organization or Product.
    It can represent theories, ideas, thoughts, or principles that are used to explain or interpret information. 
    Identify as many as possible
    """

    sentiment_definition = """
    Sentiment represents the emotional tone or attitude expressed in a content or in a piece of content to understand the feelings or opinions towards a particular subject.
    It could be:
    - "Praise": Highly positive that expresses admiration or approval. This sentiment often includes compliments or positive feedback.
    - "Supportive": Positive that may not necessarily contain high praise but show agreement, support, or encouragement.
    - "Neutral": Neither positive nor negative, often factual statements or questions without any clear positive or negative connotations.
    - "Constructive": May seem negative but are intended to provide constructive feedback or suggest improvements.
    - "Disapproving": Express disagreement, criticism, or negative feedback.
    Identify as many as possible from the list above
    """

    target_definition = """
    Targets are Professional Role targeted by the content and classified as follows:
    - "Entry-Level": Any occupation with Intern/Internship, Trainee, Junior
    - "Professional/Staff": [Role] Specialist, [Role] Analyst, [Role] Coordinator.
    - "Senior Professional/Staff": Senior [Role] Specialist, Senior [Role] Analyst.
    - "Lead/Supervisor": Team Lead, Supervisor.
    - "Manager": Manager, [Department] Manager.
    - "Senior Manager": Senior Manager, Director.
    - "Executive": Vice President, Chief [Role] Officer (CFO, CTO, etc.).
    - "Top Executive": President, CEO, Managing Director.
    Identify as many as possible from the list above
    """

    objective_definition = """
    Ojbective represents the goals or purposes that the content aims to achieve. 
    It could be:
    - "Brand Awareness": Increase visibility and recognition of a brand or company.
    - "Product Promotion": Promote a specific product or service.
    - "Engagement": Engage with the audience, encouraging likes, comments, shares, or other forms of interaction.
    - "Education": Educate the audience about a certain topic, industry trend, or useful information.
    - "Lead Generation": Attract potential customers and collect their contact information.
    - "Customer Retention": Maintain relationships with existing customers, keeping them interested and loyal.
    - "Reputation Management": Manage the brand's reputation, either by addressing customer issues or sharing positive news.
    - "Event Promotion": Promote an upcoming event.
    - "Driving Traffic": Drive traffic to a website or blog.
    - "Community Building": Foster a sense of community among followers or customers.
    Identify as many as possible from the list above
    """

    system_msg = "You are a helpful IT-project and account management expert who extracts information from documents. In this case, document is a LinkedIn post."
    content_prompt = """
    From the text below, extract the following Entitie described in the mentioned format
    0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
    1. Look for these Entity types in the text and generate as comma-separated format similar to entity type.
        Do not create new entity types that aren't mentioned below:
        Entity Types:
        label:'Concept',name:string;summary:string //[concept_definition]
        label:'Sentiment',name:string;summary:string //[sentiment_definition]
        label:'Target',name:string;summary:string //[target_definition]
        label:'Objective',name:string;summary:string //[objective_definition]

    2. The output should look like :
    {
        "entities": [{"label":"Concept","name":string,"summary":string}],
    }
    
    Text:
    [text]
    """
    content_prompt = content_prompt.replace("[concept_definition]", concept_definition)
    content_prompt = content_prompt.replace("[sentiment_definition]", sentiment_definition)
    content_prompt = content_prompt.replace("[target_definition]", target_definition)
    content_prompt = content_prompt.replace("[objective_definition]", objective_definition)
    
    # Loop on posts
    for row in filter_df.itertuples():
        # Init values
        index = row.Index
        content_id = row.ID
        title = row.TITLE
        # A missing text must not break the prompt substitution.
        text = row.TEXT if isinstance(row.TEXT, str) else ""
        
        # Replace value in prompt
        prompt_msg = content_prompt.replace("[text]", text)

        # Call the Naas Chat API
        print(f"🤖 Finding Concept, Sentiment, Target, Objective from posts: '{title}' ({content_id})")
        found = {label: [] for label in label_to_column}
        try:
            result = create_naas_chat_completion(
                api_key,
                prompt=system_msg,
                message=prompt_msg,
            )
            res_json = json.loads(result)
            pdump(output_dir, res_json, f"kgd_content_{content_id}")
            entities = res_json.get("entities") or []
            print(entities)
            for entity in entities:
                label = entity.get("label")
                name = entity.get("name")
                summary = entity.get("summary")
                print(f'- {label}\n{name}: {summary}')
                if label in found:
                    found[label].append(f"{name}: {summary}")
        except Exception as err:
            # Best effort: one failing post must not stop the enrichment.
            print(err)

        # Only fill what is still to be determined; keep values already set.
        for label, column in label_to_column.items():
            if getattr(row, column) == "TBD":
                df.loc[index, column] = "|".join(found[label]) if found[label] else "NA"
        print()
    return df.reset_index(drop=True)
    
# Enrich the POSTS database with the metadata extracted by the chat model.
df_content = enrich_content(df_init=db_content, api_key=api_key, output_dir=output_dir)
# df_content.head(1)

## Output

### Save data

In [None]:
# Store the enriched posts in `output_dir` under the name `file_content`.
# NOTE(review): `pdump` is defined outside this notebook — presumably a pickle dump; confirm.
pdump(output_dir, df_content, file_content)

### Send "Posts" to Google Sheets spreadsheet

In [None]:
# Push the enriched posts to the `sheet_name` sheet of the spreadsheet.
# NOTE(review): `send_data_to_gsheet` is defined outside this notebook; it also
# receives the initial sheet content (`df_init`) — presumably to skip the
# update when nothing changed; confirm.
send_data_to_gsheet(df_content, df_init, spreadsheet_url, sheet_name)

### Save table data

In [None]:
# Hand the enriched posts to `save_data` together with the datalake directory,
# the entity name and the table/file name.
# NOTE(review): `save_data` is defined outside this notebook; its storage layout is not visible here — confirm.
save_data(df_content, datalake_dir, entity_name, file_content)