<img width="8%" alt="Content" src="https://naasai-public.s3.eu-west-3.amazonaws.com/abi-demo/content_creation.png" style="border-radius: 15%">

# Content - Create POSTS database

**Tags:** #content #posts #database

**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel/)

**Description:** This notebook generates OBG POSTS using the data extracted by the configured connections. Currently, it only supports LinkedIn posts.

## Input

### Import libraries

In [None]:
from naas_drivers import gsheet
import pandas as pd
import os
from datetime import date
import naas_data_product

### Setup variables
**Inputs**
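- `entity_index`: Index of the entity; it is the name of the folder under the `entities` outputs directory from which the entity settings are loaded.
- `entity_dir`: This variable represents the entity directory.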
- `entity_name`: This variable holds the entity name.
- `input_dir`: Input directory to retrieve file from.
- `file_name`: Name of the file to be retrieved.

**Outputs**
- `spreadsheet_url`: Google Sheets spreadsheet URL.
- `sheet_name`: Google Sheets sheet name.
- `output_dir`: Output directory where the file is saved.
- `file_content`: Name of the file to be saved locally.

In [None]:
# Inputs
entity_index = "0"
# Folder under the outputs path that holds the stored settings of the selected entity.
entity_settings_dir = os.path.join(naas_data_product.OUTPUTS_PATH, "entities", entity_index)
entity_dir = pload(entity_settings_dir, "entity_dir") or ""
entity_name = pload(entity_settings_dir, "entity_name") or ""
# Files are read from and written to a folder named after today's ISO date.
run_date = date.today().isoformat()
input_dir = os.path.join(entity_dir, "content-engine", run_date)
file_name = "linkedin_posts"

# Outputs
spreadsheet_url = pload(entity_settings_dir, "abi_spreadsheet") or ""
sheet_name = "POSTS"
output_dir = os.path.join(entity_dir, "content-engine", run_date)
file_content = "posts"

## Model

### Get content

In [None]:
# Read the current content of the sheet; anything that is not a DataFrame
# is replaced by an empty one so the rest of the notebook can rely on the type.
df_init = gsheet.connect(spreadsheet_url).get(sheet_name=sheet_name)
df_init = df_init if isinstance(df_init, pd.DataFrame) else pd.DataFrame()
print("- Posts db (init):", len(df_init))
df_init.head(1)

### Get posts

In [None]:
# Load the posts stored in today's input directory.
df_posts = pload(input_dir, file_name)
# `pload` may return a non-DataFrame value (the setup cell guards its results
# with `or ""`). Fall back to an empty frame, exactly as done for `df_init`,
# so that `len(df_posts)` and `create_db` do not crash when nothing was loaded.
if not isinstance(df_posts, pd.DataFrame):
    df_posts = pd.DataFrame()
print("- New posts published:", len(df_posts))
df_posts.head(len(df_posts))

### Cleaning data

In [None]:
# Layout of the leading "YYYY-MM-DD HH:MM:SS" part of every date string handled here.
_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

# Raw extraction column -> POSTS database column.
_RENAMED_COLUMNS = {
    "POST_URL": "CONTENT_URL",
    "TEXT": "CONTENT",
    "CHARACTER_COUNT": "CONTENT_LENGTH",
    "TAGS": "KEYWORDS",
}

# Columns kept from the cleaned extraction, in output order
# (SCENARIO, SOURCE, DATE and TIME are inserted afterwards).
_SELECTED_COLUMNS = [
    "ENTITY",
    "PUBLISHED_DATE",
    "TITLE",
    "CONTENT",
    "CONTENT_LENGTH",
    "KEYWORDS",
    "VIEWS",
    "LIKES",
    "COMMENTS",
    "SHARES",
    "ENGAGEMENT_SCORE",
    "CONTENT_URL",
    "DATE_EXTRACT",
]


def _parse_timestamps(values):
    """Parse the first 19 characters of each string as a naive timestamp."""
    return pd.to_datetime(values.str[:19], format=_TIMESTAMP_FORMAT)


def _localize_from_paris(values):
    """Read the strings as Europe/Paris wall-clock times and re-express them in TIMEZONE."""
    # The zone is passed by name: pandas resolves it itself, so this cell does
    # not depend on a `pytz` import that the notebook never performs.
    # NOTE(review): TIMEZONE is not defined in this notebook's visible cells;
    # it is presumably injected by `naas_data_product` - confirm.
    return (
        _parse_timestamps(values)
        .dt.tz_localize("Europe/Paris")
        .dt.tz_convert(TIMEZONE)
        .dt.strftime(_TIMESTAMP_FORMAT + "%z")
    )


def _fill_live_and_article_fields(df):
    """Fill missing TITLE/TEXT of native videos and articles, in place."""
    title_missing = df["TITLE"].astype(str) == "None"

    # Untitled native videos are labelled "Live", and "Live" posts get "Live" as text.
    df.loc[title_missing & (df["CONTENT"] == "Video (native)"), "TITLE"] = "Live"
    df.loc[df["TITLE"].astype(str) == "Live", "TEXT"] = "Live"

    # Both masks are computed BEFORE any article text is written. Evaluating
    # "text is present" after the fill would also match the rows that were
    # just filled and append the article link to them a second time.
    is_article = df["CONTENT"] == "Article"
    text_missing = df["TEXT"].astype(str) == "None"
    article_link = "Article: " + df["CONTENT_URL"]
    df.loc[is_article & text_missing, "TEXT"] = article_link
    df.loc[is_article & title_missing, "TITLE"] = article_link
    df.loc[is_article & ~text_missing, "TEXT"] = (
        df["TEXT"].astype(str) + "\nArticle: " + df["CONTENT_URL"]
    )


def _line_after_leading_newline(title):
    """Return the line following a leading newline; other values are returned unchanged."""
    # Non-string titles (e.g. None/NaN) are left alone instead of raising.
    if isinstance(title, str) and title.startswith("\n"):
        return title.split("\n")[1]
    return title


def _fill_empty_titles(df):
    """Replace empty titles with the post content, in place."""
    df.loc[df["TITLE"] == "", "TITLE"] = df["CONTENT"]
    df["TITLE"] = df["TITLE"].map(_line_after_leading_newline)
    # Stripping a leading newline can itself yield an empty title
    # (e.g. "\n\nfoo"), so the content fallback is applied once more.
    df.loc[df["TITLE"] == "", "TITLE"] = df["CONTENT"]


def create_db(
    df_new,
    df_init,
    entity_name
):
    """Build the POSTS database from newly extracted posts and the existing database.

    Args:
        df_new: Newly extracted posts. When it has rows it must provide the
            columns PUBLISHED_DATE, DATE_EXTRACT, TITLE, TEXT, CONTENT,
            CONTENT_URL, POST_URL, CHARACTER_COUNT, TAGS, VIEWS, LIKES,
            COMMENTS, SHARES and ENGAGEMENT_SCORE. It is not modified.
        df_init: Existing database rows, appended after the new ones.
        entity_name: Value written in the ENTITY column of the new rows.

    Returns:
        A DataFrame with a fresh 0..n-1 index. When it has rows, they are
        de-duplicated on CONTENT_URL (new rows win over existing ones), a
        SCENARIO_ORDER column (year + week number) is added, and rows are
        sorted by SCENARIO_ORDER descending, ENTITY ascending and
        PUBLISHED_DATE descending.
    """
    if len(df_new) > 0:
        df = df_new.copy()

        # Format published date
        df["PUBLISHED_DATE"] = _localize_from_paris(df["PUBLISHED_DATE"])
        df["DATE_EXTRACT"] = _localize_from_paris(df["DATE_EXTRACT"])

        # Cleaning: missing titles/texts of videos and articles
        _fill_live_and_article_fields(df)

        # Cleaning: rename columns + None to NA.
        # The raw CONTENT (post type) and CONTENT_URL (linked resource) are
        # dropped because TEXT and POST_URL take over those names.
        df = df.drop(["CONTENT_URL", "CONTENT"], axis=1).rename(columns=_RENAMED_COLUMNS)
        # NOTE(review): this replaces every "None" substring, not only values
        # that are exactly "None" - confirm that is intended.
        df["KEYWORDS"] = df["KEYWORDS"].astype(str).str.replace("None", "NA")
        df.insert(loc=0, column="ENTITY", value=entity_name)

        # Select
        df = df[_SELECTED_COLUMNS].copy()

        # Add new data (all derived from the published date, parsed once)
        published = _parse_timestamps(df["PUBLISHED_DATE"])
        df.insert(loc=1, column="SCENARIO", value=published.dt.strftime("W%W-%Y"))
        df.insert(loc=2, column="SOURCE", value="LinkedIn")
        df.insert(loc=4, column="DATE", value=published.dt.strftime("%a. %d %b."))
        df.insert(loc=5, column="TIME", value=published.dt.strftime("%HH%M"))

        # Manage empty title
        _fill_empty_titles(df)
    else:
        # No new posts: start from a column-less frame so that raw extraction
        # columns of an empty `df_new` do not leak into the database.
        df = pd.DataFrame()

    # Concat with init (new rows first, so they win the de-duplication below)
    df = pd.concat([df, df_init], axis=0, ignore_index=True)
    if len(df) > 0:
        # Drop duplicates
        df = df.drop_duplicates("CONTENT_URL", keep="first")

        # Sort values
        df["SCENARIO_ORDER"] = _parse_timestamps(df["PUBLISHED_DATE"]).dt.strftime("%Y%W")
        df = df.sort_values(
            by=["SCENARIO_ORDER", "ENTITY", "PUBLISHED_DATE"],
            ascending=[False, True, False],
        )
    return df.reset_index(drop=True)
    
# Merge the new posts into the existing database, then preview as many rows
# as there are new posts.
df_content = create_db(df_new=df_posts, df_init=df_init, entity_name=entity_name)
print("- Post db:", len(df_content))
df_content.head(len(df_posts))

## Output

### Save data

In [None]:
# Save the consolidated posts table: `pdump` receives the output directory,
# the DataFrame and the file name `file_content`.
# NOTE(review): `pdump` is not defined in this notebook's visible cells -
# presumably injected by `naas_data_product`; confirm its storage format.
pdump(output_dir, df_content, file_content)

### Send "Content" to Google Sheets spreadsheet

In [None]:
# Send the consolidated posts table to the `sheet_name` sheet of `spreadsheet_url`.
# NOTE(review): `send_data_to_gsheet` is defined outside this notebook;
# `df_init` (the sheet content read earlier) is presumably passed so the
# helper can compare it with the new data before writing - confirm.
send_data_to_gsheet(df_content, df_init, spreadsheet_url, sheet_name)