<img width="8%" alt="Google Sheets.png" src="https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/.github/assets/logos/Google%20Sheets.png" style="border-radius: 15%">

# Google Sheets - Send content database to spreadsheet

**Tags:** #googlesheets #gsheet #data #naas_drivers #operations #snippet

**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel/)

**Description:** This notebook streamlines the process of getting your content published, enhancing it with topics, and sending it to a Google Sheets spreadsheet.

## Input

### Import libraries

In [None]:
from naas_drivers import gsheet
import pandas as pd
import os
from datetime import date
import naas_data_product

### Setup variables
**Inputs**
- `input_dir`: Input directory to retrieve file from.
- `file_name`: Name of the file to be retrieved.
- `openai_api_key`: OpenAI API Key.

**Outputs**
- `spreadsheet_url`: Google Sheets spreadsheet URL.
- `sheet_name`: Google Sheets sheet name.
- `output_dir`: Output directory to save the file to.
- `file_content`: Name of the file to be saved locally.

In [None]:
# Inputs
# The posts file is looked up in a folder named after today's date (ISO format).
input_dir = os.path.join(
    naas_data_product.OUTPUTS_PATH,
    "content-engine",
    str(date.today()),
)
file_name = "linkedin_posts"
# Falls back to the placeholder when the secret lookup returns a falsy value.
openai_api_key = naas.secret.get("OPENAI_API_KEY") or "YOUR_OPENAI_API_KEY"

# Outputs
# Same fallback rule as above for the spreadsheet URL.
spreadsheet_url = naas.secret.get("ABI_SPREADSHEET") or "YOUR_GOOGLE_SPREADSHEET_URL"
sheet_name = "CONTENT"
# The consolidated table is written to a folder named after today's date as well.
output_dir = os.path.join(
    naas_data_product.OUTPUTS_PATH,
    "content-engine",
    str(date.today()),
)
file_content = "content"

## Model

### Get content

In [None]:
# Read the current state of the sheet; anything that is not a DataFrame
# is replaced by an empty DataFrame so the rest of the notebook can rely on the type.
df_init = gsheet.connect(spreadsheet_url).get(sheet_name=sheet_name)
df_init = df_init if isinstance(df_init, pd.DataFrame) else pd.DataFrame()
print("- Content db (init):", len(df_init))

### Get posts

In [None]:
# Load the posts saved under `input_dir` / `file_name`.
# NOTE(review): `pload` is not defined in the visible cells - presumably provided by the environment.
df_posts = pload(input_dir, file_name)
print("- New content published:", len(df_posts))
df_posts.iloc[: len(df_posts)]

### Cleaning data

In [None]:
def _to_timezone_string(series, target_tz, source_tz="Europe/Paris"):
    """Re-express timestamp strings in ``target_tz``.

    Only the first 19 characters (``YYYY-MM-DD HH:MM:SS``) of each value are
    parsed. They are interpreted as wall-clock time in ``source_tz``, converted
    to ``target_tz`` and rendered as ``%Y-%m-%d %H:%M:%S%z`` strings.
    """
    naive = pd.to_datetime(series.str[:19], format="%Y-%m-%d %H:%M:%S")
    # pandas accepts a timezone name directly, so no pytz object is required here.
    return (
        naive.dt.tz_localize(source_tz)
        .dt.tz_convert(target_tz)
        .dt.strftime("%Y-%m-%d %H:%M:%S%z")
    )


def create_db(
    df_new,
    df_init,
    timezone=None,
):
    """Merge newly published posts into the existing content database.

    Parameters
    ----------
    df_new : pandas.DataFrame
        New posts. When it has rows it must provide the columns
        PUBLISHED_DATE, DATE_EXTRACT, TITLE, CONTENT, TEXT, POST_URL,
        AUTHOR_NAME, CHARACTER_COUNT, TAGS, CONTENT_URL, VIEWS, LIKES,
        COMMENTS, SHARES and ENGAGEMENT_SCORE. An empty frame is skipped.
    df_init : pandas.DataFrame
        Existing database. An empty frame (e.g. a blank sheet) is skipped;
        a non-empty frame must contain every database column, otherwise a
        ``KeyError`` is raised rather than silently losing data.
    timezone : str or tzinfo, optional
        Target timezone for PUBLISHED_DATE / DATE_EXTRACT. Defaults to the
        notebook-level ``TIMEZONE``.
        NOTE(review): ``TIMEZONE`` is not defined in the visible cells -
        presumably provided by the environment; confirm.

    Returns
    -------
    pandas.DataFrame
        One row per CONTENT_URL (new posts win over existing rows), sorted by
        PUBLISHED_DATE descending, with SCENARIO, SOURCE, DATE and TIME
        columns inserted and a fresh 0..n-1 index.
    """
    # Columns kept in the database, in output order (before the inserts below).
    to_select = [
        "ENTITY",
        "PUBLISHED_DATE",
        "TITLE",
        "CONTENT",
        "CONTENT_LENGTH",
        "KEYWORDS",
        "VIEWS",
        "LIKES",
        "COMMENTS",
        "SHARES",
        "ENGAGEMENT_SCORE",
        "CONTENT_URL",
        "DATE_EXTRACT",
    ]
    frames = []

    if df_new is not None and len(df_new) > 0:
        target_tz = TIMEZONE if timezone is None else timezone
        df = df_new.copy()

        # Format dates: source strings are read as Europe/Paris wall-clock time.
        df["PUBLISHED_DATE"] = _to_timezone_string(df["PUBLISHED_DATE"], target_tz)
        df["DATE_EXTRACT"] = _to_timezone_string(df["DATE_EXTRACT"], target_tz)

        # Cleaning: a native video without title is labelled "Live",
        # and every "Live" post gets "Live" as its text too.
        untitled_video = (df["TITLE"].astype(str) == "None") & (df["CONTENT"] == "Video (native)")
        df.loc[untitled_video, "TITLE"] = "Live"
        df.loc[df["TITLE"].astype(str) == "Live", "TEXT"] = "Live"

        # Cleaning: the original CONTENT_URL / CONTENT columns are dropped first
        # because POST_URL / TEXT take over those names.
        to_rename = {
            "POST_URL": "CONTENT_URL",
            "AUTHOR_NAME": "ENTITY",
            "TEXT": "CONTENT",
            "CHARACTER_COUNT": "CONTENT_LENGTH",
            "TAGS": "KEYWORDS",
        }
        df = df.drop(["CONTENT_URL", "CONTENT"], axis=1).rename(columns=to_rename)
        df["KEYWORDS"] = df["KEYWORDS"].astype(str).str.replace("None", "NA", regex=False)
        frames.append(df[to_select])

    # An empty existing database has no columns to select from, so it is skipped
    # instead of raising KeyError.
    if df_init is not None and len(df_init) > 0:
        frames.append(df_init[to_select])

    # New posts come first so that keep="first" prefers them over existing rows.
    if frames:
        df = pd.concat(frames, axis=0)
    else:
        df = pd.DataFrame(columns=to_select)
    df = df.drop_duplicates("CONTENT_URL", keep="first")
    df = df.sort_values(by="PUBLISHED_DATE", ascending=False)

    # Derived columns are all computed from one parse of the publication date.
    published = pd.to_datetime(df["PUBLISHED_DATE"].str[:19], format="%Y-%m-%d %H:%M:%S")
    df.insert(loc=1, column="SCENARIO", value=published.dt.strftime("W%W-%Y"))
    df.insert(loc=2, column="SOURCE", value="LinkedIn")
    df.insert(loc=4, column="DATE", value=published.dt.strftime("%a. %d %b."))
    df.insert(loc=5, column="TIME", value=published.dt.strftime("%HH%M"))
    return df.reset_index(drop=True)
    
# Merge the new posts into the existing database, then preview as many rows
# from the top of the result as there are new posts.
df_content = create_db(df_new=df_posts, df_init=df_init)
print("- Content db:", len(df_content))
df_content.iloc[: len(df_posts)]

## Output

### Save data

In [None]:
# Save the consolidated content table.
# NOTE(review): `pdump` is not defined in the visible cells - presumably it writes
# `df_content` into `output_dir` under the name `file_content`; confirm against its definition.
pdump(output_dir, df_content, file_content)

### Send "Content" to Google Sheets spreadsheet

In [None]:
# Detect changes: both frames are cast to strings and stacked; drop_duplicates(keep=False)
# removes every row that appears more than once, so only rows present in a single
# frame (new or modified rows) remain.
df_check = pd.concat([df_init.astype(str), df_content.astype(str)]).drop_duplicates(keep=False)
if len(df_check) > 0:
    # The full table is sent with append=False.
    # NOTE(review): presumably this replaces the sheet content - confirm in the naas_drivers docs.
    gsheet.connect(spreadsheet_url).send(sheet_name=sheet_name, data=df_content, append=False)
else:
    print("Nothing to update in Google Sheets!")