<img width="8%" alt="Google Sheets.png" src="https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/.github/assets/logos/Google%20Sheets.png" style="border-radius: 15%">

# Google Sheets - Update interaction database

**Tags:** #googlesheets #gsheet #data #naas_drivers #growth-engine #automation #pickle #linkedin #interactions #comments #likes

**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel/)

**Description:** This notebook updates the interaction database with new interactions from likes and comments.

## Input

### Import libraries

In [None]:
from naas_drivers import gsheet
import pandas as pd
import os
from datetime import date
import naas_data_product

### Setup variables
**Inputs**
- `input_dir`: Input directory to retrieve file from.
- `file_reactions`: Name of the file with reactions to be retrieved.
- `file_comments`: Name of the file with comments to be retrieved.
- `datalake_dir`: Root directory searched recursively for reaction files.

**Outputs**
- `output_dir`: Output directory to save file to.
- `output_file`: Output file name to save as pickle.
- `spreadsheet_url`: Google Sheets spreadsheet URL.
- `sheet_name`: Google Sheets sheet name.

In [None]:
# Inputs
# Root of the data lake; the dated growth-engine folders are built from it.
datalake_dir = naas_data_product.OUTPUTS_PATH
input_dir = os.path.join(datalake_dir, "growth-engine", date.today().isoformat())
file_reactions = "linkedin_post_reactions"
file_comments = "linkedin_post_comments"

# Outputs
# Results are written to today's growth-engine folder.
output_dir = os.path.join(datalake_dir, "growth-engine", date.today().isoformat())
output_file = "linkedin_interactions"
sheet_name = "INTERACTIONS"
spreadsheet_url = naas.secret.get("ABI_SPREADSHEET")

## Model

### Get interactions

In [None]:
# Read the current content of the interactions sheet.
sheet_content = gsheet.connect(spreadsheet_url).get(sheet_name=sheet_name)

# Anything that is not a DataFrame is replaced by an empty frame so the
# rest of the notebook can always treat df_init as tabular data.
df_init = sheet_content if isinstance(sheet_content, pd.DataFrame) else pd.DataFrame()
print("🗂️ Interactions (init):", len(df_init))

### Get reactions
We cannot get the precise date of a reaction. Therefore, our approach is to initially assign the content's publication date as the reaction date. However, since we update our database daily, we can also capture new interactions daily. In such cases, we assign the extraction date as the reaction date, allowing us to track and record the timing of these interactions more accurately.

In [None]:
# Get reaction files
reactions_files = sorted(
    glob.glob(
        os.path.join(datalake_dir, "growth-engine", "**", f"{file_reactions}.pickle"),
        recursive=True,
    )
)

# Oldest extraction folder kept: the Monday of the previous week
now = datetime.now(TIMEZONE)
limit = (now - timedelta(days=now.weekday() + 7)).date()

reaction_date_format = "%Y-%m-%d %H:%M:%S%z"
reaction_frames = []
posts_url = []  # posts already met in an earlier (older) extraction file
for file in reactions_files:
    input_dir_r = file.split(file_reactions)[0]
    # The parent folder of each file is named after its extraction date
    date_dir = datetime.strptime(os.path.basename(os.path.dirname(file)), "%Y-%m-%d").date()
    if date_dir < limit:
        continue
    tmp_df_reactions = pload(input_dir_r, file_reactions)
    if tmp_df_reactions is None or len(tmp_df_reactions) == 0:
        continue

    # Work on a copy with a clean positional index so the masks below align
    tmp_df_reactions = tmp_df_reactions.reset_index(drop=True)

    # The reaction date is decided per post, not per file: rows of a post seen
    # for the first time get the published date, rows of a post already seen in
    # an older file get the extraction date.
    is_new_post = ~tmp_df_reactions["POST_URL"].isin(posts_url)
    date_reaction = pd.Series(index=tmp_df_reactions.index, dtype=object)

    if is_new_post.any():
        published = tmp_df_reactions.loc[is_new_post, "PUBLISHED_DATE"]
        # Histo: folders before 2024-05-01 need PUBLISHED_DATE converted to TIMEZONE
        if date_dir < date(2024, 5, 1):
            published = (
                pd.to_datetime(published, format=reaction_date_format)
                .dt.tz_convert(TIMEZONE)
                .dt.strftime(reaction_date_format)
            )
        date_reaction[is_new_post] = published

    if (~is_new_post).any():
        extracted = (
            pd.to_datetime(tmp_df_reactions.loc[~is_new_post, "DATE_EXTRACT"], format="%Y-%m-%d %H:%M:%S")
            .dt.tz_localize(pytz.timezone("Europe/Paris"))
            .dt.tz_convert(TIMEZONE)
            .dt.strftime(reaction_date_format)
        )
        date_reaction[~is_new_post] = extracted

    tmp_df_reactions["DATE_REACTION"] = date_reaction
    posts_url.extend(tmp_df_reactions.loc[is_new_post, "POST_URL"].unique().tolist())
    reaction_frames.append(tmp_df_reactions)

# Concatenate once, after the loop, instead of growing a frame on each iteration
df_reactions = pd.concat(reaction_frames) if reaction_frames else pd.DataFrame()

if len(df_reactions) > 0:
    # Files are processed oldest first, so keep="first" retains the earliest
    # sighting of each (profile, post) pair.
    df_reactions = df_reactions.drop_duplicates(["PROFILE_URL", "POST_URL"], keep="first").reset_index(drop=True)

print('👍 Total Reactions:', len(df_reactions))
df_reactions.head(1)

### Get comments

In [None]:
df_comments = pload(input_dir, file_comments)
# The reactions cell checks pload's result for None, so the same case is
# handled here: fall back to an empty frame instead of failing on len().
if not isinstance(df_comments, pd.DataFrame):
    df_comments = pd.DataFrame()
print('🗨️ Total Comments:', len(df_comments))
df_comments.head(1)

### Cleaning data

In [None]:
def create_interactions_dataset(
    df_gsheet,
    df_reactions,
    df_comments,
):
    """Merge new reactions and comments with the interactions already stored.

    Reactions and comments are mapped to a common set of columns, rows whose
    FULLNAME equals the ENTITY of the first row are removed, the result is
    combined with ``df_gsheet`` and de-duplicated, then sorted from the most
    recent interaction to the oldest.

    Relies on the module-level names ``pytz`` and ``TIMEZONE``.

    Args:
        df_gsheet: Interactions already stored (may be an empty DataFrame).
        df_reactions: Post reactions; must provide the columns read below,
            including "DATE_REACTION" (may be an empty DataFrame).
        df_comments: Post comments; must provide the columns read below
            (may be an empty DataFrame).

    Returns:
        A DataFrame with a fresh positional index. When there is nothing at
        all to process, an empty DataFrame is returned.
    """
    datetime_format = "%Y-%m-%d %H:%M:%S%z"
    frames = []

    if len(df_reactions) > 0:
        # Df reactions
        frames.append(
            pd.DataFrame(
                {
                    "ENTITY": df_reactions["ENTITY"],
                    "SCENARIO": df_reactions["SCENARIO"],
                    "PLATFORM": "LinkedIn",
                    "FIRSTNAME": df_reactions["FIRSTNAME"],
                    "LASTNAME": df_reactions["LASTNAME"],
                    "FULLNAME": df_reactions["FULLNAME"],
                    "OCCUPATION": df_reactions["OCCUPATION"],
                    "INTERACTION": "POST_REACTION",
                    "INTERACTION_CONTENT": df_reactions["REACTION_TYPE"],
                    "INTERACTION_SCORE": 1,
                    "PROFILE_URL": df_reactions["PROFILE_URL"],
                    "PUBLIC_ID": df_reactions["PUBLIC_ID"],
                    "CONTENT_TITLE": df_reactions["TITLE"],
                    "CONTENT_URL": df_reactions["POST_URL"],
                    "PUBLISHED_DATE": df_reactions['PUBLISHED_DATE'],
                    "DATE_INTERACTION": df_reactions["DATE_REACTION"],
                    "DATE_EXTRACT": pd.to_datetime(df_reactions['DATE_EXTRACT']).dt.tz_localize(pytz.timezone("Europe/Paris")).dt.strftime(datetime_format),
                }
            )
        )

    if len(df_comments) > 0:
        # Df comments
        frames.append(
            pd.DataFrame(
                {
                    "ENTITY": df_comments["ENTITY"],
                    "SCENARIO": df_comments["SCENARIO"],
                    "PLATFORM": "LinkedIn",
                    "FIRSTNAME": df_comments["FIRSTNAME"],
                    "LASTNAME": df_comments["LASTNAME"],
                    "FULLNAME": df_comments["FULLNAME"],
                    "OCCUPATION": df_comments["OCCUPATION"],
                    "INTERACTION": "POST_COMMENT",
                    "INTERACTION_CONTENT": df_comments["TEXT"],
                    "INTERACTION_SCORE": 3,
                    "PROFILE_URL": df_comments["PROFILE_URL"],
                    "PUBLIC_ID": df_comments["PUBLIC_ID"],
                    "CONTENT_TITLE": df_comments["TITLE"],
                    "CONTENT_URL": df_comments["CONTENT_URL"],
                    "PUBLISHED_DATE": df_comments['PUBLISHED_DATE'],
                    "DATE_INTERACTION": pd.to_datetime(df_comments['CREATED_TIME']).dt.tz_localize(pytz.timezone("Europe/Paris")).dt.tz_convert(TIMEZONE).dt.strftime(datetime_format),
                    "DATE_EXTRACT": pd.to_datetime(df_comments['DATE_EXTRACT']).dt.tz_localize(pytz.timezone("Europe/Paris")).dt.strftime(datetime_format),
                }
            )
        )

    if frames:
        # Concat df
        df = pd.concat(frames).reset_index(drop=True)
        df.insert(
            loc=2,
            column="DATE",
            value=pd.to_datetime(df['DATE_INTERACTION'], format=datetime_format).dt.strftime("%a. %d %b."),
        )

        # Exclude Entity from Full name (the entity is read from the first row)
        entity = df.loc[0, "ENTITY"]
        df = df[df["FULLNAME"] != entity]
    else:
        # No new reactions or comments: there are no columns to derive DATE
        # from, so only the stored interactions are carried forward.
        df = pd.DataFrame()

    df = pd.concat([df, df_gsheet])
    if len(df) == 0:
        # Nothing new and nothing stored: the column-based steps below would fail
        return df.reset_index(drop=True)

    # Drop duplicates (new rows come first, so they win over stored ones)
    drop_duplicates = [
        "ENTITY",
        "SCENARIO",
        "PROFILE_URL",
        "INTERACTION_CONTENT",
        "CONTENT_URL"
    ]
    df = df.drop_duplicates(drop_duplicates).reset_index(drop=True)
    df["ORDER"] = pd.to_datetime(df['DATE_INTERACTION'], format=datetime_format).dt.strftime("%Y%m%d%H%M%S")
    df["DATE_EXTRACT"] = pd.to_datetime(df['DATE_EXTRACT']).dt.tz_convert(TIMEZONE).dt.strftime(datetime_format)

    # Sort values: most recent interaction first, then alphabetical by name
    df = df.sort_values(by=["ORDER", "FULLNAME"], ascending=[False, True])
    return df.reset_index(drop=True)

# Build the consolidated dataset from the sheet content and today's extracts.
df_interactions = create_interactions_dataset(
    df_gsheet=df_init,
    df_reactions=df_reactions,
    df_comments=df_comments,
)
print('🗂️ Interactions:', len(df_interactions))
df_interactions.head(3)

## Output

### Save data

In [None]:
# Persist the interactions dataset under output_dir using the project helper
# `pdump` (presumably written as a pickle named after output_file; confirm
# against the helper's definition).
pdump(output_dir, df_interactions, output_file)

### Send data to Google Sheets spreadsheet

In [None]:
# Compare both frames as strings: rows left after dropping every duplicated
# row exist in only one of the two frames, which means something changed.
df_check = pd.concat([df_init.astype(str), df_interactions.astype(str)]).drop_duplicates(keep=False)
if len(df_check) == 0:
    print("Nothing to update in Google Sheets!")
else:
    # append=False is passed so the full dataset is sent, not just the new rows
    gsheet.connect(spreadsheet_url).send(sheet_name=sheet_name, data=df_interactions, append=False)