<img width="8%" alt="Growth" src="https://naasai-public.s3.eu-west-3.amazonaws.com/abi-demo/growth_marketing.png" style="border-radius: 15%">

# Growth - Create INTERACTIONS database

**Tags:** #growth #googlesheets #gsheet #data #naas_drivers #growth-engine #automation #pickle #linkedin #interactions #comments #likes

**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel/)

**Description:** This notebook updates the INTERACTIONS database with new interactions from likes and comments.

## Input

### Import libraries

In [None]:
from naas_drivers import gsheet
import pandas as pd
import os
from datetime import date
import naas_data_product
from pytz.exceptions import NonExistentTimeError

### Setup variables
**Inputs**
- `entity_index`: Entity index.
- `entity_dir`: Entity directory.
- `input_dir`: Input directory to retrieve file from.
- `file_reactions`: Name of the file with reactions to be retrieved.
- `file_comments`: Name of the file with comments to be retrieved.
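- `days_start`: Number of days to offset from the Monday of the current week to get the earliest extraction date to load (a negative value goes back before Monday). Leave as `None` to load every available file.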

**Outputs**
- `output_dir`: Output directory to save file to.
- `output_file`: Output file name to save as pickle.
- `spreadsheet_url`: Google Sheets spreadsheet URL.
- `sheet_interaction`: Google Sheets sheet name.

In [None]:
# Inputs
# Index of the entity to process; used to build the entity folder path below.
entity_index =  "0"
# Entity directory, loaded with `pload` from <OUTPUTS_PATH>/entities/<entity_index> under the name "entity_dir".
entity_dir = pload(os.path.join(naas_data_product.OUTPUTS_PATH, "entities", entity_index), "entity_dir")
# Today's growth-engine folder (folder name is today's ISO date, YYYY-MM-DD).
input_dir = os.path.join(entity_dir, "growth-engine", date.today().isoformat())
# Base names (without extension) of the pickle files collected by get_reactions / get_comments.
file_reactions = "linkedin_post_reactions"
file_comments = "linkedin_post_comments"
# None: load from the oldest available extraction folder; int: start relative to the current week.
days_start = None
# Token handed to the chat completion helper used for sentiment extraction.
api_key = naas.secret.get('NAAS_API_TOKEN')
# Name of the sheet holding the posts (read to give context to comments).
sheet_posts = "POSTS"

# Outputs
# Same dated folder as `input_dir`; results are written next to today's inputs.
output_dir = os.path.join(entity_dir, "growth-engine", date.today().isoformat())
output_file = "interactions"
# Spreadsheet URL, loaded with `pload` from the entity folder under the name "abi_spreadsheet".
spreadsheet_url = pload(os.path.join(naas_data_product.OUTPUTS_PATH, "entities", entity_index), "abi_spreadsheet")
sheet_interaction = "INTERACTIONS"

## Model

### Get interactions

In [None]:
# Read the current INTERACTIONS sheet; anything that is not a DataFrame
# is replaced by an empty frame so the rest of the notebook can run.
interactions_sheet = gsheet.connect(spreadsheet_url).get(sheet_name=sheet_interaction)
df_init = interactions_sheet if isinstance(interactions_sheet, pd.DataFrame) else pd.DataFrame()
print("🗂️ Interactions (init):", len(df_init))
df_init.head(1)

### Get posts

In [None]:
# Read the POSTS sheet; anything that is not a DataFrame is replaced
# by an empty frame.
posts_sheet = gsheet.connect(spreadsheet_url).get(sheet_name=sheet_posts)
df_posts = posts_sheet if isinstance(posts_sheet, pd.DataFrame) else pd.DataFrame()
print("- Posts db (init):", len(df_posts))
df_posts.head(1)

### Get reactions
LinkedIn does not give us the precise date of a reaction. By default, a reaction is therefore dated with the publication date of the post it belongs to. However, because the database is refreshed daily, new reactions can be detected from one extraction to the next; in that case the extraction date is used as the reaction date, which lets us track when these interactions occurred.

In [None]:
def get_reactions(
    entity_dir,
    file_name,
    days_start=None
):
    """Collect the reaction extracts of an entity and date each reaction.

    Every `<entity_dir>/growth-engine/**/<file_name>.pickle` file is visited
    from the most recent dated folder to the oldest one; the walk stops at the
    first folder older than the limit date. Folder names must be `YYYY-MM-DD`.

    For each loaded file, rows whose `POST_URL` has not been met in a
    previously visited file get `DATE_REACTION = PUBLISHED_DATE`; rows whose
    `POST_URL` was already met get the file's `DATE_EXTRACT` (localized to
    Europe/Paris, converted to `TIMEZONE`, formatted as a string). The date is
    assigned row by row, so one post never overwrites the date of the other
    posts stored in the same file.

    Args:
        entity_dir: Entity directory containing the "growth-engine" folder.
        file_name: Name of the pickle files to load, without extension.
        days_start: When an int, the limit date is the Monday of the current
            week shifted by `days_start` days. Otherwise the limit date is the
            date of the oldest folder found (today if there is no file).

    Returns:
        DataFrame with one row per (`PROFILE_URL`, `POST_URL`) pair, first
        occurrence kept, index reset. Empty when no file qualifies.
    """
    def _folder_date(path):
        # Date encoded in the name of the folder holding the file. Only the
        # calendar date is needed, so no timezone has to be attached.
        return datetime.strptime(os.path.basename(os.path.dirname(path)), "%Y-%m-%d").date()

    # Init
    pattern = os.path.join(entity_dir, "growth-engine", "**", f"{file_name}.pickle")
    files = sorted(glob.glob(pattern, recursive=True), reverse=True)  # Most recent folder first
    print(f"📁 Files: {len(files)}")

    # Determine limit date
    date_limit = datetime.now().date()
    if len(files) > 0:
        date_limit = _folder_date(files[-1])  # Oldest folder
    if isinstance(days_start, int):
        now = datetime.now(TIMEZONE)
        # Monday of the current week, shifted by days_start days
        date_limit = (now - timedelta(days=now.weekday() - days_start)).date()
    print(f"⚠️ Limit Date: {date_limit}")

    # Loop in files
    seen_posts = set()
    frames = []
    for index, file in enumerate(files):
        if _folder_date(file) < date_limit:
            break

        print(f"{index+1}- File: {file}")
        input_dir_r = file.split(file_name)[0]
        tmp_df = pload(input_dir_r, file_name)
        if tmp_df is None or "POST_URL" not in tmp_df.columns:
            continue

        # Rows of posts already met in a more recent file are dated with the
        # extraction date; all other rows keep the publication date.
        already_seen = tmp_df["POST_URL"].isin(seen_posts)
        date_reaction = tmp_df["PUBLISHED_DATE"].astype(object)
        if already_seen.any():
            extract_dates = (
                pd.to_datetime(tmp_df.loc[already_seen, "DATE_EXTRACT"], format='%Y-%m-%d %H:%M:%S')
                .dt.tz_localize(pytz.timezone("Europe/Paris"))
                .dt.tz_convert(TIMEZONE)
                .dt.strftime("%Y-%m-%d %H:%M:%S%z")
            )
            # Positional assignment: safe even if the index has duplicates
            date_reaction[already_seen] = extract_dates.to_numpy()
        tmp_df["DATE_REACTION"] = date_reaction
        seen_posts.update(tmp_df["POST_URL"].unique().tolist())
        frames.append(tmp_df)

    df = pd.concat(frames) if frames else pd.DataFrame()
    if len(df) > 0:
        df = df.drop_duplicates(["PROFILE_URL", "POST_URL"], keep="first")
    return df.reset_index(drop=True)

# Load every reaction extract of the entity (see get_reactions for the date rules)
df_reactions = get_reactions(
    entity_dir=entity_dir,
    file_name=file_reactions,
    days_start=days_start,
)
print('👍 Total Reactions:', len(df_reactions))
df_reactions.head(1)

### Get comments

In [None]:
def get_comments(
    entity_dir,
    file_name,
    days_start=None
):
    """Collect the comment extracts of an entity into a single DataFrame.

    Files matching `<entity_dir>/growth-engine/**/<file_name>.pickle` are
    visited from the most recent dated folder to the oldest one, stopping at
    the first folder older than the limit date. The limit date is the date of
    the oldest folder found (today when there is no file), unless `days_start`
    is an int, in which case it is derived from the current week.

    Returns the concatenated extracts with one row per
    (`PROFILE_URL`, `POST_URL`) pair, first occurrence kept, index reset.
    """
    def folder_date(path):
        # The parent folder of each file is named YYYY-MM-DD
        parsed = datetime.strptime(path.split("/")[-2], "%Y-%m-%d")
        return parsed.replace(tzinfo=pytz.timezone('Europe/Paris')).date()

    # Comment files, most recent folder first
    search_pattern = os.path.join(entity_dir, "growth-engine", "**", f"{file_name}.pickle")
    files = glob.glob(search_pattern, recursive=True)
    files.sort(reverse=True)
    print(f"📁 Files: {len(files)}")

    # Oldest date to load
    date_limit = folder_date(files[-1]) if len(files) > 0 else datetime.now().date()
    if isinstance(days_start, int):
        weekday_offset = datetime.now(TIMEZONE).weekday() - days_start
        date_limit = (datetime.now(TIMEZONE) - timedelta(days=weekday_offset)).date()
    print(f"⚠️ Limit Date: {date_limit}")

    # Stack the extracts until a folder older than the limit is reached
    collected = pd.DataFrame()
    for index, file in enumerate(files):
        if folder_date(file) < date_limit:
            break

        print(f"{index+1}- File: {file}")
        extract = pload(file.split(file_name)[0], file_name)
        collected = pd.concat([collected, extract])

    if len(collected) > 0:
        collected = collected.drop_duplicates(["PROFILE_URL", "POST_URL"], keep="first")
    return collected.reset_index(drop=True)

# Load every comment extract of the entity
df_comments = get_comments(
    entity_dir=entity_dir,
    file_name=file_comments,
    days_start=days_start,
)
print('🗨️ Total Comments:', len(df_comments))
df_comments.head(1)

### Cleaning data

In [None]:
def handle_time_error(df_init, column):
    """Shift local times that do not exist in Europe/Paris forward by one hour.

    Each value of `column` is parsed and localized to Europe/Paris. When the
    localization raises `NonExistentTimeError`, the value is replaced by the
    string form of the parsed timestamp plus one hour, so a later
    `tz_localize` on the column does not fail on it. Other values are left
    untouched.

    Args:
        df_init: Source DataFrame; it is never modified.
        column: Name of the column holding naive timestamps.

    Returns:
        A copy of `df_init` with the repaired values. When `column` is missing
        (e.g. an empty frame because no file was loaded) the copy is returned
        as is.
    """
    df = df_init.copy()
    if column not in df.columns:
        return df

    paris = pytz.timezone("Europe/Paris")
    column_position = df.columns.get_loc(column)
    # Positional access keeps this correct whatever the index of the frame is.
    # NOTE(review): ambiguous times are not handled here and would still raise
    # when the column is localized later - confirm whether that can occur.
    for row_position, value in enumerate(df[column].tolist()):
        try:
            pd.to_datetime(value).tz_localize(paris)
        except NonExistentTimeError:
            df.iat[row_position, column_position] = str(pd.to_datetime(value) + pd.DateOffset(hours=1))
    return df

def _content_id_from_url(url):
    # Hash of the activity number found between ":activity:" and the next "/" of the URL
    return create_sha_256_hash(str(url.split(":activity:")[1].split("/")[0]))


def _format_extract_date(dates):
    # Naive extraction timestamps -> Europe/Paris strings with UTC offset
    return pd.to_datetime(dates).dt.tz_localize(pytz.timezone("Europe/Paris")).dt.strftime("%Y-%m-%d %H:%M:%S%z")


def _build_reaction_rows(df_reactions):
    # Map a non-empty reactions extract to the INTERACTIONS schema
    data_reaction = {
        "ENTITY": df_reactions["ENTITY"],
        "SCENARIO": df_reactions["SCENARIO"],
        "SOURCE": "LinkedIn",
        "INTERACTION_DATE": df_reactions["DATE_REACTION"],
        "TYPE": "POST_REACTION",
        "CONTENT": df_reactions["REACTION_TYPE"],
        "SENTIMENT": "NA",
        "SCORE": 1,
        "COMMENT_LANGUAGE": "NA",
        "COMMENT_COMMENTS_COUNT": 0,
        "COMMENT_LIKES_COUNT": 0,
        "PROFILE_ID": df_reactions["PROFILE_URL"].apply(get_linkedin_id_from_url),
        "FIRSTNAME": df_reactions["FIRSTNAME"],
        "LASTNAME": df_reactions["LASTNAME"],
        "FULLNAME": df_reactions["FULLNAME"],
        "OCCUPATION": df_reactions["OCCUPATION"],
        "PROFILE_URL": df_reactions["PROFILE_URL"],
        "PUBLIC_ID": df_reactions["PUBLIC_ID"],
        "CONTENT_TITLE": df_reactions["TITLE"],
        "CONTENT_URL": df_reactions["POST_URL"],
        "CONTENT_ID": df_reactions["POST_URL"].apply(_content_id_from_url),
        "PUBLISHED_DATE": df_reactions['PUBLISHED_DATE'],
        "DATE_EXTRACT": _format_extract_date(df_reactions['DATE_EXTRACT']),
    }
    return pd.DataFrame(data_reaction)


def _build_comment_rows(df_comments):
    # Map a non-empty comments extract to the INTERACTIONS schema
    data_comment = {
        "ENTITY": df_comments["ENTITY"],
        "SCENARIO": df_comments["SCENARIO"],
        "SOURCE": "LinkedIn",
        "INTERACTION_DATE": pd.to_datetime(df_comments['CREATED_TIME']).dt.tz_localize(pytz.timezone("Europe/Paris")).dt.tz_convert(TIMEZONE).dt.strftime("%Y-%m-%d %H:%M:%S%z"),
        "TYPE": "POST_COMMENT",
        "CONTENT": df_comments["TEXT"],
        "SENTIMENT": "TBD",
        "SCORE": 3,
        "COMMENT_COMMENTS_COUNT": df_comments["COMMENTS"],
        "COMMENT_LIKES_COUNT": df_comments["LIKES"],
        "COMMENT_LANGUAGE": df_comments["LANGUAGE"],
        "PROFILE_ID": df_comments["PROFILE_URL"].apply(get_linkedin_id_from_url),
        "FIRSTNAME": df_comments["FIRSTNAME"],
        "LASTNAME": df_comments["LASTNAME"],
        "FULLNAME": df_comments["FULLNAME"],
        "OCCUPATION": df_comments["OCCUPATION"],
        "PROFILE_URL": df_comments["PROFILE_URL"],
        "PUBLIC_ID": df_comments["PUBLIC_ID"],
        "CONTENT_TITLE": df_comments["TITLE"],
        # NOTE(review): reactions take this value from POST_URL while comments
        # read a CONTENT_URL column - confirm the comment extract provides it.
        "CONTENT_URL": df_comments["CONTENT_URL"],
        "CONTENT_ID": df_comments["POST_URL"].apply(_content_id_from_url),
        "PUBLISHED_DATE": df_comments['PUBLISHED_DATE'],
        "DATE_EXTRACT": _format_extract_date(df_comments['DATE_EXTRACT']),
    }
    return pd.DataFrame(data_comment)


def _upgrade_legacy_sheet(df_gsheet):
    # Bring a non-empty sheet saved by abi < 1.14.0 to the current schema (works on a copy)
    sheet = df_gsheet.copy()
    sheet["CONTENT_ID"] = sheet["CONTENT_URL"].apply(_content_id_from_url)
    to_rename = {
        "DATE_INTERACTION": "INTERACTION_DATE",
        "INTERACTION": "TYPE",
        "INTERACTION_SCORE": "SCORE",
        "INTERACTION_CONTENT": "CONTENT",
        "COMMENT_SENTIMENT": "SENTIMENT",
    }
    sheet = sheet.rename(columns=to_rename)

    # Columns missing from older sheets are added with neutral defaults
    for column, default in (
        ("COMMENT_COMMENTS_COUNT", 0),
        ("COMMENT_LIKES_COUNT", 0),
        ("COMMENT_LANGUAGE", "NA"),
    ):
        if column not in sheet.columns:
            sheet[column] = default
    if "SENTIMENT" not in sheet.columns:
        sheet["SENTIMENT"] = "NA"
        # Comments still have to be analysed
        sheet.loc[sheet["TYPE"] == "POST_COMMENT", "SENTIMENT"] = "TBD"
    if "PROFILE_ID" not in sheet.columns:
        sheet["PROFILE_ID"] = (
            sheet["PROFILE_URL"].apply(get_linkedin_id_from_url).astype(str).replace("None", "NA")
        )

    sheet["ID"] = sheet.apply(
        lambda row: create_sha_256_hash(
            str(row["INTERACTION_DATE"]) + str(row["PROFILE_ID"]) + str(row["CONTENT_ID"]) + str(row["CONTENT"])
        ),
        axis=1,
    )
    return sheet


def create_interactions_dataset(
    df_gsheet,
    df_reactions,
    df_comments,
    output_dir,
):
    """Build the INTERACTIONS dataset from new extracts and the existing sheet.

    Reactions and comments are mapped to a common schema, given a `DATE` label
    and an `ID` (hash of interaction date, profile id, content id and
    content), then stacked on top of the rows already stored in the sheet
    (upgraded to the current schema when they come from abi < 1.14.0).
    `SENTIMENT` is then refreshed from the mapping returned by
    `get_dict_from_df` ("TBD" when an ID has no entry), duplicated IDs are
    dropped (first occurrence kept) and rows are sorted by `INTERACTION_DATE`
    (descending) then `FULLNAME`.

    Args:
        df_gsheet: Rows currently stored in the sheet; may be empty. Not
            modified: the schema upgrade works on a copy.
        df_reactions: Output of `get_reactions`; may be empty.
        df_comments: Output of `get_comments`; may be empty.
        output_dir: Directory handed to `get_dict_from_df`.

    Returns:
        The merged DataFrame with a fresh index; empty when every input is
        empty.
    """
    # New interactions (each source is optional)
    frames = []
    if len(df_reactions) > 0:
        frames.append(_build_reaction_rows(df_reactions))
    if len(df_comments) > 0:
        # Non-existent local times must be repaired before localization;
        # done only here because an empty frame has no CREATED_TIME column.
        frames.append(_build_comment_rows(handle_time_error(df_comments, "CREATED_TIME")))

    df = pd.concat(frames).reset_index(drop=True) if frames else pd.DataFrame()
    if len(df) > 0:
        # Add date
        df.insert(loc=4, column="DATE", value=pd.to_datetime(df['INTERACTION_DATE'].str[:19], format="%Y-%m-%d %H:%M:%S").dt.strftime("%a. %d %b."))
        # Same str() coercion as for the sheet rows so both sides hash alike
        df.insert(
            loc=5,
            column="ID",
            value=df.apply(
                lambda row: create_sha_256_hash(
                    str(row["INTERACTION_DATE"]) + str(row["PROFILE_ID"]) + str(row["CONTENT_ID"]) + str(row["CONTENT"])
                ),
                axis=1,
            ),
        )

    # Histo abi version < 1.14.0 (nothing to upgrade on an empty sheet)
    if len(df_gsheet) > 0:
        df = pd.concat([df, _upgrade_legacy_sheet(df_gsheet)])

    if len(df) > 0:
        # Update sentiment
        sentiment = get_dict_from_df(df, "SENTIMENT", "ID", "comment_sentiment", output_dir)
        df["SENTIMENT"] = df["ID"].map(sentiment).fillna("TBD")

        # Drop duplicates
        df = df.drop_duplicates(["ID"]).reset_index(drop=True)

        # Sort values
        df = df.sort_values(by=["INTERACTION_DATE", "FULLNAME"], ascending=[False, True])
    return df.reset_index(drop=True)

# Merge the new reactions/comments with the rows already stored in the sheet
db_interactions = create_interactions_dataset(
    df_gsheet=df_init,
    df_reactions=df_reactions,
    df_comments=df_comments,
    output_dir=output_dir,
)
print('🗂️ Interactions:', len(db_interactions))
db_interactions.head(3)

### Enrich Interactions with comments "Sentiment"

In [None]:
def enrich_content(
    df_init,
    df_posts,
    api_key,
    output_dir,
):
    """Fill the `SENTIMENT` of interactions still marked "TBD".

    For each row whose `SENTIMENT` is "TBD", a prompt containing the post
    text and the comment is sent to `create_naas_chat_completion`. The JSON
    answer is saved with `pdump` as `kgd_comment_<ID>` and its entities are
    written to `SENTIMENT` as "name: summary" items joined by "|". When the
    call fails or returns no entity, `SENTIMENT` is set to "NA".

    Args:
        df_init: Interactions DataFrame; it is not modified.
        df_posts: Posts DataFrame with `ID` and `TEXT` columns. When a post
            cannot be found, the comment is analysed without post context
            instead of stopping the whole run.
        api_key: Token passed to `create_naas_chat_completion`.
        output_dir: Directory where raw answers are saved.

    Returns:
        A copy of `df_init` with updated `SENTIMENT`, index reset.
    """
    # Init
    df = df_init.copy()

    # Filter data
    filter_df = df[
        (df["SENTIMENT"].isin(["TBD"]))
    ]
    print("-> Comment to be updated:", len(filter_df))

    # NOTE(review): the returned mapping is not used here; the call is kept
    # in case the helper has side effects - confirm in get_dict_from_df.
    get_dict_from_df(df, "SENTIMENT", "ID", "comment_sentiment", output_dir)

    # Post text by post ID (first occurrence wins); empty when POSTS could not be read
    posts_text = {}
    if "ID" in df_posts.columns and "TEXT" in df_posts.columns:
        posts_text = df_posts.drop_duplicates("ID").set_index("ID")["TEXT"].to_dict()

    sentiment_definition = """
    Sentiment represents the emotional tone or attitude expressed in a content or in a piece of content to understand the feelings or opinions towards a particular subject.
    It could be:
    - "Praise": Highly positive that expresses admiration or approval. This sentiment often includes compliments or positive feedback.
    - "Supportive": Positive that may not necessarily contain high praise but show agreement, support, or encouragement.
    - "Neutral": Neither positive nor negative, often factual statements or questions without any clear positive or negative connotations.
    - "Constructive": May seem negative but are intended to provide constructive feedback or suggest improvements.
    - "Disapproving": Express disagreement, criticism, or negative feedback.
    Identify as many as possible from the list above
    """

    system_msg = "You are a helpful IT-project and account management expert who extracts information from documents. In this case from comments made in LinkedIn post"
    content_prompt = """
    From the COMMENT below, extract the "Sentiment" entity. Use the POST text to get more context about the comment.
    0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
    1. Look for Sentiment "Praise", "Supportive", "Neutral", "Constructive", "Disapproving" in the COMMENT and generate as comma-separated format similar to entity type.
        Do not create new entity types that aren't mentioned below.
        Entity Types:
        label:'Sentiment',name:string;summary:string //[sentiment_definition]

    2. The output should look like :
    {
        "entities": [{"label":"Sentiment","name":string,"summary":string}],
    }
    POST:
    [post]
    
    COMMENT:
    [comment]
    """
    content_prompt = content_prompt.replace("[sentiment_definition]", sentiment_definition)

    # Loop on comments to analyse
    for row in filter_df.itertuples():
        # Init values
        index = row.Index
        uid = row.ID
        content_id = row.CONTENT_ID
        content_title = row.CONTENT_TITLE
        people = row.FULLNAME

        # Prompt values must be strings; missing values become empty text
        post = posts_text.get(content_id)
        if post is None or pd.isna(post):
            print(f"⚠️ Post '{content_id}' not found, comment analysed without post context")
            post = ""
        post = str(post)
        comment = "" if pd.isna(row.CONTENT) else str(row.CONTENT)

        # Replace value in prompt
        prompt_msg = content_prompt
        prompt_msg = prompt_msg.replace("[post]", post)
        prompt_msg = prompt_msg.replace("[comment]", comment)

        # Function to call the Naas Chat API
        print(f"🤖 Extracting Sentiment '{comment}' made on  '{content_title}' by '{people}'")
        sentiment = []
        try:
            result = create_naas_chat_completion(
                api_key,
                prompt=system_msg,
                message=prompt_msg,
            )
            res_json = json.loads(result)
            pdump(output_dir, res_json, f"kgd_comment_{uid}")
            entities = res_json.get("entities") or []
            for entity in entities:
                label = entity.get("label")
                name = entity.get("name")
                summary = entity.get("summary")
                print(f'- {label}\n{name}: {summary}')
                sentiment.append(f"{name}: {summary}")
        except Exception as error:
            # Best effort: a failed call must not stop the other comments
            print(error)
        df.loc[index, "SENTIMENT"] = "|".join(sentiment) if len(sentiment) > 0 else "NA"
        print()
    return df.reset_index(drop=True)
    
# Ask the chat API for the sentiment of every comment still marked "TBD"
df_interactions = enrich_content(
    df_init=db_interactions,
    df_posts=df_posts,
    api_key=api_key,
    output_dir=output_dir,
)
df_interactions.head(1)

## Output

### Save data

In [None]:
# Save the enriched interactions in `output_dir` under the name `output_file` (project helper `pdump`).
pdump(output_dir, df_interactions, output_file)

### Send data to Google Sheets spreadsheet

In [None]:
# Send the result to the INTERACTIONS sheet. `df_init` (the sheet as read at the start of the notebook)
# is passed along, presumably so the helper can detect whether anything changed - confirm in `send_data_to_gsheet`.
send_data_to_gsheet(df_interactions, df_init, spreadsheet_url, sheet_interaction)