In [1]:
import pandas as pd
import numpy as np

# Prepare Labeling
We're going to collect ~2,500 rows: ~1,250 organization tweets plus one reply for each of them.  
For the organization data, we take the most-replied tweets, and for every organization tweet we pick its most-liked reply.
We're also going to build the labeling datasets step by step.
There will be 5 batches of labeling datasets.

In [3]:
# One CSV per news organization; the same file name is used under both the
# organization folder and the replies folder.
data_csv = ["CNNIndonesia.csv", "detikcom.csv", "tvOneNews.csv", "VIVAcoid.csv"]
NUM_ORGANIZTION = len(data_csv)
NUM_TARGET_DATA = 1250
NUM_STEP = 6
# Tweets taken per organization in each step (rounded up). Derived from
# NUM_TARGET_DATA so that changing the target actually changes the batch size.
NUM_TWEET_ORGANIZATION = int(np.ceil((NUM_TARGET_DATA / NUM_ORGANIZTION) / NUM_STEP))

relative_path_data = "../../data/2. Filtered/"
organization_path = f"{relative_path_data}/organization"
replies_path = f"{relative_path_data}/replies"
relative_path_data_labeling = "../../data/3. Labeling Ready/"

# Organization tweets, most-replied first, with a fresh 0..n-1 index so they can
# be sliced positionally into batches.
organizaton_df = {
    data: (
        pd.read_csv(f"{organization_path}/{data}")
        .sort_values("reply_count", ascending=False)
        .reset_index(drop=True)
    )
    for data in data_csv
}
# Replies, most-liked first, indexed by the conversation they belong to (the
# column is kept as well) so all replies of a tweet can be looked up with .loc.
replies_df = {
    data: (
        pd.read_csv(f"{replies_path}/{data}")
        .sort_values("like_count", ascending=False)
        .set_index("tweet_conversation_id", drop=False)
    )
    for data in data_csv
}

print(f"We're going to get {NUM_TWEET_ORGANIZATION} tweets for every organization and reply for every step")

We're going to get 53 tweets for every organization and reply for every step


In [4]:
# Build Labeling data
# Minimum number of characters a reply must exceed to be preferred. If no reply
# in a conversation exceeds it, fall back to the first (most liked) reply.
MIN_CHAR = 100
columns_removed = ["reply_count", "like_count", "tweet_url"]
LABEL_COLUMN = "labels (Non-Headline 0 / Headline 1)"

# The first step is a smaller sample batch: half of a regular batch per
# organization.
NUM_SAMPLE_TWEET_ORGANIZATION = NUM_TWEET_ORGANIZATION // 2


def _select_reply(all_replies, conversation_id, min_char):
    """Pick the reply to label for one conversation.

    Parameters
    ----------
    all_replies : pd.Series or pd.DataFrame
        Result of ``replies.loc[conversation_id]``: a Series when the
        conversation has exactly one reply, a DataFrame when it has several
        (rows in the order of the replies frame).
    conversation_id
        Id of the conversation, used only in the log message.
    min_char : int
        A reply qualifies when its ``tweet`` text is longer than this.

    Returns
    -------
    pd.Series
        The single reply, or the first reply longer than ``min_char``, or the
        first reply when none qualifies (a message is printed in that case).
    """
    # Only got one reply: nothing to choose from.
    if isinstance(all_replies, pd.Series):
        return all_replies

    # Got multiple replies: take the first one that is long enough.
    for position in range(len(all_replies)):
        candidate = all_replies.iloc[position]
        if len(candidate["tweet"]) > min_char:
            return candidate

    print(f"Conversation id {conversation_id} has no met min char reply")
    return all_replies.iloc[0]


for step in range(1, NUM_STEP + 1):
    if step == 1:
        # Sample batch.
        start_index = 0
        end_index = NUM_SAMPLE_TWEET_ORGANIZATION
    else:
        # NOTE(review): step 2 starts at NUM_SAMPLE + NUM_TWEET, so rows
        # [NUM_SAMPLE, NUM_SAMPLE + NUM_TWEET) are never exported. Kept as-is
        # to not change which tweets land in which file; confirm the intent.
        start_index = NUM_SAMPLE_TWEET_ORGANIZATION + NUM_TWEET_ORGANIZATION * (step - 1)
        end_index = NUM_SAMPLE_TWEET_ORGANIZATION + NUM_TWEET_ORGANIZATION * step

    organization_labeling_df = {}
    reply_labeling_df = {}
    for data_name in data_csv:
        # Get the headline data (copy only the slice, not the whole frame)
        current_organization = organizaton_df[data_name][start_index:end_index].copy()
        current_replies = replies_df[data_name]

        # Get the reply data by matching tweet_id and conversation_id
        conversation_ids = current_organization["tweet_id"].values
        replies = []
        for conversation_id in conversation_ids:
            # .loc raises KeyError on a missing label, so check membership
            # first to report tweets without replies instead of crashing.
            if conversation_id not in current_replies.index:
                print(f"No Reply detected on conversation_id {conversation_id}")
                continue

            replies.append(
                _select_reply(
                    current_replies.loc[conversation_id], conversation_id, MIN_CHAR
                )
            )

        organization_labeling_df[data_name] = current_organization
        reply_labeling_df[data_name] = pd.DataFrame(replies).reset_index(drop=True)

    # Save data
    # Combine for labeling
    df_organization = []
    df_replies = []
    for i in organization_labeling_df:
        print(f"org {len(organization_labeling_df[i])} rep {len(reply_labeling_df[i])}")
        df_organization.append(organization_labeling_df[i])
        df_replies.append(reply_labeling_df[i])

    # -1.0 marks "not labeled yet". A scalar broadcasts to each frame's own
    # length, so the two frames no longer need to have the same row count.
    concat_org = pd.concat(df_organization)
    concat_org[LABEL_COLUMN] = -1.0
    concat_rep = pd.concat(df_replies)
    concat_rep[LABEL_COLUMN] = -1.0

    concat_org = concat_org.drop(columns=columns_removed)
    concat_rep = concat_rep.drop(columns=columns_removed)

    with pd.ExcelWriter(
        f"{relative_path_data_labeling}/organization_{step}.xlsx", engine="openpyxl"
    ) as writer:
        concat_org.to_excel(writer, sheet_name="Organization Sheet", index=False)

    with pd.ExcelWriter(
        f"{relative_path_data_labeling}/replies_{MIN_CHAR}_{step}.xlsx",
        engine="openpyxl",
    ) as writer:
        concat_rep.to_excel(writer, sheet_name="Replies Sheet", index=False)


Conversation id 1584812873498431488 has no met min char reply
org 13 rep 13
org 13 rep 13
org 13 rep 13
org 13 rep 13
Conversation id 1594570082688307200 has no met min char reply
Conversation id 1520691882736578560 has no met min char reply
org 53 rep 53
org 53 rep 53
org 53 rep 53
org 53 rep 53
Conversation id 1596357377241980928 has no met min char reply
Conversation id 1585932086648741890 has no met min char reply
Conversation id 1573728607855583232 has no met min char reply
Conversation id 1603350165657890818 has no met min char reply
Conversation id 1584018158532583425 has no met min char reply
org 53 rep 53
org 53 rep 53
org 53 rep 53
org 53 rep 53
Conversation id 1577226192608198656 has no met min char reply
Conversation id 1587611213454008322 has no met min char reply
Conversation id 1567703786650738688 has no met min char reply
Conversation id 1584205646098235394 has no met min char reply
Conversation id 1601079346793689093 has no met min char reply
org 53 rep 53
org 53 rep 5