#  ⚙️ Data Processing

## 👀 View Data
We scraped all the data from ***Facebook*** and ***Instagram*** thanks to **CrowdTangle**, a social media analytics tool by Meta. The first thing to do in these cases is always to "look at the data!".

**Scraping**

We wanted to analyze online conversations about two emerging technologies: **ChatGPT** and **Apple Vision Pro**. The first technology breakthrough was analyzed in two different temporal windows: when ChatGPT 3.5 was launched (*30 Nov 2022*) and when ChatGPT 4 was launched (*14 Mar 2023*). Apple Vision Pro, instead, became available for purchase on *2 Feb 2024*.

* For all three events, we scraped data within a range of **3 months**. To obtain better and more precise results, each window starts 5 days before the target date (e.g. if the target date is *30 Nov 2022*, we scrape data from *25 Nov 2022* to *25 Feb 2023*).

* The *keywords* used to collect the data were {"openai", "chatgpt", "llm", "gpt", "gpt-3.5", "gpt 3.5"} for ChatGPT 3.5, {"openai", "chatgpt", "llm", "gpt", "gpt-4", "gpt 4"} for ChatGPT 4, and {"apple vision pro", "vision pro"} for Apple Vision Pro.

> We selected only English posts and, for Facebook, we filtered out posts from Facebook groups, keeping only those from Facebook pages (scaling down from 1M to 100K). <br> All the scraped data are in the *data/raw* folder!


In [None]:
import pandas as pd
# Report the number of rows in every raw CSV export (one file per platform/topic pair).
platforms = ["fb", "ig"]
topics = ["gpt3", "gpt4", "apple"]

for platform in platforms:
    for topic in topics:
        # The dataset label is printed before the read, so a failing file is easy to spot.
        print(f"{platform}_{topic}")
        n_rows = len(pd.read_csv(f"../data/raw/{platform}_{topic}.csv"))
        print(n_rows)
        

**RAW DATA**
<table>
  <tr>
    <td><center> </center></td>
    <th><center>GPT-3.5</center></th>
    <th><center>GPT-4</center></th>
    <th><center>Apple Vision Pro</center></th>
  </tr>
  <tr>
    <th><center>Facebook</center></th>
    <td><center>75811</center></td>
    <td><center>89348</center></td>
    <td><center>8668</center></td>
  </tr>
  <tr>
    <th><center>Instagram</center></th>
    <td><center>11071</center></td>
    <td><center>24718</center></td>
    <td><center>2858</center></td>
  </tr>
</table>

## 🧹 Clean Data

Data from Facebook have more fields than data from Instagram, but the two are quite similar in structure. The most substantial difference between the two platforms lies in the type of *interactions* that users can have. On Instagram there are only *likes* and *comments*. On Facebook we have, in addition to those, *shares* and *reactions* (LOVE, HAHA, WOW, SAD, ANGRY and CARE).

Here you describe the design choices for the cleaning.
4. Make sure you remove posts by the account *Crypto PH* from the ChatGPT Facebook posts: it is a spam account that used ChatGPT-related tags to get attention.

In [None]:
# Platforms and topics covered by the analysis.
platforms = ["fb", "ig"]
topics = ["gpt3", "gpt4", "apple"]

# Per-platform map from raw CSV column names to the unified names used downstream.
COLUMN_MAPPING = {
    "ig": {
        "User Name": "author_id",
        "Post Created Date": "date",
        "Total Interactions": "interaction",
        "URL": "id",
        "Description": "text_1",
        "Image Text": "text_2",
    },
    "fb": {
        "Facebook Id": "author_id",
        "Total Interactions": "interaction",
        "URL": "id",
        "Post Created Date": "date",
        "Message": "text_1",
        "Description": "text_2",
        "Link Text": "text_3",
        # Reaction columns are selected too and keep their original names.
        **{
            reaction: reaction
            for reaction in ("Love", "Wow", "Haha", "Sad", "Angry", "Care")
        },
    },
}

# Temporal window (start and end date) from which conversations are extracted, per topic.
DATE_RANGE = {
    topic_key: {"start": window_start, "end": window_end}
    for topic_key, window_start, window_end in (
        ("gpt3", "2022-11-25", "2023-02-25"),
        ("gpt4", "2023-03-09", "2023-06-09"),
        ("apple", "2024-01-28", "2024-04-28"),
    )
}

In [None]:
from data_utils import merge_text, remove_facebook_spam, convert_date, filter_language, clean_text
import pandas as pd
from tqdm import tqdm

def load_data(raw_file_path, platform):
    """Read one raw CSV export and return it with unified column names.

    Only the columns listed as keys of ``COLUMN_MAPPING[platform]`` are read;
    they are then renamed to the corresponding mapped names.

    Args:
        raw_file_path: Path of the CSV file to read.
        platform: Key of ``COLUMN_MAPPING`` selecting which mapping to apply.

    Returns:
        The renamed data frame.
    """
    rename_map = COLUMN_MAPPING[platform]
    frame = pd.read_csv(
        raw_file_path,
        usecols=rename_map.keys(),
        low_memory=False,
    ).rename(columns=rename_map)
    print(f"Successfully loaded {platform} data from {raw_file_path}")
    return frame

def clean_data(data, topic):
    """Clean a column-renamed data frame for one topic.

    Steps, in order:
      1. drop rows with a missing ``id``, ``author_id`` or ``date``;
      2. drop rows whose ``id`` is duplicated;
      3. convert ``date`` with ``convert_date``, drop rows whose converted date
         is missing, sort by date and keep only rows inside
         ``DATE_RANGE[topic]`` (both bounds inclusive);
      4. merge the text fields with ``merge_text`` and drop rows without ``text``;
      5. add a ``clean_text`` column computed with ``clean_text``;
      6. apply ``filter_language``.

    The number of rows removed by steps 1-3 is printed after each step.

    Args:
        data: Data frame containing at least ``id``, ``author_id`` and ``date``
            columns, plus whatever text columns ``merge_text`` expects.
        topic: Key of ``DATE_RANGE`` selecting the temporal window.

    Returns:
        The result of ``filter_language`` applied to the cleaned frame.
    """
    # drop nan values for id, author_id, and date
    rows_before = len(data)
    data = data.dropna(subset=["id", "author_id", "date"]).reset_index(drop=True)
    print(f"After dropping nan values we eliminated {rows_before-len(data)} entries.")

    # drop duplicates
    rows_before = len(data)
    data = data.drop_duplicates(subset=["id"]).reset_index(drop=True)
    print(f"After dropping duplicates we eliminated {rows_before-len(data)} entries.")

    # select time range (if the scraping worked well, no data should be dropped)
    rows_before = len(data)
    data["date"] = data["date"].apply(convert_date)
    data = data.dropna(subset=["date"]).sort_values(by=["date"])
    start_date = convert_date(DATE_RANGE[topic]["start"])
    end_date = convert_date(DATE_RANGE[topic]["end"])
    in_window = (data["date"] >= start_date) & (data["date"] <= end_date)
    # Take an explicit copy: the frame is modified below (column assignment,
    # in-place drops), and doing that on a boolean-filtered frame is what
    # triggers pandas' SettingWithCopyWarning.
    data = data[in_window].copy()
    print(f"Min date: {data['date'].min()}")
    print(f"Max date: {data['date'].max()}")
    print(f"After selecting time range we eliminated {rows_before-len(data)} entries.")

    # merge text if there are multiple text fields (our case)
    # NOTE(review): merge_text is assumed to add a "text" column - confirm in data_utils.
    data = merge_text(data)
    # cleaning text (not too aggressive)
    data.dropna(subset=["text"], inplace=True)
    data["text"] = data["text"].astype(str)

    # register progress_apply on pandas objects so the cleaning shows a progress bar
    tqdm.pandas()
    data["clean_text"] = data["text"].progress_apply(clean_text)
    data.dropna(subset=["clean_text"], inplace=True)
    data.reset_index(drop=True, inplace=True)

    # language filtering
    data = filter_language(data)

    return data

In [None]:
# run the cleaning process: load each raw export, clean it and save the result
from pathlib import Path

# make sure the output folder exists, otherwise to_csv fails on the first file
Path("../data/clean").mkdir(parents=True, exist_ok=True)

for topic in topics:
    for platform in platforms:
        data = load_data(raw_file_path=f"../data/raw/{platform}_{topic}.csv", platform=platform)
        data = clean_data(data=data, topic=topic)
        # spam removal is applied to Facebook data only
        if platform == "fb":
            len_data = len(data)
            data = remove_facebook_spam(data)
            print(f"After spam detection we eliminated {len_data-len(data)} entries.")
        data.to_csv(f"../data/clean/{platform}_{topic}.csv", index=False)
        print("--------------------------------")

In [1]:
import pandas as pd
# Report the number of rows in every cleaned CSV file (one file per platform/topic pair).
platforms = ["fb", "ig"]
topics = ["gpt3", "gpt4", "apple"]

for platform in platforms:
    for topic in topics:
        # The dataset label is printed before the read, so a failing file is easy to spot.
        print(f"{platform}_{topic}")
        n_rows = len(pd.read_csv(f"../data/clean/{platform}_{topic}.csv"))
        print(n_rows)

fb_gpt3
72862
fb_gpt4
85498
fb_apple
8314
ig_gpt3
9738
ig_gpt4
21890
ig_apple
2459


**CLEAN DATA**
<table>
  <tr>
    <td><center> </center></td>
    <th><center>GPT-3.5</center></th>
    <th><center>GPT-4</center></th>
    <th><center>Apple Vision Pro</center></th>
  </tr>
  <tr>
    <th><center>Facebook</center></th>
    <td><center>75811 - <b>72862</b> (2949 less)</center></td>
    <td><center>89348 - <b>85498</b> (3850 less)</center></td>
    <td><center>8668 - <b>8314</b> (354 less)</center></td>
  </tr>
  <tr>
    <th><center>Instagram</center></th>
    <td><center>11071 - <b>9738</b> (1333 less)</center></td>
    <td><center>24718 - <b>21890</b> (2828 less)</center></td>
    <td><center>2858 - <b>2459</b> (399 less)</center></td>
  </tr>
</table>

## ⛔ Creation of training dataset for Sentiment/Emotion analysis with LLMs 

In [None]:
from datasets import load_dataset

# Both tasks are loaded from the same dataset path, with different configuration names.
super_tweeteval_path = "cardiffnlp/super_tweeteval"
emotion_dataset = load_dataset(super_tweeteval_path, "tweet_emotion")
sentiment_dataset = load_dataset(super_tweeteval_path, "tweet_sentiment")

In [None]:
# EMOTION

# Position in the gold label vector -> emotion name.
label2emotion = dict(enumerate([
    "anger", "anticipation", "disgust", "fear", "joy", "love",
    "optimism", "pessimism", "sadness", "surprise", "trust",
]))


def manipulate_emotion_labels(label):
    """Return the names of the emotions whose flag in ``label`` equals 1.

    Args:
        label: Sequence of flags; position ``i`` refers to ``label2emotion[i]``.

    Returns:
        List of emotion names, in position order.
    """
    return [label2emotion[position] for position, flag in enumerate(label) if flag == 1]

# Handles on the three splits of the emotion dataset.
emotion_train_dataset, emotion_test_dataset, emotion_validation_dataset = (
    emotion_dataset['train'],
    emotion_dataset['test'],
    emotion_dataset['validation'],
)

# Texts and gold label vectors of the training split.
emotion_train_texts = emotion_train_dataset['text']
emotion_train_labels = emotion_train_dataset['gold_label_list']

# Preview a few training examples next to their decoded emotion labels.
for sample_idx in range(15, 20):
    print(emotion_train_texts[sample_idx])
    decoded_emotions = manipulate_emotion_labels(emotion_train_labels[sample_idx])
    print(decoded_emotions)


In [None]:
import json
from tqdm import tqdm

# Prompt templates; {text} receives the post and {answer} the gold answer.
sentiment_prompt = """What is the sentiment of this text? \nText: {text} \nOptions: [ "strongly negative", "negative", "negative or neutral", "positive", "strongly positive"] \nAnswer: {answer}"""
emotion_prompt = """Which emotions from the options below are expressed in the following text? \nText: {text} \nOptions: [ "anger", "anticipation", "disgust", "fear", "joy", "love", "optimism", "pessimism", "sadness", "surprise", "trust" ] \nAnswer: {answer}"""

# Position in the gold label vector -> emotion name.
label2emotion = dict(enumerate([
    "anger", "anticipation", "disgust", "fear", "joy", "love",
    "optimism", "pessimism", "sadness", "surprise", "trust",
]))
# Sentiment class id -> sentiment name.
label2sentiment = dict(enumerate([
    "strongly negative", "negative", "negative or neutral", "positive", "strongly positive",
]))


def manipulate_emotion_labels(label):
    """Return the emotions flagged with 1 in ``label`` as one comma-separated string.

    Args:
        label: Sequence of flags; position ``i`` refers to ``label2emotion[i]``.

    Returns:
        Emotion names in position order joined by ``", "`` (empty string if none).
    """
    return ", ".join(
        label2emotion[position] for position, flag in enumerate(label) if flag == 1
    )


def generate_finetuning_dataset(dataset_type, texts, labels):
    """Write ``training_{dataset_type}.json`` with one filled-in prompt per example.

    Each example becomes a ``{"prompt": ...}`` object; the file holds the JSON
    list of all of them.

    Args:
        dataset_type: ``"emotion"`` selects ``emotion_prompt`` and decodes the
            gold label with ``manipulate_emotion_labels``; any other value
            selects ``sentiment_prompt`` and looks the label up in
            ``label2sentiment``.
        texts: Iterable of input texts.
        labels: Gold labels aligned with ``texts``; must support ``len()``.

    Raises:
        KeyError: If a gold label cannot be mapped to a name.
    """
    is_emotion = dataset_type == "emotion"
    # The template depends only on the dataset type, so pick it once.
    prompt_template = emotion_prompt if is_emotion else sentiment_prompt

    # Build every record before opening the output file: opening in "w" mode
    # truncates it, so a failure while formatting would otherwise leave an
    # empty or partial file behind.
    json_data = []
    for instance_data, instance_gold in tqdm(zip(texts, labels), total=len(labels)):
        if is_emotion:
            # NOTE(review): expects the string-returning manipulate_emotion_labels
            # defined alongside the prompts - confirm the cell order when running.
            answer = manipulate_emotion_labels(instance_gold)
        else:
            answer = label2sentiment[instance_gold]
        json_data.append({"prompt": prompt_template.format(text=instance_data, answer=answer)})

    with open(f"training_{dataset_type}.json", "w", encoding="utf-8") as fw_json:
        json.dump(json_data, fw_json, indent=4)
        
# Build the fine-tuning file for the emotion task from the training split.
generate_finetuning_dataset(
    dataset_type="emotion",
    texts=emotion_train_texts,
    labels=emotion_train_labels,
)