In [None]:
import pandas as pd
import re
import unidecode
import spacy
from helpers import get_sentiment
from tqdm import tqdm
import glob

# Hook tqdm into pandas so the `progress_apply` calls in this notebook are available.
tqdm.pandas()

# Display every column of a DataFrame instead of truncating wide frames.
pd.options.display.max_columns = None

In [None]:
# Read all dataframes.
# glob returns matches in file-system order, so sort them to keep the row order
# (and everything derived from it) reproducible from one run to the next.
file_paths = sorted(glob.glob("./football_kits_data/*.xlsx"))
if not file_paths:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError("No .xlsx files found in ./football_kits_data/")
dataframes = [pd.read_excel(file) for file in file_paths]
# Stack every spreadsheet into one frame and drop rows that are exact duplicates.
df = pd.concat(dataframes, ignore_index=True).drop_duplicates()
df.head(3)

In [None]:
# Clean
# Compiled once and reused for every row; `\w` already matches "_".
_MENTION_RE = re.compile(r"@\w+")


def remove_profile_mentions(text):
    """
    Strip ``@handle`` profile mentions from a message.

    Args:
        text: The message. Non-string values are converted with ``str()``,
            except missing values (``None``, ``NaN``, ``pd.NA``), which become
            an empty string so they do not turn into the literal text "nan".

    Returns:
        str: The message with every ``@`` followed by word characters removed.
        Surrounding whitespace is left untouched.
    """
    if not isinstance(text, str):
        # A missing cell must stay empty rather than become the word "nan"/"None".
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)
    return _MENTION_RE.sub("", text)


# Remove the retweet marker only when "RT" is a standalone token. A plain substring
# replace of "RT " also eats the end of ordinary words (e.g. "START now" -> "STAnow").
# `regex=True` is spelled out because the default of this flag differs between
# pandas versions.
df["Message"] = df["Message"].str.replace(r"\bRT ", "", regex=True)
# Strip @mentions, then score each cleaned message with the project helper.
df["Message"] = df["Message"].progress_apply(remove_profile_mentions)
df["SENTIMENT"] = df["Message"].progress_apply(get_sentiment)

In [None]:
# Plain-list copy of the cleaned messages. The visible cells do not read this
# variable: find_conversations is called with the DataFrame column directly.
comments = df["Message"].tolist()


def find_conversations(comments, sentiments, nlp=None) -> dict:
    """
    Count, per person named in the comments, how many comments carry each sentiment.

    Every comment is parsed with a spaCy pipeline and attributed to the FIRST
    ``PERSON`` entity found in it; further people named in the same comment are
    not counted, so a comment contributes at most one count. Names are
    lower-cased and transliterated to ASCII so accented and unaccented
    spellings share a single entry.

    Args:
        comments: Sized iterable of comment texts (``len()`` is used for the
            progress bar). Non-string items are skipped.
        sentiments: Iterable of sentiment labels aligned position-by-position
            with ``comments``. Labels are matched case-insensitively, ignoring
            surrounding whitespace, against POSITIVE / NEGATIVE / NEUTRAL.
            Rows with any other label (including missing values) are skipped
            instead of raising ``KeyError``.
        nlp: Optional callable that turns a text into an object exposing
            ``.ents`` (e.g. an already loaded spaCy pipeline). When omitted,
            ``en_core_web_sm`` is loaded.

    Returns:
        dict: ``{name: {"POSITIVE": int, "NEGATIVE": int, "NEUTRAL": int, "total": int}}``
        with the inner keys always in that order.
    """
    labels = ("POSITIVE", "NEGATIVE", "NEUTRAL")
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    people = {}
    for comment, sentiment in tqdm(zip(comments, sentiments), total=len(comments)):
        label = sentiment.strip().upper() if isinstance(sentiment, str) else None
        # Check the cheap conditions first so unusable rows never reach the NLP model.
        if label not in labels or not isinstance(comment, str):
            continue

        doc = nlp(comment)
        first_person = next((ent for ent in doc.ents if ent.label_ == "PERSON"), None)
        if first_person is None:
            continue

        # Normalise the name: lower-case, then strip accents.
        name = unidecode.unidecode(first_person.text.lower())
        counts = people.setdefault(name, {**dict.fromkeys(labels, 0), "total": 0})
        counts[label] += 1
        counts["total"] += 1
    return people


# Find conversations and sentiments.
# The labels computed by the cleaning cell are stored in the upper-case "SENTIMENT"
# column; a "Sentiment" column is never created by this notebook and would only
# exist if the source spreadsheets happen to ship one.
people = find_conversations(df["Message"], df["SENTIMENT"])

# Convert the results into a dataframe. Columns are selected and renamed by key
# (not by position), and `reindex` guarantees they exist even when no person was
# found, so the rename and sort below cannot fail on an empty result.
people_df = (
    pd.DataFrame.from_dict(people, orient="index")
    .reindex(columns=["POSITIVE", "NEGATIVE", "NEUTRAL", "total"])
    .rename_axis("person")
    .reset_index()
    .rename(
        columns={
            "POSITIVE": "positive_comments",
            "NEGATIVE": "negative_comments",
            "NEUTRAL": "neutral_comments",
            "total": "total_comments",
        }
    )
    # Most-mentioned people first.
    .sort_values(by="total_comments", ascending=False)
)
# Display the dataframe
people_df

In [None]:
# Write the people with more than 10 total comments to a CSV file (no index column).
frequently_mentioned = people_df["total_comments"].gt(10)
people_df.loc[frequently_mentioned].to_csv(
    "./football_kits_data/people.csv",
    index=False,
)