# 📌 Data Cleaning and Preparation Notebook  

## 🎯 Objective  
This notebook is designed to clean and preprocess the raw dataset for further analysis.
It includes steps such as handling missing values, computing perceptual distances, and performing text preprocessing.

## <a id='toc0_'></a>📑 Table of Contents  

- [Libraries](#toc1_)  
  *Importing necessary libraries for data manipulation, visualization, and NLP.*  
- [Functions](#toc2_)  
  *Defining functions used throughout the notebook.*  
- [Data Preparation](#toc3_)  
  - [Loading](#toc3_1_) – *Load the dataset into a DataFrame.*  
  - [Delete NA](#toc3_2_) – *Remove missing values to ensure data consistency.*  
  - [Compute Emotional Strength](#toc3_3_) – *Derive the emotional strength metric based on pleasantness.*  
  - [Add Memory Column](#toc3_4_) – *Create a new variable to analyze associative memory.*  
  - [Encode Gender](#toc3_5_) – *Convert gender information into a numerical format.*  
  - [Subset DataFrame](#toc3_6_) – *Filter the dataset to retain relevant observations.*  
- [Perceptual Distance](#toc4_)  
  - [Participant Perceptual Space](#toc4_1_) – *Visualize participants' perceptual space.*  
  - [Compute Euclidean Distances](#toc4_2_) – *Measure the perceptual similarity between observations.*  
- [NLP Processing](#toc5_)  
  - [Text Cleaning](#toc5_1_) – *Remove special characters, extra spaces, and unwanted symbols.*  
  - [Normalization](#toc5_2_) – *Standardize text formatting (e.g., lowercasing).*  
  - [Tokenization](#toc5_3_) – *Split text into individual words.*  
  - [Lemmatization](#toc5_4_) – *Reduce words to their base form.*  
  - [Word Count](#toc5_5_) – *Analyze word frequencies.*  
  - [Compute Jaccard Distance](#toc5_6_) – *Measure similarity between text samples using Jaccard distance.*  
- [Save Processed Data](#toc6_)  
  *Export the cleaned and transformed dataset for further use.*  

# <a id='toc1_'></a>[Libraries](#toc0_)

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import spacy

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import art3d

from nltk.corpus import stopwords


from scipy.spatial import ConvexHull, QhullError
from scipy.spatial.distance import euclidean


from spacy.language import Language
from spacy.lookups import Lookups

from exceptions_lemma import lemma_lookup

# <a id='toc2_'></a>[Functions](#toc0_)

In [None]:
def plot_3d_convex_hulls(df, participants, features, show_volumes=False):
    """Plot each participant's points in 3D together with their convex hull.

    One scatter series and one hull outline are drawn per entry of
    ``participants``. Axis limits, ticks and labels are hard-coded for
    pleasantness (x), familiarity (y) and intensity (z).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'participant' column and the columns listed in ``features``.
    participants : sequence
        Participant identifiers to draw. Colours come from a three-colour
        palette that is reused cyclically when more than three participants
        are given.
    features : list of str
        Column names used, in order, as the x, y and z coordinates.
    show_volumes : bool, default False
        When True, every hull facet is also filled with a translucent patch.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    def plot_convex_hull(ax, points, color, show_volumes):
        """Draw the hull edges (and optionally its facets); return the hull or None."""
        # Only the hull construction can fail. QhullError covers degenerate
        # geometry (too few or coplanar points); SciPy raises ValueError for
        # an empty point set, e.g. a participant with no rows in ``df``.
        # Neither should abort the whole figure.
        try:
            hull = ConvexHull(points)
        except (QhullError, ValueError):
            print("Could not compute convex hull for points")
            return None

        for simplex in hull.simplices:
            ax.plot(points[simplex, 0], points[simplex, 1], points[simplex, 2], color=color)
        if show_volumes:
            for simplex in hull.simplices:
                triangle = points[simplex]
                poly = art3d.Poly3DCollection([triangle], color=color, alpha=0.3)
                ax.add_collection3d(poly)
        return hull

    fig = plt.figure(figsize=(12, 12))
    ax = fig.add_subplot(111, projection='3d')
    colors = ["#FF6F61", "#6B5B95", "#88B04B"]

    # Dictionary to rename participants; identifiers not listed here fall
    # back to a positional "Participant <n>" label.
    participant_labels = {
        'AB': 'Participant 1',
        'AL': 'Participant 2',
        'AMB': 'Participant 3'
        }

    for i, participant in enumerate(participants):
        # Wrap around the palette instead of raising IndexError when there
        # are more participants than colours.
        color = colors[i % len(colors)]
        participant_points = df[df['participant'] == participant][features].values
        label = participant_labels.get(participant, f'Participant {i + 1}')
        ax.scatter(
            participant_points[:, 0],
            participant_points[:, 1],
            participant_points[:, 2],
            label=label,
            color=color,
            s=100,
            )
        plot_convex_hull(ax, participant_points, color, show_volumes)

    # Adjust axis limits
    ## Pleasantness
    ax.set_xlim([-5.2, 5.2])
    ax.set_xticks(np.arange(-5, 6, 2.5))
    ## Familiarity
    ax.set_ylim([-0.2, 10.2])
    ax.set_yticks(np.arange(0, 11, 2.5))
    ## Intensity
    ax.set_zlim([-0.2, 10.2])
    ax.set_zticks(np.arange(0, 11, 2.5))

    # Set axis labels
    ax.set_xlabel('Pleasantness', fontsize=18, labelpad=20)
    ax.set_ylabel('Familiarity', fontsize=18, labelpad=20)
    ax.set_zlabel('Intensity', fontsize=18, labelpad=15)

    # Set tick label font size
    ax.tick_params(axis='x', labelsize=18, length=10, width=4)
    ax.tick_params(axis='y', labelsize=18, length=10, width=4)
    ax.tick_params(axis='z', labelsize=18, length=10, width=4, pad=10)

    # Set axis spines thickness
    ax.xaxis.line.set_linewidth(2)
    ax.yaxis.line.set_linewidth(2)
    ax.zaxis.line.set_linewidth(2)

    # Legend, placed outside the axes on the right
    ax.legend(
        fontsize=18,
        loc='center left',
        bbox_to_anchor=(1.1, 0.5)
    )

    # Camera position
    ax.view_init(elev=10, azim=310)

    plt.tight_layout()
    plt.show()

In [None]:
def compute_euclidean_distance(df, features):
    """Compute pairwise Euclidean distances between odors, per participant.

    For every participant, each ordered pair of distinct rows (i, j) yields
    one output row, so both (a, b) and (b, a) are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'participant' and 'odor_name' columns plus the columns listed
        in ``features``.
    features : sequence of str
        Numeric columns that define the space in which distances are taken.

    Returns
    -------
    pandas.DataFrame
        Columns 'participant', 'odor_name', 'odor_name_2' and
        'euclidean_distance'. The columns are present even when there is no
        pair to report, so downstream group-bys keep working.
    """
    columns = ['participant', 'odor_name', 'odor_name_2', 'euclidean_distance']
    feature_columns = list(features)
    results = []

    for participant, group in df.groupby('participant'):
        # Pull names and coordinates out of pandas once per participant;
        # row access through .iloc inside the double loop is very slow.
        names = group['odor_name'].tolist()
        coords = group[feature_columns].to_numpy()

        # Iterate over all ordered pairs of distinct rows
        for i, name_i in enumerate(names):
            for j, name_j in enumerate(names):
                if i == j:
                    continue
                results.append({
                    'participant': participant,
                    'odor_name': name_i,
                    'odor_name_2': name_j,
                    'euclidean_distance': euclidean(coords[i], coords[j]),
                })

    return pd.DataFrame(results, columns=columns)

In [None]:
# Jaccard distance between two descriptor sets
def jaccard_distance(set1, set2):
    """Return the Jaccard distance ``1 - |A ∩ B| / |A ∪ B|`` between two sets.

    Two empty sets are treated as identical (distance 0.0); exactly one
    empty set gives the maximum distance 1.0.
    """
    if not (set1 or set2):
        # Nothing on either side: identical by convention
        return 0.0
    if not (set1 and set2):
        # Only one side has elements: nothing can be shared
        return 1.0
    n_shared = len(set1.intersection(set2))
    n_total = len(set1.union(set2))
    return 1 - n_shared / n_total

# Pairwise Jaccard distance between odors, per participant
def compute_jaccard_distance(df, participant_column='participant', odor_name_column='odor_name', descriptors_column='lemma'):
    """Compute pairwise Jaccard distances between odor descriptions.

    Within each participant, every ordered pair of distinct rows (i, j)
    yields one output row, so both (a, b) and (b, a) are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    participant_column : str
        Column used to group rows.
    odor_name_column : str
        Column holding the odor label of each row.
    descriptors_column : str
        Column holding the descriptors of each row. A list is converted to
        a set; any other value (e.g. NaN) is treated as an empty set.

    Returns
    -------
    pandas.DataFrame
        Columns 'participant', 'odor_name', 'odor_name2', 'descriptors_lem',
        'descriptors_lem2' and 'jaccard_distance'. The columns are present
        even when there is no pair to report, so downstream group-bys keep
        working.
    """
    columns = [
        'participant', 'odor_name', 'odor_name2',
        'descriptors_lem', 'descriptors_lem2', 'jaccard_distance',
    ]
    results = []

    for participant, group in df.groupby(participant_column):
        # Extract odor names and descriptor sets once per participant
        odor_names = group[odor_name_column].tolist()
        descriptor_sets = [
            set(value) if isinstance(value, list) else set()
            for value in group[descriptors_column]
        ]

        # Jaccard distance for each ordered pair of distinct rows
        for i, (name_i, set_i) in enumerate(zip(odor_names, descriptor_sets)):
            for j, (name_j, set_j) in enumerate(zip(odor_names, descriptor_sets)):
                if i == j:
                    continue
                results.append({
                    'participant': participant,
                    'odor_name': name_i,
                    'odor_name2': name_j,
                    'descriptors_lem': set_i,
                    'descriptors_lem2': set_j,
                    'jaccard_distance': jaccard_distance(set_i, set_j),
                })

    return pd.DataFrame(results, columns=columns)

# <a id='toc3_'></a>[Data prep](#toc0_)

## <a id='toc3_1_'></a>[Loading](#toc0_)

In [None]:
# The project root is taken to be one level above the current working directory
project_dir = Path.cwd().parent

# Raw data is read from <project>/data/raw_data.csv
data_folder = project_dir.joinpath("data")
file_path = data_folder.joinpath("raw_data.csv")

# Load the raw dataset into a DataFrame
df = pd.read_csv(file_path)

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

## <a id='toc3_2_'></a>[Delete NA](#toc0_)

In [None]:
# Drop every row that is missing at least one of the three perceptual ratings
rating_columns = ['pleasantness', 'intensity', 'familiarity']
df = df.dropna(subset=rating_columns)

## <a id='toc3_3_'></a>[Compute emotional strength](#toc0_)

In [None]:
# Emotional strength = absolute value of pleasantness, inserted right after 'pleasantness'
insert_at = df.columns.get_loc('pleasantness') + 1
df.insert(insert_at, 'emotional_strength', df['pleasantness'].abs())

## <a id='toc3_4_'></a>[Add mem col](#toc0_)

In [None]:
# Add column 'mem' right after 'what': 1 when 'www' or 'wwhich' equals 1
# (the odor was associated with a memory), 0 otherwise.
# The comparison is vectorised instead of a row-wise apply(axis=1), which
# runs a Python-level function call for every row.
has_memory = (df['www'] == 1) | (df['wwhich'] == 1)
df.insert(df.columns.get_loc('what') + 1, 'mem', has_memory.astype('int64'))

## <a id='toc3_5_'></a>[Encode gender](#toc0_)

In [None]:
# Numeric gender code inserted right after 'gender': W -> 0, M -> 1
# (values absent from the mapping become NaN)
gender_codes = {'W': 0, 'M': 1}
df.insert(df.columns.get_loc('gender') + 1, 'gender_encoded', df['gender'].map(gender_codes))

In [None]:
# Display the current state of the DataFrame
df

## <a id='toc3_6_'></a>[Subset df](#toc0_)

In [None]:
# Rows flagged as target odors (is_target == 1)
df_target = df.loc[df['is_target'] == 1]
# Rows flagged as hits (hit == 1)
df_hit = df.loc[df['hit'] == 1]

# <a id='toc4_'></a>[Perceptual distance](#toc0_)

## <a id='toc4_1_'></a>[Participant perceptual space](#toc0_)

In [None]:
# Columns used, in order, as the x, y and z axes of the plot
features = ['pleasantness', 'familiarity', 'intensity']

# Participants to display
participants = ['BT', 'PAC', 'DNTT']

# Keep only the target rows of those participants, then plot their hulls
is_selected = df_target['participant'].isin(participants)
df_target_filtered = df_target[is_selected]
plot_3d_convex_hulls(df_target_filtered, participants, features, show_volumes=True)

In [None]:
# Columns used, in order, as the x, y and z axes of the plot
features = ['pleasantness', 'familiarity', 'intensity']

# Participants to display
participants = ['BT', 'PAC', 'DNTT']

# Keep only the hit rows of those participants, then plot their hulls
is_selected = df_hit['participant'].isin(participants)
df_hit_filtered = df_hit[is_selected]
plot_3d_convex_hulls(df_hit_filtered, participants, features, show_volumes=True)

## <a id='toc4_2_'></a>[Compute euclidean distances](#toc0_)

In [None]:
# Dimensions of the perceptual space, and the keys identifying one odor for one participant
features = ['pleasantness', 'intensity', 'familiarity']
merge_keys = ['participant', 'odor_name']

# Pairwise distances within the target rows and within the hit rows
euclidean_target = compute_euclidean_distance(df_target, features)
euclidean_hit = compute_euclidean_distance(df_hit, features)

# Mean distance from each odor to the participant's other odors,
# stored under a subset-specific column name
mean_euclidean_target = (
    euclidean_target
    .groupby(merge_keys)[['euclidean_distance']]
    .mean()
    .reset_index()
    .rename(columns={'euclidean_distance': 'avg_distance_target'})
)
mean_euclidean_hit = (
    euclidean_hit
    .groupby(merge_keys)[['euclidean_distance']]
    .mean()
    .reset_index()
    .rename(columns={'euclidean_distance': 'avg_distance_hit'})
)

# Attach both averages to the main DataFrame
df = df.merge(mean_euclidean_target, on=merge_keys, how='outer')
df = df.merge(mean_euclidean_hit, on=merge_keys, how='outer')

# <a id='toc5_'></a>[NLP](#toc0_)

## <a id='toc5_1_'></a>[Text cleaning](#toc0_)

In [None]:
# Replace punctuation, underscores and other special characters with a space.
# The accented-letter range À-ÿ is spelled with escapes (\u00C0-\u00FF): the
# mis-encoded form of that range is a reversed character range, which makes
# the regex fail to compile.
df['descriptors_clean'] = df['descriptors'].str.replace(r'[^\w\s\u00C0-\u00FF]|_', ' ', regex=True)

# Initialize stop words
stop_words_fr = set(stopwords.words('french'))

# Extra stop words specific to this dataset.
# NOTE(review): 'ça' and 'àl' were restored from mis-encoded text — confirm
# they match the tokens actually present in the raw descriptors.
to_add = {
    'ça', 'a', 'sd', 'àl', 'x', 'pr',
    'odeur', 'odeurs',
    'sent', 'sens', 'sentie',
    }
stop_words_fr.update(to_add)

# Drop stop words from a whitespace-separated string
def remove_stopwords(text):
    """Return ``text`` without the words found in ``stop_words_fr``.

    Words are compared in lowercase but kept in their original case.
    Non-string values (e.g. NaN) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    kept_words = (w for w in text.split() if w.lower() not in stop_words_fr)
    return ' '.join(kept_words)

# Apply to the cleaned descriptors
df['descriptors_sw'] = df['descriptors_clean'].apply(remove_stopwords)

## <a id='toc5_2_'></a>[Normalization](#toc0_)

In [None]:
# Convert the stop-word-filtered descriptors to lowercase (missing values stay missing)
df['descriptors_lc'] = df['descriptors_sw'].str.lower()

## <a id='toc5_3_'></a>[Tokenization](#toc0_)

In [None]:
# Load the spaCy pipeline "fr_core_news_sm"
nlp = spacy.load("fr_core_news_sm")

# Tokenize one cell of the column
def spacy_tokenizer(text):
    """Return the token strings spaCy finds in ``text``; null values pass through unchanged."""
    if pd.isnull(text):
        return text
    return [tok.text for tok in nlp(text)]

# Tokenize the lowercased descriptors
df['tokens'] = df['descriptors_lc'].apply(spacy_tokenizer)

## <a id='toc5_4_'></a>[Lemmatization](#toc0_)

In [None]:
# Initialize lookups with the project's lemma exception table
lookups = Lookups()
lookups.add_table("lemma_lookup", lemma_lookup)

# Define a spaCy component that overrides lemmas using the lookup table
@Language.component("lemma_correction_component")
def lemma_correction_component(doc):
    """Replace ``token.lemma_`` for every token whose text is a key of the lookup table."""
    lemma_table = lookups.get_table("lemma_lookup")
    for token in doc:
        if token.text in lemma_table:
            token.lemma_ = lemma_table[token.text]
    return doc

# Add the component right after the lemmatizer so it has the final say.
# Guarded: add_pipe raises if the name is already in the pipeline, which
# happens whenever this cell is re-run in the same session.
if "lemma_correction_component" not in nlp.pipe_names:
    nlp.add_pipe("lemma_correction_component", after="lemmatizer")

# Lemmatize one list of tokens
def lemmatize_tokens(tokens):
    """Return the lemmas of ``tokens``.

    Anything that is not a list of strings yields an empty list; an empty
    list is returned as is. Otherwise the tokens are joined with spaces,
    run through the spaCy pipeline, and the lemma of every resulting token
    is returned.
    """
    is_string_list = isinstance(tokens, list) and all(isinstance(tok, str) for tok in tokens)
    if not is_string_list:
        return []
    if not tokens:
        return tokens
    doc = nlp(" ".join(tokens))
    return [tok.lemma_ for tok in doc]

# Lemmatize the token lists
df['lemma'] = df['tokens'].apply(lemmatize_tokens)

## <a id='toc5_5_'></a>[Word count](#toc0_)

In [None]:
# Count the items of a token list
def count_words(tokens):
    """Return ``len(tokens)`` for a list or NumPy array, and 0 for anything else."""
    return len(tokens) if isinstance(tokens, (list, np.ndarray)) else 0

# Number of lemmas per row (0 when the 'lemma' value is not a list or array)
df['nb_words'] = df['lemma'].apply(count_words)

## <a id='toc5_6_'></a>[Compute Jaccard Distance](#toc0_)

In [None]:
# Rebuild the subsets so they carry the columns added to df since the first split (e.g. 'lemma')
df_target = df.loc[df['is_target'] == 1]
df_hit = df.loc[df['hit'] == 1]

In [None]:
# Keys identifying one odor for one participant
merge_keys = ['participant', 'odor_name']

# Pairwise Jaccard distances within the target rows and within the hit rows
jaccard_target = compute_jaccard_distance(df_target, descriptors_column='lemma')
jaccard_hit = compute_jaccard_distance(df_hit, descriptors_column='lemma')

# Mean distance per participant and odor, stored under a subset-specific column name
mean_jaccard_target = (
    jaccard_target
    .groupby(merge_keys)[['jaccard_distance']]
    .mean()
    .rename(columns={'jaccard_distance': 'mean_lemma_jaccard_target'})
)
mean_jaccard_hit = (
    jaccard_hit
    .groupby(merge_keys)[['jaccard_distance']]
    .mean()
    .rename(columns={'jaccard_distance': 'mean_lemma_jaccard_hit'})
)

# Attach both averages to the main DataFrame
df = df.merge(mean_jaccard_target, on=merge_keys, how='outer')
df = df.merge(mean_jaccard_hit, on=merge_keys, how='outer')

In [None]:
# Display the DataFrame with the distance columns merged in
df

# <a id='toc6_'></a>[Save](#toc0_)

In [None]:
# Columns to export (the file is too heavy with every column)
col_selection = [
    # Identification and design
    "study", "participant", "gender", "gender_encoded",
    "pres_order", "odor_num", "odor_name", "is_target", "day",
    # Responses
    "hit", "cr", "www", "wwhich", "what", "mem",
    # Ratings and perceptual distances
    "pleasantness", "emotional_strength", "intensity", "familiarity",
    "avg_distance_target", "avg_distance_hit",
    # Text and text-derived measures
    "descriptors",
    "tokens", "lemma",
    "nb_words",
    "mean_lemma_jaccard_target", "mean_lemma_jaccard_hit",
]

df_sel = df.loc[:, col_selection]

# Write the selection to <data folder>/dataset.csv without the index
csv_path = data_folder.joinpath('dataset.csv')
df_sel.to_csv(csv_path, index=False)