#### Toulouse PB Election Data Cleaning

In [None]:
import sys
import os
from dotenv import load_dotenv
sys.path.append(os.path.abspath("../src"))

from data_loader import load_and_prepare_projects

# Load variables from a local .env file (if present) into the environment so
# the os.getenv('OPENAI_API_KEY') lookups further down can find the key.
load_dotenv()

path_22 = '../data/projects2022.csv'
path_24 = '../data/projects2024.csv'

# The loader's second return value is not used in this notebook.
df, _ = load_and_prepare_projects(path_22, path_24)

# .copy() gives each edition its own frame, so adding columns to it later
# (e.g. df24['text']) does not raise pandas' SettingWithCopyWarning.
df24 = df[df['year'] == 2024].copy()
df22 = df[df['year'] == 2022].copy()

#### 3. Topic Extraction
I sample 50 random projects from the 2022 and 2024 editions (title + description); the LLM reads them together and extracts a list of relevant topics.

In [None]:
import pandas as pd

# Keep only the columns the topic-extraction prompt needs, for both editions.
prompt_columns = ['project_id', 'project_name', 'description']
frames_by_year = [frame.filter(prompt_columns) for frame in (df22, df24)]
topic_input_df = pd.concat(frames_by_year)

# Fixed seed: the same 50 projects are drawn on every run.
topic_input_df_sample = topic_input_df.sample(n=50, random_state=42)

print(topic_input_df_sample[['project_name']].head())

In [None]:
# Instruction block (French) sent ahead of the project list.
prompt = """Je vais te présenter une liste de 50 projets citoyens, chacun avec un titre et une description. Ta tâche consiste à analyser l’ensemble des projets et à extraire une liste de thèmes ou de sujets communs qui représentent les principales orientations ou problématiques abordées par ces initiatives.

Chaque thème doit être :

Concis et représentatif.

Basé sur le contenu réel des projets, sans être inventé.

Accompagné d’une brève description (1 à 2 phrases).

Merci de fournir une liste structurée de 20 à 30 thèmes, classés par ordre de pertinence.

Voici la liste des projets :
"""

# Number the sampled projects 1..N by position. The frame's index still holds
# the labels of the source rows (sampling does not renumber them), so it
# cannot serve as a running counter.
project_entries = [
    f"{position}. Titre: {row['project_name']}\n   Description: {row['description']}\n\n"
    for position, (_, row) in enumerate(topic_input_df_sample.iterrows(), start=1)
]
project_list_text = "".join(project_entries)

# Same separator as the cell that sends the request, so the text printed here
# is exactly the text the model receives.
full_prompt = prompt + "\n" + project_list_text

print(full_prompt)

In [None]:
import openai
import os 

# OpenAI client; the API key is read from the environment.
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Rebuilt here so this cell always sends the current prompt and project list.
full_prompt = "\n".join([prompt, project_list_text])

request_messages = [{"role": "user", "content": full_prompt}]
response = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=request_messages,
    temperature=0.4,
    max_tokens=4000,
)

# Text of the first returned choice.
first_choice = response.choices[0]
output_text = first_choice.message.content
print(output_text)

In [None]:
import pandas as pd
import re

# Item shape this pattern expects in the model's answer:
#   "<n>. **<topic>** - Description: <text>"
# A description ends at the first blank line, or at the end of the text with
# or without a final newline. Accepting the no-newline case keeps the last
# topic when the answer does not end in "\n".
pattern = r"\d+\.\s+\*\*(.*?)\*\*\s+- Description:\s*(.*?)(?:\n\n|\n?\Z)"
matches = re.findall(pattern, output_text, re.DOTALL)

# One row per extracted topic; written with ';' as the field separator.
df_topics = pd.DataFrame(matches, columns=["topic", "description"])
df_topics.to_csv("data/topics_output.csv", sep=";", index=False, encoding="utf-8")

df_topics

### 4. Topics Refinement

#### 4. Projects Classification into Topics

In [None]:
def classification_prompt(topics, project_name, description):
    """Build the French prompt asking the model to pick one topic for a project.

    Args:
        topics: Candidate topics, inserted as-is after
            "Voici la liste des sujets :".
        project_name: Project title, inserted after "Titre du projet :".
        description: Project description, inserted after "Description :".

    Returns:
        The prompt text. Every line after the first keeps the four-space
        indentation of the literal below, so that whitespace is part of the
        text sent to the model.
    """
    return f"""Je vais te présenter un projet citoyen qui a été proposé dans le cadre d'une élection de budgets participatifs a Toulouse, France. Tu vas lire le titre et la description du projet, ainsi qu'une liste de sujets préalablement définis.

    Ta tâche consiste à analyser le projet et à choisir le sujet le plus représentatif parmi ceux disponibles.

    Si tu considères qu'aucun des sujets existants ne représente correctement le projet, tu peux proposer un nouveau sujet. Ce nouveau sujet doit être :

    - Concis et représentatif.
    - Non inventé ni trop générique.
    - Basé sur le contenu réel du projet.
    - Il doit inclure une courte description du sujet.

    Voici la liste des sujets : {topics}

    Titre du projet : {project_name}
    Description : {description}

    Réponds uniquement par le sujet choisi ou, si nécessaire, le nouveau sujet créé. Pas d'explication supplémentaire.
    """


In [None]:
import openai
import os 

# OpenAI client; the API key is read from the environment.
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Only the topic names are passed to the model, as one comma-separated string.
topics = ", ".join(df_topics.topic.tolist())


def _classify_project(project_name, description):
    """Ask the model for the topic of one project and return its raw answer."""
    completion = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {
                "role": "user",
                "content": classification_prompt(topics, project_name, description),
            }
        ],
        temperature=0.4,
        max_tokens=4000,
    )
    return completion.choices[0].message.content


# The per-project prompt and answer stay local to the helper, so the
# notebook-level `prompt` and `output_text` (the topic-extraction prompt and
# answer read by the parsing cell) are not overwritten by this loop.
res = []
for i in df24.index:
    d = {
        'project_id': df24.project_id[i],
        'out': _classify_project(df24.project_name[i], df24.description[i]),
    }

    # Printed per project so progress is visible while the requests run.
    print(d)
    res.append(d)



In [None]:
# One row per classified project, with columns 'project_id' and 'out'.
class2024 = pd.DataFrame(data=res)
# Optional export, kept disabled.
# NOTE(review): this file name differs from the 'proj2024_topics_gpt4-turbo.csv'
# file read later in the notebook — confirm which one is intended.
#class2024.to_csv("data/project_topics_gpt4turbo.csv", index=False ,sep=";")

#### 5. Get Project Embeddings (OpenAI Embeddings)

In [None]:
import numpy as np

# OpenAI client; the API key is read from the environment.
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))


def get_embedding(text):
    """Return the "text-embedding-3-large" embedding of ``text``.

    Sends one embeddings request per call and returns the vector of the first
    item in the response.
    """
    response = client.embeddings.create(
        model="text-embedding-3-large",
        input=text
    )
    return response.data[0].embedding


# df24 comes from boolean filtering of df; work on an explicit copy so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
df24 = df24.copy()

# Text that gets embedded: "<title>: <description>".
df24['text'] = df24['project_name'] + ": " + df24['description']

# One API call per project; the round trip through numpy stores each vector
# as a plain Python list.
df24['embedding'] = df24['text'].apply(get_embedding)
df24['embedding'] = df24['embedding'].apply(lambda vector: np.array(vector).tolist())

In [None]:
df24

In [None]:
#df24.filter(['project_id', 'project_name', 'embedding']).to_csv("proj2024_embeddings_openai-3-large.csv", index=False, encoding="utf-8")

#### All-Dataset Visualization

In [None]:
import ast

def _read_embeddings(csv_path):
    """Read an embeddings CSV and parse its 'embedding' column with literal_eval."""
    frame = pd.read_csv(csv_path)
    # The vectors are stored as text in the CSV; literal_eval parses them back
    # into Python objects.
    frame['embedding'] = frame['embedding'].apply(ast.literal_eval)
    return frame


embedding_df2022 = _read_embeddings('data/proj2022_embeddings_openai-3-large.csv')
embedding_df2024 = _read_embeddings('data/proj2024_embeddings_openai-3-large.csv')

# Stack both editions and renumber the rows 0..n-1.
both_editions = [embedding_df2022, embedding_df2024]
embedding_df = pd.concat(both_editions).reset_index(drop=True)
# Each vector becomes a numpy array.
embedding_df['embedding'] = embedding_df['embedding'].apply(np.array)

embedding_df

In [None]:
# Topic assignments for each edition (';'-separated files).
topic_files = [
    'data/proj2022_topics_gpt4-turbo.csv',
    'data/proj2024_topics_gpt4-turbo.csv',
]
class2022, class2024 = (pd.read_csv(path, sep=";") for path in topic_files)

pclass = pd.concat([class2022, class2024])

# Left join: every embedding row is kept, with its topic columns where found.
# NOTE(review): the join key is project_id alone — confirm ids do not repeat
# across the 2022 and 2024 editions, otherwise rows would be duplicated.
t1 = pd.merge(left=embedding_df, right=pclass, on='project_id', how="left")
t1

In [None]:
from sklearn.decomposition import PCA

# Stack the per-row embedding vectors of t1 into one 2-D array.
embeddings_array = np.array(t1.embedding.values.tolist())
print(embeddings_array.shape)

# random_state pins the projection if scikit-learn selects a randomized SVD
# solver, and matches the seed used for t-SNE and K-means in this notebook.
pca_model = PCA(n_components=2, random_state=42)
pca_model.fit(embeddings_array)

# Coordinates of every row on the first two principal components.
pca_embeddings_values = pca_model.transform(embeddings_array)
print(pca_embeddings_values.shape)

In [None]:
import plotly.express as px
import plotly

# Scatter of the two PCA components, coloured by assigned topic ('out').
first_component = pca_embeddings_values[:, 0]
second_component = pca_embeddings_values[:, 1]

fig = px.scatter(
    x=first_component,
    y=second_component,
    color=t1['out'].values,
    hover_name=t1['project_name'].values,
    title='OpenAI 3 - Large Embeddings Model. Projects 2022 / 2024',
    width=800,
    height=600,
    color_discrete_sequence=plotly.colors.qualitative.Alphabet_r,
)

axis_titles = {'xaxis_title': 'first component', 'yaxis_title': 'second component'}
fig.update_layout(**axis_titles)
fig.show()

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE projection of the same embedding matrix, with a fixed seed.
tsne_model = TSNE(n_components=2, random_state=42)
tsne_embeddings_values = tsne_model.fit_transform(embeddings_array)

# Scatter options collected first; points are coloured by assigned topic.
# No `size` is passed, so size_max is carried along without a size column.
scatter_options = dict(
    x=tsne_embeddings_values[:, 0],
    y=tsne_embeddings_values[:, 1],
    color=t1['out'].values,
    hover_name=t1['project_name'].values,
    size_max=30,
    title='t-SNE embeddings. Proj 2022 / 2024',
    width=800,
    height=600,
    color_discrete_sequence=plotly.colors.qualitative.Alphabet_r,
)
fig = px.scatter(**scatter_options)

fig.update_layout(xaxis_title='first component', yaxis_title='second component')
fig.show()

#### 6. Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import tqdm

# The silhouette score is only defined for 2 .. n_samples - 1 clusters, so the
# scan stops at 50 or at n_samples - 1, whichever is smaller.
max_k = min(50, len(embeddings_array) - 1)

# NOTE(review): K-means groups by Euclidean distance while the score below
# uses cosine distance — confirm this mix is intended.
silhouette_scores = []
for k in tqdm.tqdm(range(2, max_k + 1)):
    kmeans = KMeans(n_clusters=k,
                    random_state=42,
                    n_init='auto').fit(embeddings_array)
    kmeans_labels = kmeans.labels_
    silhouette_scores.append(
        {
            'k': k,
            'silhouette_score': silhouette_score(
                embeddings_array, kmeans_labels, metric='cosine'),
        }
    )

# One line: silhouette score as a function of k.
fig = px.line(pd.DataFrame(silhouette_scores).set_index('k'),
              title='<b>Silhouette scores for K-means clustering</b>',
              labels={'value': 'silhouette score'},
              color_discrete_sequence=plotly.colors.qualitative.Alphabet)
fig.update_layout(showlegend=False)

In [None]:
# Final clustering with 6 clusters and the same seed as the scan above.
kmeans = KMeans(n_clusters=6, random_state=42, n_init='auto')
kmeans.fit(embeddings_array)

# Cluster id assigned to each row of embeddings_array.
kmeans_labels = kmeans.labels_

In [None]:
# 2-D PCA projection, used only to draw the clusters. random_state pins the
# result if a randomized solver is selected, matching the seeds used for
# t-SNE and K-means.
# To plot on the t-SNE projection instead, pass tsne_embeddings_values[:, 0]
# and tsne_embeddings_values[:, 1] as x and y.
pca_model = PCA(n_components=2, random_state=42)
pca_model.fit(embeddings_array)
pca_embeddings_values = pca_model.transform(embeddings_array)

# Human-readable label per row, e.g. "cluster 3".
cluster_names = [f'cluster {label}' for label in kmeans_labels]

fig = px.scatter(
    x=pca_embeddings_values[:, 0],
    y=pca_embeddings_values[:, 1],
    color=cluster_names,
    hover_name=t1.project_name.values,
    size_max=30,
    title='Clusters over Large Embedding Model. Projects 2022 / 2024',
    width=800,
    height=600,
    color_discrete_sequence=plotly.colors.qualitative.Alphabet_r,
)
fig.update_layout(
    xaxis_title='first component',
    yaxis_title='second component')
fig.show()

In [None]:
# Attach the cluster label of each row, e.g. "cluster 3".
t1['cluster'] = [f'cluster {label}' for label in kmeans_labels]

# Rows = clusters, columns = topics ('out'), cells = number of projects.
# astype(int) replaces the deprecated DataFrame.applymap(int).
cluster_stats_df = t1.pivot_table(
    index='cluster', values='project_id',
    aggfunc='count', columns='out').fillna(0).astype(int)

# Turn counts into row percentages: each cluster's row sums to 100.
row_totals = cluster_stats_df.sum(axis=1)
cluster_stats_df = (100 * cluster_stats_df).div(row_totals, axis=0)

# x carries the pivot columns (topics) and y the pivot index (clusters), so
# the axis labels are assigned accordingly.
fig = px.imshow(
    cluster_stats_df.values,
    x=cluster_stats_df.columns,
    y=cluster_stats_df.index,
    text_auto='.2f', aspect="auto",
    labels=dict(x="fact topic", y="cluster", color="share, %"),
    color_continuous_scale='pubugn',
    title='<b>Share of topics in each cluster</b>', height=550)

fig.show()

In [None]:
t1[(t1.out=='Accessibilité et Mobilité Améliorée') & (t1.cluster=='cluster 1')]

In [None]:
#t1.filter(['project_id', 'cluster']).to_csv('proj2022_cluster.csv', sep=";", index=False)

#### Create a Project Vector Dataframe
Create a project Vector with: Cost, District, Topic Cluster, PCA - Embedding

In [None]:
# 2022 projects, ordered from most to least voted.
ranking_columns = ['project_id', 'cost', 'district_n', 'votes']
t1 = df22.filter(ranking_columns)
t1 = t1.sort_values(by='votes', ascending=False)
t1 = t1.reset_index(drop=True)

# The index is now the 0-based position in the vote ordering.
t1['ranking'] = t1.index + 1

t1

In [None]:
## Semantic Embeddings
import ast

# 2022 embeddings; the vectors are stored as text and parsed with literal_eval.
embedding2022 = pd.read_csv('data/proj2022_embeddings_openai-3-large.csv')
embedding2022 = embedding2022.assign(
    embedding=embedding2022['embedding'].apply(ast.literal_eval)
)

# Left join: every ranked project is kept, with its embedding where found.
vectors_2022 = embedding2022.filter(['project_id', 'embedding'])
t2 = t1.merge(vectors_2022, on='project_id', how="left")

In [None]:
t2

In [None]:
from sklearn.decomposition import PCA

# Stack the per-row embedding vectors of t2 into one 2-D array.
# NOTE(review): t2 is a left join, so a project without a stored embedding
# would carry NaN here — confirm every 2022 project has one.
embeddings_array = np.array(t2.embedding.values.tolist())
print(embeddings_array.shape)

# random_state pins the projection if scikit-learn selects a randomized SVD
# solver, and matches the seed used for t-SNE and K-means in this notebook.
pca_model = PCA(n_components=2, random_state=42)
pca_model.fit(embeddings_array)

# Coordinates of every row on the first two principal components.
pca_embeddings_values = pca_model.transform(embeddings_array)
print(pca_embeddings_values.shape)


In [None]:
## District dummies and min-max normalised cost
# The dummies are built from t2 itself, so they line up row by row with the
# frame they are stored in.
dist_dummies = pd.get_dummies(t2['district_n'], prefix='dist_').astype(int)

# Min-max scaling of cost into [0, 1].
# NOTE(review): produces NaN for every row if all costs are equal.
cost_min, cost_max = t2['cost'].min(), t2['cost'].max()
t2['cost_n'] = (t2['cost'] - cost_min) / (cost_max - cost_min)

# One 0/1 list per project, one position per district.
t2['dist_v'] = dist_dummies.values.tolist()
t2

In [None]:
np.concatenate([t2.filter(['cost']).values,pca_embeddings_values], axis = 1)

In [None]:
t2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# All embeddings (both editions) as one 2-D array.
X = np.array(embedding_df['embedding'].tolist())

# Full PCA: every component is kept so the whole variance curve can be drawn.
pca = PCA()
pca.fit(X)

# Cumulative share of variance explained by the first k components.
explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..n so the x value is the actual number of components
# (plotting the array alone would start the axis at 0).
component_counts = np.arange(1, len(explained_variance_ratio) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, explained_variance_ratio, marker='o')
plt.xlabel('Número de componentes')
plt.ylabel('Varianza explicada acumulada')
plt.title('Varianza explicada acumulada por número de componentes PCA')
plt.grid(True)
# Reference lines at 90% and 95% cumulative explained variance.
plt.axhline(y=0.90, color='r', linestyle='--', label='90% varianza explicada')
plt.axhline(y=0.95, color='g', linestyle='--', label='95% varianza explicada')
plt.legend()
plt.show()


In [None]:
# Reduce the embeddings to 150 principal components. random_state pins the
# result if scikit-learn selects a randomized SVD solver, matching the seed
# used for t-SNE and K-means in this notebook.
pca = PCA(n_components=150, random_state=42)
X_reduced = pca.fit_transform(X)

X_reduced.shape