In [None]:
import pandas as pd
import numpy as np
import pycountry
import re
import ast
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from datetime import datetime
from gensim.test.utils import get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import country_converter as coco
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer


## Clean and prepare the data

In [None]:
# Load the UN General Debates corpus; the first CSV column is used as the index.
data=pd.read_csv('un_general_debates_extended.csv', index_col=0)


In [None]:
# Map each speech's short country name to a UN region label via country_converter.
cc = coco.CountryConverter()
country_name_short=data['country_name_short']
# NOTE(review): one name per row is converted; presumably unmatched names come back
# as a placeholder string rather than NaN — confirm against the coco documentation.
un_region_names = cc.convert(names = country_name_short, to = 'UNregion')

# Store the region labels as a new column, row-aligned with `data`.
data.loc[:,'UN_region']=un_region_names

In [None]:
# Lower-case the raw speech text so that later matching is case-insensitive.
data.loc[:,'text'] = data['text'].str.lower()
# Strip markup, bracketed annotations and unusual symbols from a speech
def clean(s):
    """Return `s` with markup and non-standard characters replaced by spaces.

    Steps, in order (structured patterns first, because the final whitelist
    erases the `<`, `[`, `]` and backslash characters they rely on):
      1. tags such as ``<br>`` or ``<b>``;
      2. bracketed annotations such as ``[applause]``;
      3. literal escape sequences of the form backslash + ``u`` + 4 characters;
      4. every character outside ``A-Za-z0-9(),*!?'`` and the backtick.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        Cleaned text with leading/trailing whitespace stripped. Each removed
        span is replaced by one space, so runs of spaces may remain.
    """
    # Remove any tags (the previous pattern `<.?>` only matched tags holding
    # at most one character, so real tags such as <br> were never removed)
    cleaned = re.sub(r"<[^<>]*>", " ", s)
    # Remove things between brackets
    cleaned = re.sub(r"\[.*?\]", " ", cleaned)
    # Remove literal unicode escapes left in the text
    cleaned = re.sub(r"\\u.{4}", " ", cleaned)
    # Keep only regular chars
    cleaned = re.sub(r"[^A-Za-z0-9(),*!?'`]", " ", cleaned)

    return cleaned.strip()

# Run the cleaner over every speech
data.loc[:, 'text'] = data['text'].apply(clean)

In [None]:
# Keep only speeches with a known year. Take an explicit copy so that the
# column assignments made in later cells operate on an independent frame
# instead of a filtered view (avoids pandas' SettingWithCopyWarning).
data = data[data['year'].notnull()].copy()

# # drop session column -- provides no information
# data = data.drop(['session'], axis=1)

In [None]:
from nltk.tokenize import ToktokTokenizer
import string
from sklearn.feature_extraction import text
from functools import reduce
import pandas as pd
import unicodedata
import sys

#====================================================================================#
#
# Description:
# A script to preprocess political texts, with procedural stop word removal.
# For more information, see www.github.com/lrheault/partyembed
#
# Usage:
# python3 preprocess.py [USA/Canada/UK]
#
# @author: L. Rheault
#
#====================================================================================#


# Shared tokenizer instance used by tokenize_text.
tk = ToktokTokenizer()
# Contraction -> expansion lookup; tokenize_text applies every pair with
# str.replace before punctuation is stripped.
contractions = {"you'd": 'you would', "he'd": 'he would', "she's": 'she is', "where'd": 'where did', "might've": 'might have', "he'll": 'he will', "they'll": 'they will',  "mightn't": 'might not', "you'd've": 'you would have', "shan't": 'shall not', "it'll": 'it will', "mayn't": 'may not', "couldn't": 'could not', "they'd": 'they would', "so've": 'so have', "needn't've": 'need not have', "they'll've": 'they will have', "it's": 'it is', "haven't": 'have not', "didn't": 'did not', "y'all'd": 'you all would', "needn't": 'need not', "who'll": 'who will', "wouldn't've": 'would not have', "when's": 'when is', "will've": 'will have', "it'd've": 'it would have', "what'll": 'what will', "that'd've": 'that would have', "y'all're": 'you all are', "let's": 'let us', "where've": 'where have', "o'clock": 'oclock', "when've": 'when have', "what're": 'what are', "should've": 'should have', "you've": 'you have', "they're": 'they are', "aren't": 'are not', "they've": 'they have', "it'd": 'it would', "i'll've": 'i will have', "they'd've": 'they would have', "you'll've": 'you will have', "wouldn't": 'would not', "we'd": 'we would', "hadn't've": 'had not have', "weren't": 'were not', "i'd": 'i would', "must've": 'must have', "what's": 'what is', "mustn't've": 'must not have', "what'll've": 'what will have', "ain't": 'aint', "doesn't": 'does not', "we'll": 'we will', "i'd've": 'i would have', "we've": 'we have', "oughtn't": 'ought not', "you're": 'you are', "who'll've": 'who will have', "shouldn't": 'should not', "can't've": 'cannot have', "i've": 'i have', "couldn't've": 'could not have', "why've": 'why have', "what've": 'what have', "can't": 'cannot', "don't": 'do not', "that'd": 'that would', "who's": 'who is', "would've": 'would have', "there'd": 'there would', "shouldn't've": 'should not have', "y'all": 'you all', "mustn't": 'must not', "she'll": 'she will', "hadn't": 'had not', "won't've": 'will not have', "why's": 'why is', "'cause": 'because', "wasn't": 'was not', "shan't've": 
'shall not have', "ma'am": 'madam', "hasn't": 'has not', "to've": 'to have', "how'll": 'how will', "oughtn't've": 'ought not have', "he'll've": 'he will have', "we'd've": 'we would have', "won't": 'will not', "could've": 'could have', "isn't": 'is not', "she'll've": 'she will have', "we'll've": 'we will have', "you'll": 'you will', "who've": 'who have', "there's": 'there is', "y'all've": 'you all have', "we're": 'we are', "i'll": 'i will', "i'm": 'i am', "how's": 'how is', "she'd've": 'she would have', "sha'n't": 'shall not', "there'd've": 'there would have', "he's": 'he is', "it'll've": 'it will have', "that's": 'that is', "y'all'd've": 'you all would have', "he'd've": 'he would have', "how'd": 'how did', "where's": 'where is', "so's": 'so as', "she'd": 'she would', "mightn't've": 'might not have'}


In [None]:
# Corpus-specific stop words on top of scikit-learn's English list.
# NOTE(review): the multi-word entries ('united nations', 'security council', ...)
# can never equal a single token produced by tokenize_text, so they currently
# have no effect — confirm whether phrase-level removal was intended.
un_stopwords = ['nations', 'year', 'believe', 'important', 'assembly', 'secretary',
                'conference', 'like', 'way', 'state', 'resolution',
                'government', 'make', 'role',
                'united nation', 'united nations', 'general assembly', 'republic of', 'secretary general', 'the world',
                'ibid', 'security council', 'member state', 'country', 'must', 'many'] + list(text.ENGLISH_STOP_WORDS)

def tokenize_text(text, stopwords=un_stopwords):
    """Tokenize one speech and return its lemmatized content words.

    Parameters
    ----------
    text : str
        Speech text. (The parameter shadows the sklearn `text` module inside
        this function only.)
    stopwords : iterable of str
        Tokens to discard; defaults to `un_stopwords`.

    Returns
    -------
    list of str
        WordNet lemmas of the tokens that are longer than 3 characters, are
        not stop words and are not purely numeric. Stop words are removed
        before lemmatization, so a lemma may itself be a stop word.
    """
    # Set membership is O(1); the default list would otherwise be scanned
    # once per token.
    stopword_set = stopwords if isinstance(stopwords, (set, frozenset)) else set(stopwords)

    # Expand contractions first, while the apostrophes are still present
    text = reduce(lambda a, kv: a.replace(*kv), contractions.items(), text.lower())
    # Replace every punctuation character with a space
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    tokens = [token for token in tk.tokenize(text)
              if len(token) > 3 and token not in stopword_set and not token.isdigit()]

    # Remove inflectional endings and get the root word (lemma)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

data['clean_text'] = data['text'].apply(tokenize_text)


In [None]:
from gensim.models.phrases import Phrases, Phraser
from gensim.utils import simple_preprocess

# Create a Phrases model to detect common bigrams
bigram_model = Phrases(data['clean_text'].values, min_count=3, threshold=2)

# Convert the bigrams into single tokens
phraser = Phraser(bigram_model)
processed_sentences = [phraser[sentence] for sentence in data['clean_text'].values]

# Show only a short preview: printing every tokenized speech floods the
# notebook output and bloats the saved file.
print([sentence[:20] for sentence in processed_sentences[:3]])

In [None]:
# NOTE(review): leftover debugging output; it does not feed into the analysis.
print('hi')

In [None]:
def create_tag(row):
    """Build the ``<region>_<before|after>`` tag for one speech.

    `row` must support ``row["UN_region"]`` (a string) and ``row["year"]``.
    Spaces in the region name become underscores; years strictly below 1990
    are labelled ``before``, all others ``after``.
    """
    region_slug = row["UN_region"].replace(' ', '_')
    period = 'before' if row["year"] < 1990 else 'after'
    return f"{region_slug}_{period}"

# Tag every speech row with its region / cold-war period label
data["tag"] = data.apply(create_tag, axis=1)

In [None]:
# Concatenate the token lists of all speeches given in the same year.
# A flat comprehension is linear in the number of tokens, whereas sum() over
# lists re-copies the growing accumulator for every speech.
grouped_df = (
    data.groupby('year')['clean_text']
    .apply(lambda token_lists: [token for tokens in token_lists for token in tokens])
    .reset_index()
)
dict_tag_grouped_text = dict(zip(grouped_df['year'], grouped_df['clean_text']))
data.loc[:, 'clean_text_merged'] = data['year'].map(lambda x: dict_tag_grouped_text.get(x, None))

# One row per year (first occurrence kept). Built without inplace=True so that
# data_year is an independent frame rather than a mutated slice of `data`.
data_year = (
    data[['year', 'clean_text_merged']]
    .drop_duplicates(subset=['year'], keep='first')
    .reset_index(drop=True)
)

In [None]:
# Display the per-year frame (columns: year, clean_text_merged).
data_year

In [None]:
# grouped_df=data.groupby('tag')['clean_text'].apply(sum).reset_index()
# dict_tag_grouped_text = dict(zip(grouped_df['tag'], grouped_df['clean_text']))
# data.loc[:,'clean_text_merged'] = data['tag'].map(lambda x: dict_tag_grouped_text.get(x, None))
# data=data[['year', 'clean_text', 'tag', 'UN_region', 'clean_text_merged']]
# data.drop_duplicates(subset=['tag'], keep='first', inplace=True)
# data.reset_index(drop=True, inplace=True)
# data

In [None]:
from collections import Counter

# Tally how often each token occurs across all yearly documents
word_counts = Counter(
    token
    for yearly_tokens in data_year['clean_text_merged']
    for token in yearly_tokens
)

# Vocabulary to retain: tokens seen at least three times corpus-wide
words_to_keep = {token for token, occurrences in word_counts.items() if not occurrences < 3}

def filter_words(text_list):
    """Return the tokens of `text_list` that are in `words_to_keep`, in their original order."""
    return list(filter(lambda token: token in words_to_keep, text_list))

# Drop the rare tokens from every yearly document
data_year['clean_text_merged'] = data_year['clean_text_merged'].apply(filter_words)


In [None]:
# One TaggedDocument per year: the merged token list, tagged with that year.
documents = [TaggedDocument(data_year['clean_text_merged'].iloc[k], [data_year['year'].iloc[k]]) for k in tqdm(range(len(data_year)))]

# Train Doc2Vec on the yearly documents.
# NOTE(review): no seed is passed and workers=10, so the vectors presumably differ
# between runs — confirm whether reproducibility is required.
model = Doc2Vec(documents, workers=10, vector_size=300, min_count=3, window=10, epochs=5)
# Persist the trained model, then reload it from the same file.
fname = f"doc2vec_whole_corpus_year"
model.save(fname)
model = Doc2Vec.load(fname)

In [None]:
# Work on a copy: plain assignment only aliases data_year, so adding the
# embedding column would silently modify data_year as well.
data_embed = data_year.copy()
# Every document was tagged with its year, so the year is the lookup key into model.dv
data_embed['text_embedding'] = data_embed['year'].progress_apply(lambda x: model.dv[x])
data_embed.to_csv('df_text_embedding_year.csv')
#data_embed['clean_text_merged']=data_embed['clean_text_merged'].apply(lambda lst: [item for item in lst if item in set(model.wv.index_to_key)])

In [None]:
# data_embed = data[['tag']].drop_duplicates()
# data_embed[f'text_embedding'] = data_embed['tag'].progress_apply(lambda x: model.dv[x])
# data_embed['clean_text']=data['clean_text']
# data_embed['UN_region']=data['UN_region']
# data_embed['before/after cold war']=data['year'].apply(lambda x: 'before' if x<1990 else 'after')
# data_embed.to_csv(f'df_text_embedding_un_region_cold_war.csv')

In [None]:
# Load the region / cold-war embeddings from disk.
# NOTE(review): the cell that writes this CSV is commented out in this notebook, so the
# file presumably has to exist from an earlier run — confirm before Restart & Run All.
data_embed=pd.read_csv('df_text_embedding_un_region_cold_war.csv', index_col=0)
def add_comas(row):
    """Turn the stored text form of an embedding into a parseable list literal.

    The CSV holds each vector as whitespace-separated numbers inside square
    brackets, possibly wrapped over several lines, e.g. ``"[ 0.1 -0.2\\n 0.3]"``.
    This returns ``"[0.1, -0.2, 0.3]"``, which `ast.literal_eval` accepts.

    Only the enclosing brackets are removed. Slicing off a fixed two
    characters would drop the minus sign (``"[-0.1 ..."``) or the first digit
    (``"[0.1 ..."``) whenever no padding space follows the bracket.

    Parameters
    ----------
    row : mapping
        Must provide ``row['text_embedding']`` as a string.

    Returns
    -------
    str
        Comma-separated list literal.
    """
    raw = row['text_embedding'].strip()
    if raw.startswith('['):
        raw = raw[1:]
    if raw.endswith(']'):
        raw = raw[:-1]
    # Treat commas as separators too, so already comma-separated input stays valid
    values = raw.replace(',', ' ').split()
    return '[' + ', '.join(values) + ']'

# Re-format the stored embedding strings, then parse both list-valued columns
data_embed['text_embedding'] = data_embed.apply(add_comas, axis=1)
for list_column in ('text_embedding', 'clean_text'):
    data_embed[list_column] = data_embed[list_column].apply(ast.literal_eval)


In [None]:
# Plot colour (matplotlib colour name) for each UN region label.
REGION_COL = {
    'Australia and New Zealand': 'blue',
    'Caribbean': 'green',
    'Central America': 'red',
    'Central Asia': 'orange',
    'Eastern Africa': 'purple',
    'Eastern Asia': 'cyan',
    'Eastern Europe': 'magenta',
    'Melanesia': 'yellow',
    'Micronesia': 'lime',
    'Middle Africa': 'pink',
    'Northern Africa': 'brown',
    'Northern America': 'teal',
    'Northern Europe': 'olive',
    'Polynesia': 'navy',
    'South America': 'maroon',
    'South-eastern Asia': 'indigo',
    'Southern Africa': 'gold',
    'Southern Asia': 'tan',
    'Southern Europe': 'skyblue',
    'Western Africa': 'violet',
    'Western Asia': 'crimson',
    'Western Europe': 'salmon'
}


## Unguided analysis (PCA)

In [None]:
# Reload the per-year embeddings written by the export cell
data_embed = pd.read_csv('df_text_embedding_year.csv', index_col=0)

# The CSV stores the vectors and token lists as text: re-format the vectors,
# then parse both columns back into Python lists
data_embed['text_embedding'] = data_embed.apply(add_comas, axis=1)
for list_column in ('text_embedding', 'clean_text_merged'):
    data_embed[list_column] = data_embed[list_column].apply(ast.literal_eval)

In [None]:
# Display the reloaded per-year embedding frame.
data_embed

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


# Set the figure size
fig, ax = plt.subplots(figsize=(10, 6))

def perform_pca(data_embed):
    """Standardise the embeddings and fit a 2-component PCA on them.

    Fits the module-level `scaler` as a side effect, so that `project_pca`
    can reuse the same scaling afterwards.

    Parameters
    ----------
    data_embed : DataFrame
        Must contain a 'text_embedding' column of equal-length vectors.

    Returns
    -------
    (PCA, ndarray)
        The fitted PCA object and the projected rows, shape (n_rows, 2).
    """
    # Convert the embeddings to a 2-D numpy array
    X = np.array(data_embed['text_embedding'].tolist())

    # Standardise the data
    X_scaled = scaler.fit_transform(X)

    # Perform PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    return pca, X_pca

# Fit the scaler and the PCA on the full set of yearly embeddings
pca_main, X_pca_main = perform_pca(data_embed)

def project_pca(df_subset):
    """Project the embeddings of `df_subset` with the already fitted scaler and PCA.

    Returns an array of shape (n_rows, 2).
    """
    X_subset = np.array(df_subset['text_embedding'].tolist())
    X_scaled_subset = scaler.transform(X_subset)
    return pca_main.transform(X_scaled_subset)

pca_comp_1 = {}
pca_comp_2 = {}
# Sorted so that the legend is in chronological order
for year in sorted(set(data_embed['year'].values)):
    # Rows for the current year. The subset gets its own name: calling it
    # `data` would overwrite the corpus DataFrame used by other cells.
    data_subset = data_embed.loc[data_embed['year'] == year].reset_index()

    # 2-D projection of the embeddings for the current year
    projections_2D = project_pca(data_subset)
    # Store plain floats (not 1-element arrays) so the mapped columns are numeric
    pca_comp_1[year] = float(projections_2D[:, 0].mean())
    pca_comp_2[year] = float(projections_2D[:, 1].mean())
    # Scatter the points of the current year
    ax.scatter(projections_2D[:, 0], projections_2D[:, 1], alpha=0.7, label=year)

data_embed['PCA_component1'] = data_embed['year'].map(pca_comp_1)
data_embed['PCA_component2'] = data_embed['year'].map(pca_comp_2)

ax.set_title('PCA of Embedded Speeches', fontsize=14)

# Set the font size of the axis labels
ax.set_xlabel('Component 1', fontsize=12)
ax.set_ylabel('Component 2', fontsize=12)

# Add a legend
ax.legend()
# NOTE(review): the file name says "by_region" but the points are grouped by year.
plt.savefig('PCA_data_by_region.png')
# Show the figure
plt.show()


In [None]:
# Set the figure size
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data_embed['year'],data_embed['PCA_component1'])
ax.set_title('Component 1 of PCA throughout the years (whole corpus)', fontsize=14)

# Set the font size of the axis labels
ax.set_xlabel('year', fontsize=12)
ax.set_ylabel('Component 1', fontsize=12)
plt.show()

In [None]:
# Set the figure size
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data_embed['year'],data_embed['PCA_component2'])
ax.set_title('Component 2 of PCA throughout the years (whole corpus)', fontsize=14)

# Set the font size of the axis labels
ax.set_xlabel('year', fontsize=12)
ax.set_ylabel('Component 2', fontsize=12)
plt.show()

In [None]:
# Calculate the mean values of component 1 and component 2
mean_component1 = np.mean(X_pca_main[:, 0])
mean_component2 = np.mean(X_pca_main[:, 1])

# Row positions of the documents in the bottom left quadrant
bottom_left_indices = np.where((X_pca_main[:, 0] < mean_component1) & (X_pca_main[:, 1] < mean_component2))[0]

# Unique vocabulary of those documents
words_bottom_left = data_embed.iloc[bottom_left_indices]['clean_text_merged'].values

flattened_words_bottom_left = [word for sublist in words_bottom_left for word in sublist]
words_bottom_left=list(set(flattened_words_bottom_left))

# Reference corner: the bottom left extreme of the PCA plot
min_component1 = np.min(X_pca_main[:, 0])
min_component2 = np.min(X_pca_main[:, 1])

bottom_left_point = (min_component1, min_component2)

# Distance of every document from that corner
distances_from_bottom_left = np.sqrt((X_pca_main[:, 0] - bottom_left_point[0])**2 + (X_pca_main[:, 1] - bottom_left_point[1])**2)

# The distances are per document, not per word, so they cannot be zipped with
# the unique-word list (different length, and set order is arbitrary).
# Instead rank the quadrant's documents nearest-corner first and list their
# words in that order, keeping the first occurrence of each word.
bottom_left_ranked = bottom_left_indices[np.argsort(distances_from_bottom_left[bottom_left_indices], kind='stable')]
sorted_words_bottom_left = list(dict.fromkeys(
    word
    for doc_position in bottom_left_ranked
    for word in data_embed['clean_text_merged'].iloc[doc_position]
))

In [None]:
# Row positions of the documents in the top left quadrant
top_left_indices = np.where((X_pca_main[:, 0] < mean_component1) & (X_pca_main[:, 1] > mean_component2))[0]

# Unique vocabulary of those documents
words_top_left = data_embed.iloc[top_left_indices]['clean_text_merged'].values
# Flatten the list of lists
flattened_words_top_left = [word for sublist in words_top_left for word in sublist]

# Get unique words
words_top_left = list(set(flattened_words_top_left))

# Reference corner: smallest component 1, LARGEST component 2
# (np.min here would place the corner at the bottom left instead)
min_component1 = np.min(X_pca_main[:, 0])
max_component2 = np.max(X_pca_main[:, 1])

top_left_point = (min_component1, max_component2)

# Distance of every document from that corner
distances_from_top_left = np.sqrt((X_pca_main[:, 0] - top_left_point[0])**2 + (X_pca_main[:, 1] - top_left_point[1])**2)

# The distances are per document, not per word, so they cannot be zipped with
# the unique-word list (different length, and set order is arbitrary).
# Instead rank the quadrant's documents nearest-corner first and list their
# words in that order, keeping the first occurrence of each word.
top_left_ranked = top_left_indices[np.argsort(distances_from_top_left[top_left_indices], kind='stable')]
sorted_words_top_left = list(dict.fromkeys(
    word
    for doc_position in top_left_ranked
    for word in data_embed['clean_text_merged'].iloc[doc_position]
))

In [None]:
# Row positions of the documents in the top right quadrant
top_right_indices = np.where((X_pca_main[:, 0] > mean_component1) & (X_pca_main[:, 1] > mean_component2))[0]

# Unique vocabulary of those documents
words_top_right = data_embed.iloc[top_right_indices]['clean_text_merged'].values

# Flatten the list of lists
flattened_words_top_right = [word for sublist in words_top_right for word in sublist]

# Get unique words
words_top_right = list(set(flattened_words_top_right))

# Reference corner: the top right extreme of the PCA plot
max_component1 = np.max(X_pca_main[:, 0])
max_component2 = np.max(X_pca_main[:, 1])

top_right_point = (max_component1, max_component2)

# Distance of every document from that corner
distances_from_top_right = np.sqrt((X_pca_main[:, 0] - top_right_point[0])**2 + (X_pca_main[:, 1] - top_right_point[1])**2)

# The distances are per document, not per word, so they cannot be zipped with
# the unique-word list (different length, and set order is arbitrary).
# Instead rank the quadrant's documents nearest-corner first and list their
# words in that order, keeping the first occurrence of each word.
top_right_ranked = top_right_indices[np.argsort(distances_from_top_right[top_right_indices], kind='stable')]
sorted_words_top_right = list(dict.fromkeys(
    word
    for doc_position in top_right_ranked
    for word in data_embed['clean_text_merged'].iloc[doc_position]
))

In [None]:
# Row positions of the documents in the bottom right quadrant
bottom_right_indices = np.where((X_pca_main[:, 0] > mean_component1) & (X_pca_main[:, 1] < mean_component2))[0]

# Unique vocabulary of those documents
words_bottom_right = data_embed.iloc[bottom_right_indices]['clean_text_merged'].values

# Flatten the list of lists
flattened_words_bottom_right = [word for sublist in words_bottom_right for word in sublist]

# Get unique words
words_bottom_right = list(set(flattened_words_bottom_right))

# Reference corner: the bottom right extreme of the PCA plot
max_component1 = np.max(X_pca_main[:, 0])
min_component2 = np.min(X_pca_main[:, 1])

bottom_right_point = (max_component1, min_component2)

# Distance of every document from that corner
distances_from_bottom_right = np.sqrt((X_pca_main[:, 0] - bottom_right_point[0])**2 + (X_pca_main[:, 1] - bottom_right_point[1])**2)

# The distances are per document, not per word, so they cannot be zipped with
# the unique-word list (different length, and set order is arbitrary).
# Instead rank the quadrant's documents nearest-corner first and list their
# words in that order, keeping the first occurrence of each word.
bottom_right_ranked = bottom_right_indices[np.argsort(distances_from_bottom_right[bottom_right_indices], kind='stable')]
sorted_words_bottom_right = list(dict.fromkeys(
    word
    for doc_position in bottom_right_ranked
    for word in data_embed['clean_text_merged'].iloc[doc_position]
))

In [None]:
# Print the first five entries of each quadrant's sorted word list.
print("Words closest to the bottom left quadrant of PCA plot:", sorted_words_bottom_left[:5])
print("Words closest to the bottom right quadrant of PCA plot:", sorted_words_bottom_right[:5])
print("Words closest to the top right quadrant of PCA plot:", sorted_words_top_right[:5])
print("Words closest to the top left quadrant of PCA plot:", sorted_words_top_left[:5])


## Guided analysis

In [None]:
tokenized_corpus = data_embed['clean_text_merged'].values

# Seed words

territorial_terms = ['bloc', 'region', 'civilisation', 'country', 'nation']
functional_terms = ['class', 'ideology', 'inequality', 'group', 'social']

seed_words = territorial_terms + functional_terms


def _mean_seed_vector(words):
    """Average the word vectors of the seed words found in the model vocabulary.

    The mean is taken over the words actually found, so words missing from
    the vocabulary do not shrink the result.

    Raises
    ------
    ValueError
        If none of `words` is in the vocabulary.
    """
    in_vocab = [word for word in words if word in model.wv.key_to_index]
    if not in_vocab:
        raise ValueError(f"None of the seed words {words} is in the model vocabulary")
    return np.mean([model.wv[word] for word in in_vocab], axis=0)


# Calculate average vector for seed words.
# These use the seed lists defined in this cell: filtered_TERR_FUNC_LEXICON is
# only created in a later cell, so referencing it here fails on a fresh kernel.
territorial_seed_vectors = _mean_seed_vector(territorial_terms)
functional_seed_vectors = _mean_seed_vector(functional_terms)

In [None]:
def cosine_similarity_row(row, ref_vector):
    """Mean cosine similarity between a document's words and a reference vector.

    Parameters
    ----------
    row : iterable of str
        Tokens of one document; tokens missing from the model vocabulary
        are ignored.
    ref_vector : array-like
        Reference vector with the same dimensionality as the word vectors.

    Returns
    -------
    float
        Average similarity over the known tokens, or NaN when no token of
        `row` is in the vocabulary.
    """
    # key_to_index is a dict: O(1) membership, whereas `in index_to_key` scans
    # the whole vocabulary list for every token
    known_words = [word for word in row if word in model.wv.key_to_index]
    if not known_words:
        return np.nan
    word_vectors = np.array([model.wv[word] for word in known_words])
    # Similarity of each word vector to the reference vector, shape (n_words, 1)
    similarities = cosine_similarity(word_vectors, [ref_vector])
    return np.mean(similarities)

In [None]:
# For every yearly document, average the cosine similarity of its words to the
# territorial and to the functional seed vector.

data_embed['territorial_similarity'] = data_embed['clean_text_merged'].progress_apply(lambda x: cosine_similarity_row(x, territorial_seed_vectors))
data_embed['functional_similarity'] = data_embed['clean_text_merged'].progress_apply(lambda x: cosine_similarity_row(x, functional_seed_vectors))

In [None]:
# One point per year for each of the two similarity scores.
plt.scatter(data_embed['year'], data_embed['functional_similarity'], marker='o', label='functionality')
plt.scatter(data_embed['year'], data_embed['territorial_similarity'], marker='o', color='r', label='territoriality')

# Set the plot title
plt.title('Plot of functionality and territoriality depending on years')

# Label the axes
plt.xlabel('year')
plt.ylabel('functionality and territoriality')

plt.legend()
# Display the plot
plt.show()

In [None]:
# Display the frame, now including the two similarity columns.
data_embed

In [None]:
# Calculate semantic similarity between each word in the vocabulary and the seed vectors
vocab_words = model.wv.index_to_key
word_vectors = {word: model.wv[word] for word in vocab_words}

# One matrix call instead of two cosine_similarity calls per vocabulary word.
# Rows of model.wv.vectors are in index_to_key order; column 0 is the
# territorial similarity, column 1 the functional similarity.
seed_matrix = np.vstack([territorial_seed_vectors, functional_seed_vectors])
similarity_matrix = cosine_similarity(model.wv.vectors, seed_matrix)

# word -> (territorial_similarity, functional_similarity)
word_similarities = {
    word: (similarity_pair[0], similarity_pair[1])
    for word, similarity_pair in zip(vocab_words, similarity_matrix)
}

In [None]:
# Rank the vocabulary by territorial similarity (index 0 of each similarity pair)
sorted_word_similarities = sorted(
    word_similarities.items(),
    key=lambda item: item[1][0],
    reverse=True,
)
top_20_words_territorial = sorted_word_similarities[:20]

# Keep only the words of the ten best-ranked entries
top_territorial = [word for word, (_, _) in top_20_words_territorial[:10]]

print("Top 10 words related to high territorial terms:", top_territorial)

In [None]:
# Rank the vocabulary by functional similarity (index 1 of each similarity pair)
sorted_word_similarities = sorted(
    word_similarities.items(),
    key=lambda item: item[1][1],
    reverse=True,
)
top_20_words_functional = sorted_word_similarities[:20]

# Keep only the words of the ten best-ranked entries
top_functional = [word for word, (_, _) in top_20_words_functional[:10]]

print("Top 10 words related to high functional terms:", top_functional)

## Guided projections (following the partyembed GitHub repository)

In [None]:
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances


# Seed words for the two dimensions (same values as in the guided-analysis cell).
territorial_terms = ['bloc', 'region', 'civilisation', 'country', 'nation']
functional_terms = ['class', 'ideology', 'inequality', 'group', 'social']

def topic_vectors(topicwords, model, n=20):
    """Return one centroid vector summarising a list of topic words.

    For each topic word, the word and its `n` most similar words (as returned
    by ``model.wv.most_similar``) are averaged over an ``(n + 1, vector_size)``
    matrix; the per-word centroids are then averaged into one vector of
    length ``model.vector_size``.
    """
    dim = model.vector_size
    per_word_centroids = []

    for seed in topicwords:
        neighbours = [neighbour for neighbour, _ in model.wv.most_similar(seed, topn=n)]
        # Row 0 is the seed itself, the following rows are its neighbours
        block = np.zeros((n + 1, dim))
        for row, token in enumerate([seed] + neighbours):
            block[row, :] = model.wv[token]
        per_word_centroids.append(block.mean(axis=0))

    # Average the centroids of all topic words
    return np.mean(per_word_centroids, axis=0)

def bootstrap_topic_vectors(topicwords, model, n=20, sims=1000, seed=None):
    """Bootstrap centroid vectors for a list of topic words.

    The word pool holds every topic word plus its ``n - 1`` most similar
    words. Each of the `sims` replicates draws `n` words from the pool with
    replacement and averages their vectors.

    Parameters
    ----------
    topicwords : iterable of str
        Topic words; each must be known to ``model.wv.most_similar``.
    model : object
        Provides ``vector_size`` and ``wv`` (``most_similar`` and indexing).
    n : int
        Number of words drawn per replicate.
    sims : int
        Number of bootstrap replicates.
    seed : int or None
        When given, a private ``RandomState(seed)`` makes the replicates
        reproducible. ``None`` (default) keeps the previous behaviour of
        drawing from numpy's global random state.

    Returns
    -------
    ndarray, shape (sims, vector_size)
        One centroid per replicate.
    """
    M = model.vector_size
    rng = np.random if seed is None else np.random.RandomState(seed)
    boot_results = np.zeros((sims, M))

    # Get all topic words to be used in bootstrapping
    topic_words = []
    for topicword in topicwords:
        expanded_word_list = model.wv.most_similar(topicword, topn=n - 1)
        topic_words.extend([topicword] + [w for w, _ in expanded_word_list])

    # Perform bootstrapping to calculate centroids
    for s in range(sims):
        boot_sample = rng.choice(topic_words, size=n, replace=True)
        zsim = np.zeros((n, M))
        for i, w in enumerate(boot_sample):
            zsim[i, :] = model.wv[w]
        boot_results[s, :] = zsim.mean(axis=0)

    return boot_results

def cos_sim(speech, topic, boot=True, sims=1000):
    """Cosine similarity between document vectors and topic vector(s).

    Parameters
    ----------
    speech : ndarray, shape (P, M)
        One row per document.
    topic : ndarray, shape (T, M) or (M,)
        Topic vectors, e.g. bootstrap replicates. A single 1-D vector is
        accepted and treated as one row.
    boot : bool
        If True, summarise the T similarities of each document.
    sims : int
        Unused; kept so existing calls keep working.

    Returns
    -------
    If `boot`: a tuple ``(mean, lower, upper)`` of lists of length P, where
    lower/upper are the 2.5th and 97.5th percentiles over the topic vectors.
    Otherwise: the full similarity matrix as a nested list of shape (P, T).
    """
    # sklearn requires 2-D input; a single topic vector becomes one row
    topic = np.atleast_2d(topic)
    C = cosine_similarity(speech, topic)
    if not boot:
        return C.tolist()
    m = np.mean(C, axis=1)
    ci = np.percentile(C, q=[2.5, 97.5], axis=1)
    return m.tolist(), ci[0].tolist(), ci[1].tolist()

def issue_ownership(model, topicwords, infer_vector=True, t_size=20, boot=True, smooth=True):
    """Compute the per-year similarity to a topic defined by `topicwords`.

    Builds the topic vector(s) from `topicwords` — bootstrap replicates when
    `boot` is true, one centroid otherwise — and passes them to `fit`.

    Raises
    ------
    ValueError
        If `infer_vector` is false.
    """
    if not infer_vector:
        raise ValueError("Either topic words or topic vectors must be provided")

    if boot:
        t = bootstrap_topic_vectors(topicwords, model, n=t_size, sims=1000)
    else:
        t = topic_vectors(topicwords, model, n=t_size)

    return fit(model, t, smooth=smooth, boot=boot)

def fit(model, topic_vector, smooth=False, boot=True):
    """Similarity between each year's document vector and the topic vector(s).

    Reads the notebook-level `data_embed` (for the list of years) and
    `documents` (for the tags the model was trained with).

    Parameters
    ----------
    model : Doc2Vec
        Trained model whose ``dv`` holds one vector per document tag.
    topic_vector : ndarray, shape (T, M) or (M,)
        Bootstrap replicates or a single centroid.
    smooth : bool
        If True, apply a trailing 10-year rolling mean to the similarity.
    boot : bool
        Passed to `cos_sim`; if True the mean over replicates is reported.

    Returns
    -------
    DataFrame
        Columns 'year' and 'similarity', one row per year in ascending order.
    """
    M = model.vector_size
    years = sorted(set(data_embed['year'].values))
    z = np.zeros((len(years), M))

    # Look document vectors up by the TAG they were trained with. The tags are
    # the years themselves, and integer tags index model.dv directly, so the
    # position of a document in `documents` is not a valid key.
    tag_for_year = {tags[0]: tags[0] for _, tags in documents}
    for i, year in enumerate(years):
        tag = tag_for_year.get(year)
        if tag is not None:
            z[i, :] = model.dv[tag]

    # Calculate similarity; a single centroid is handled as one topic row
    topic_matrix = np.atleast_2d(topic_vector)
    C = cos_sim(z, topic_matrix, boot=boot)
    # boot=True: (mean, lower, upper) -> use the mean.
    # boot=False: nested list (P, T) -> average over topic rows.
    similarity = C[0] if boot else np.asarray(C).mean(axis=1).tolist()
    res = pd.DataFrame({'year': years, 'similarity': similarity})

    if smooth:
        res = res.rolling(window=10, center=False).mean()
        # The rolling mean also averaged the year column; restore it
        res['year'] = years

    return res


In [None]:
# Inspect the ten nearest neighbours of 'ideology' in the word-vector space.
sims = model.wv.most_similar('ideology', topn = 10)
sims

In [None]:
# Per-year similarity to the territorial lexicon (bootstrap mean, no smoothing).
# NOTE(review): filtered_TERR_FUNC_LEXICON is assigned in a later cell, so this line
# raises NameError on Restart & Run All unless that cell is moved above this one.
corpus_territorial_sim=issue_ownership(model, topicwords=filtered_TERR_FUNC_LEXICON[0], smooth=False, boot=True)
plt.plot(corpus_territorial_sim['year'], corpus_territorial_sim['similarity'], marker='o')

# Set the plot title
plt.title('Plot of territoriality depending on years')

# Label the axes
plt.xlabel('year')
plt.ylabel('territoriality')

# Display the plot
plt.show()


In [None]:
# Per-year similarity to the functional lexicon (bootstrap mean, no smoothing).
# NOTE(review): filtered_TERR_FUNC_LEXICON is assigned in a later cell, so this line
# raises NameError on Restart & Run All unless that cell is moved above this one.
corpus_functional_sim=issue_ownership(model, topicwords=filtered_TERR_FUNC_LEXICON[1], smooth=False, boot=True)
plt.plot(corpus_functional_sim['year'], corpus_functional_sim['similarity'], marker='o')

# Set the plot title
plt.title('Plot of functionality depending on years')

# Label the axes
plt.xlabel('year')
plt.ylabel('functionality')

# Display the plot
plt.show()

In [None]:
# Gather both yearly similarity series in one frame
corpus_trends = pd.DataFrame({
    'year': corpus_functional_sim['year'],
    'functionality': corpus_functional_sim['similarity'],
    'territoriality': corpus_territorial_sim['similarity'],
})

# Draw the two trends on a shared axis
plt.plot(corpus_trends['year'], corpus_trends['functionality'], marker='o', label='functionality')
plt.plot(corpus_trends['year'], corpus_trends['territoriality'], marker='o', color='r', label='territoriality')

# Title and axis labels
plt.title('Plot of functionality and territoriality depending on years')
plt.xlabel('year')
plt.ylabel('functionality and territoriality')

plt.legend()
# Display the plot
plt.show()

In [None]:
from gensim.models.doc2vec import Doc2Vec
import numpy as np
import pandas as pd
from sklearn import metrics

# Desired orientation: high territoriality on the right, high functionality at the top.
# Additional related words (suggested by ChatGPT) that extend the seed terms.
territorial_related_words = [
    'border', 'countryside', 'landscape', 'province', 'island',
    'peninsula', 'terrain', 'wilderness', 'topography', 'continent',
    'state', 'empire', 'kingdom', 'sovereignty', 'homeland', 'federation',
    'rural', 'urban', 'city',
]
functional_related_words = [
    'globalization', 'capitalism', 'technology', 'economy', 'industry', 'digital',
    'market', 'finance', 'innovation', 'trade',
]

# [territorial lexicon, functional lexicon]
TERR_FUNC_LEXICON = [
    territorial_related_words + territorial_terms,
    functional_related_words + functional_terms,
]

# The same lexicon restricted to words present in the model's word vectors
filtered_TERR_FUNC_LEXICON = [
    [word for word in sublist if word in model.wv]
    for sublist in TERR_FUNC_LEXICON
]

print(filtered_TERR_FUNC_LEXICON)

def linear_projection_1D(pVec, vecXLeft, vecXRight):
    """Project `pVec` onto the axis from the left centroid to the right centroid.

    `vecXLeft` and `vecXRight` are 2-D arrays with one word vector per row.
    Returns the dot product of `pVec` with (mean of right - mean of left).
    """
    vecX = vecXRight.mean(axis=0) - vecXLeft.mean(axis=0)
    return np.dot(pVec, vecX)


def linear_projection_2D(pVec, vecXLeft, vecXRight, vecYDown, vecYUp):
    """Project `pVec` onto two axes defined by pairs of word-vector sets.

    The x axis runs from the centroid of `vecXLeft` to that of `vecXRight`,
    the y axis from `vecYDown` to `vecYUp`. Empty inputs only trigger a
    printed warning; the corresponding coordinate is then NaN.

    Returns a tuple ``(x, y)``.
    """
    if vecXLeft.size == 0 or vecXRight.size == 0:
        print("Warning: vecXLeft or vecXRight is empty.")
    if vecYDown.size == 0 or vecYUp.size == 0:
        print("Warning: vecYDown or vecYUp is empty.")

    vecX = np.nanmean(vecXRight, axis=0) - np.nanmean(vecXLeft, axis=0)
    vecY = np.nanmean(vecYUp, axis=0) - np.nanmean(vecYDown, axis=0)
    return (np.dot(pVec, vecX), np.dot(pVec, vecY))


def get_vector(model, words, M):
    """Stack the vectors of the words that are in the model vocabulary.

    Words missing from ``model.wv`` are reported with a printed message and
    skipped. (Membership is tested with ``in model.wv``; the older
    ``model.wv.vocab`` attribute does not exist in gensim 4.)

    Returns an array of shape (number of known words, M).
    """
    words_in_vocab = [w for w in words if w in model.wv]
    words_not_in_vocab = [w for w in words if w not in model.wv]
    if words_not_in_vocab:
        print(f"Words not in vocabulary: {words_not_in_vocab}")
    L = len(words_in_vocab)
    temp = np.zeros((L, M))
    for i, x in enumerate(words_in_vocab):
        temp[i,:] = model.wv[x]
    return temp


def custom_projection_1D(z, model, custom_lexicon=None):
    """Project every vector in `z` onto a lexicon-defined axis.

    Parameters
    ----------
    z : iterable of array-like
        Vectors to project.
    model : object
        Provides ``vector_size`` and ``wv``.
    custom_lexicon : list of two word lists, optional
        ``[left_words, right_words]``. Defaults to the two lists of
        `filtered_TERR_FUNC_LEXICON`.

    Returns
    -------
    ndarray, shape (len(z),)
        Projection of each vector; larger values lie towards the right-hand
        word list.

    Raises
    ------
    ValueError
        If `custom_lexicon` does not contain exactly two lists.
    """
    M = model.vector_size
    if custom_lexicon:
        lex = custom_lexicon
        if len(lex)!=2:
            raise ValueError("The custom lexicon should be a list of lists, with two elements.")
    else:
        lex = [filtered_TERR_FUNC_LEXICON[0], filtered_TERR_FUNC_LEXICON[1]]
    xl, xr = [get_vector(model, words, M) for words in lex]
    projections = [linear_projection_1D(x, xl, xr) for x in z]
    return np.array(projections)

In [None]:
# Region label of every speech, when the corpus frame carries that column
families = data['UN_region'].values if 'UN_region' in data.columns else np.array([])

# Use a small font so the figure stays legible
plt.rcParams.update({'font.size': 8})

# Sort the data by year so each series is drawn in chronological order
df_sorted = data_embed.sort_values(by=['year'])

# Set the figure size
fig, ax = plt.subplots(figsize=(10, 6))

# The year-level embedding frame has no 'UN_region' column; in that case the
# whole corpus is plotted as a single series.
if 'UN_region' in df_sorted.columns:
    region_groups = list(df_sorted.groupby('UN_region'))
else:
    region_groups = [('whole corpus', df_sorted)]

for un_region, data_region in region_groups:
    data_region = data_region.reset_index(drop=True)

    # 1-D projection of each embedding onto the territorial -> functional axis
    # (custom_projection_1D returns one value per row, not 2-D coordinates)
    projections_1D = custom_projection_1D(data_region['text_embedding'], model, custom_lexicon=TERR_FUNC_LEXICON)

    # Region colour when one is defined, otherwise matplotlib's default cycle
    region_colour = REGION_COL.get(un_region)

    # Thin line joining consecutive years, with one marker per year
    ax.plot(data_region['year'], projections_1D, color=region_colour, linestyle='-', linewidth=0.5)
    ax.scatter(data_region['year'], projections_1D, color=region_colour, alpha=0.7, label=un_region)

# Set the font size of the axis labels
ax.set_xlabel('year', fontsize=12)
ax.set_ylabel('Projection: territorial (-) to functional (+)', fontsize=12)

# Add a legend
ax.legend()
plt.savefig('PCA_R_L_FR.png')
# Show the figure
plt.show()


In [None]:
# Nearest neighbours of 'tree' in the word-vector space. In gensim 4 the
# similarity queries live on model.wv; Doc2Vec itself has no most_similar.
model.wv.most_similar('tree')