In [None]:
# Mount Google Drive (Colab only) so the CSV inputs under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')
import spacy
import pandas as pd

In [None]:
# spaCy pipeline used by the later cells for POS tags and dependency labels.
nlp = spacy.load("fr_core_news_sm")

# One CSV per variety. `file_path` is reused, so after this cell it holds the
# Cameroonian path.
file_path = '/content/drive/MyDrive/Thesis/data/UPDATED_parisien_quotes_lemmatized_french.csv'
standard_french_df = pd.read_csv(file_path)
file_path = '/content/drive/MyDrive/Thesis/data/UPDATED_cameroun_quotes_lemmatized_french.csv'
cameroonian_french_df = pd.read_csv(file_path)


In [None]:
# Preview the first rows to check which columns were loaded.
standard_french_df.head()

In [None]:
import pandas as pd
from collections import Counter
import string

# Function words excluded from the frequency counts. Besides determiners the
# set also lists a few clitics ('se', 'on'), elided forms ('l', 'n', 'c') and
# the guillemets.
french_determiners = {
    'le', 'la', 'les', 'un', 'une', 'des', 'du', 'de', 'au', 'aux',
    'ce', 'cet', 'cette', 'ces', 'mon', 'ma', 'mes', 'se', 'on',
    'ton', 'ta', 'tes', 'son', 'sa', 'ses', 'notre', 'nos',
    'votre', 'vos', 'leur', 'leurs', 'l', 'n', 'c', '»', '«'
}

# Every character treated as punctuation, including French typographic marks.
punctuation = set(string.punctuation + "«»’[]…“”")

def clean_and_filter_tokens(df):
    """Return the lower-cased content tokens of ``df['Quote']``.

    Each non-null quote is split on whitespace. Every word is lower-cased and
    has leading/trailing punctuation (all characters in ``punctuation``)
    removed. A token is kept only if, *after* cleaning, it is longer than one
    character and is not listed in ``french_determiners``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Quote' column; null entries are skipped.

    Returns
    -------
    list of str
        Cleaned tokens in corpus order (duplicates preserved).
    """
    strip_chars = "".join(punctuation)
    tokens = []
    for sentence in df['Quote'].dropna():
        for word in str(sentence).split():
            # Clean once and filter on the cleaned form, so that words made
            # only of punctuation ("--") or reduced to one letter ("a,")
            # cannot slip through as '' or 'a'.
            token = word.lower().strip(strip_chars)
            if len(token) > 1 and token not in french_determiners:
                tokens.append(token)
    return tokens

# Tokenise the 'Quote' column of each corpus with the filter defined above.
standard_tokens = clean_and_filter_tokens(standard_french_df)
cameroonian_tokens = clean_and_filter_tokens(cameroonian_french_df)

# Keep the 30 most frequent tokens per variety as (word, count) pairs.
standard_freq = Counter(standard_tokens).most_common(30)
cameroonian_freq = Counter(cameroonian_tokens).most_common(30)

# Put both rankings side by side. Rows are aligned by rank position, not by
# word, so a row does not compare the same word across varieties.
standard_freq_df = pd.DataFrame(standard_freq, columns=['Word_Standard', 'Frequency_Standard'])
cameroonian_freq_df = pd.DataFrame(cameroonian_freq, columns=['Word_Cameroon', 'Frequency_Cameroon'])
freq_table = pd.concat([standard_freq_df, cameroonian_freq_df], axis=1)
print(freq_table)

In [None]:
import matplotlib.pyplot as plt

def plot_frequency_chart(freq_df):
    """Draw side-by-side horizontal bar charts of the top words per variety.

    Parameters
    ----------
    freq_df : pandas.DataFrame
        Must contain the columns 'Word_Standard', 'Frequency_Standard',
        'Word_Cameroon' and 'Frequency_Cameroon'. Rows where a variety's
        pair is missing (NaN padding) are ignored for that panel.

    Notes
    -----
    The font sizes are written to matplotlib's global ``rcParams`` and so
    also apply to figures drawn after this call.
    """
    plt.rcParams.update({
        'font.size': 22,
        'axes.titlesize': 22,
        'axes.labelsize': 16,
        'xtick.labelsize': 16,
        'ytick.labelsize': 16,
    })

    panels = [
        ('Word_Standard', 'Frequency_Standard', 'skyblue', 'Standard French'),
        ('Word_Cameroon', 'Frequency_Cameroon', 'lightcoral', 'Cameroonian French'),
    ]

    # The two panels list different words, so each needs its own categorical
    # y-axis and its own tick labels; a shared y-axis would merge the word
    # lists and hide the labels of the right-hand panel.
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 12), sharey=False)

    for ax, (word_col, count_col, colour, label) in zip(axes, panels):
        rows = freq_df[[word_col, count_col]].dropna()
        ax.barh(rows[word_col], rows[count_col], color=colour)
        ax.set_title(f'Top {len(rows)} Words - {label}')
        ax.invert_yaxis()  # first row of the table at the top
        ax.set_xlabel('Frequency')

    plt.tight_layout()
    plt.show()

plot_frequency_chart(freq_table)


# Parsing

In [None]:
def parse_sentences(df):
    """Parse every sentence with spaCy and collect its POS tags and dependency labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'lemmatized_sentence' column of strings.

    Returns
    -------
    pandas.DataFrame
        One row per input sentence with the columns 'sentence' (the input
        text), 'pos_tags' (list of ``token.pos_``) and 'dependencies'
        (list of ``token.dep_``).
    """
    sentences = list(df['lemmatized_sentence'])
    parsed_data = []
    # nlp.pipe processes the texts in batches and yields the docs in input
    # order, which avoids the per-call overhead of nlp(sentence) in a loop.
    for sentence, doc in zip(sentences, nlp.pipe(sentences)):
        parsed_data.append({
            'sentence': sentence,
            'pos_tags': [token.pos_ for token in doc],
            'dependencies': [token.dep_ for token in doc],
        })
    return pd.DataFrame(parsed_data)

import ast  # cell-local import, in line with the notebook's other per-cell imports

# 'lemmatized_quote' is evaluated to an iterable of tokens and joined with
# spaces (presumably the text form of a Python list — verify against the CSV).
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute arbitrary code embedded in a CSV cell.
standard_french_df['lemmatized_sentence'] = standard_french_df['lemmatized_quote'].apply(
    lambda tokens: ' '.join(ast.literal_eval(tokens))
)
cameroonian_french_df['lemmatized_sentence'] = cameroonian_french_df['lemmatized_quote'].apply(
    lambda tokens: ' '.join(ast.literal_eval(tokens))
)

# Parse both corpora with the spaCy pipeline.
standard_parsed = parse_sentences(standard_french_df)
cameroonian_parsed = parse_sentences(cameroonian_french_df)

In [None]:
# Persist the parse results to the working directory. The list-valued columns
# ('pos_tags', 'dependencies') are written as text by to_csv.
standard_parsed.to_csv("standard_french_parsed.csv", index=False, encoding="utf-8")
cameroonian_parsed.to_csv("cameroonian_french_parsed.csv", index=False, encoding="utf-8")

print("CSV files saved successfully!")

In [None]:
# Sanity check: reload the files just written and show their first rows.
df_check_std = pd.read_csv("standard_french_parsed.csv")
df_check_cmr = pd.read_csv("cameroonian_french_parsed.csv")

print("Standard French Data Sample:")
print(df_check_std.head())

print("\nCameroonian French Data Sample:")
print(df_check_cmr.head())

In [None]:
# Label every parsed row with its variety. This adds a 'Dialect' column to the
# existing frames in place.
standard_parsed["Dialect"] = "Standard French"
cameroonian_parsed["Dialect"] = "Cameroonian French"

# Stack both varieties; the original row labels are kept, so index values repeat.
combined_df = pd.concat([standard_parsed, cameroonian_parsed])

# index=False keeps those repeated labels out of the file.
combined_df.to_csv("combined_french_parsed.csv", index=False, encoding="utf-8")

print("Combined CSV file saved successfully!")


In [None]:
from collections import Counter
from scipy.stats import chi2_contingency

In [None]:
def get_pos_frequencies(parsed_df):
    """Tally how often each POS tag occurs across all rows.

    Every entry of ``parsed_df['pos_tags']`` is iterated and its items are
    counted.

    Returns
    -------
    collections.Counter
        Tag -> total number of occurrences.
    """
    return Counter(
        tag
        for row_tags in parsed_df['pos_tags']
        for tag in row_tags
    )

In [None]:
# Corpus-wide POS tag counts for each variety (raw counts, not normalised).
standard_pos_freq = get_pos_frequencies(standard_parsed)
print("Standard French POS Tag Frequencies:", standard_pos_freq)
cameroonian_pos_freq = get_pos_frequencies(cameroonian_parsed)
print("Cameroon French POS Tag Frequencies:", cameroonian_pos_freq)

In [None]:
def get_dependency_frequencies(parsed_df):
    """Count the dependency labels found in ``parsed_df['dependencies']``.

    Returns
    -------
    collections.Counter
        Dependency label -> number of occurrences over all rows.
    """
    tally = Counter()
    for row_labels in parsed_df['dependencies']:
        for label in row_labels:
            tally[label] += 1
    return tally

In [None]:
# Corpus-wide dependency label counts for each variety (raw counts).
standard_dep_freq = get_dependency_frequencies(standard_parsed)
print("Standard French Dependency Frequencies:", standard_dep_freq)
cameroonian_dep_freq = get_dependency_frequencies(cameroonian_parsed)
print("Cameroon French Dependency Frequencies:", cameroonian_dep_freq)

# Visualization

In [None]:
def extract_pos_sequences(parsed_data, sequence_length=2):
    """Collect every run of ``sequence_length`` consecutive POS tags.

    Parameters
    ----------
    parsed_data : mapping or DataFrame
        Must provide ``parsed_data["pos_tags"]``, an iterable of tag lists.
    sequence_length : int, default 2
        Number of consecutive tags per sequence.

    Returns
    -------
    list of tuple
        Sequences in row order; windows never span two rows. Entries of
        "pos_tags" that are not ``list`` instances contribute nothing.
    """
    ngrams = []
    for tags in parsed_data["pos_tags"]:
        if not isinstance(tags, list) or sequence_length < 1:
            continue
        last_start = len(tags) - sequence_length
        for start in range(last_start + 1):
            ngrams.append(tuple(tags[start:start + sequence_length]))
    return ngrams

# POS bigrams (sequence_length=2) for each variety.
standard_sequences = extract_pos_sequences(standard_parsed, sequence_length=2)
cameroonian_sequences = extract_pos_sequences(cameroonian_parsed, sequence_length=2)

# Bigram tuple -> number of occurrences.
standard_seq_freq = Counter(standard_sequences)
cameroonian_seq_freq = Counter(cameroonian_sequences)


In [None]:
import pandas as pd

# Union of every POS sequence observed in either variety.
all_sequences = set(standard_seq_freq.keys()).union(set(cameroonian_seq_freq.keys()))
# One row per sequence with its raw count in each variety; a Counter returns 0
# for a sequence the variety never produced.
data = {
    "Sequence": [" -> ".join(seq) for seq in all_sequences],
    "Standard French": [standard_seq_freq[seq] for seq in all_sequences],
    "Cameroonian French": [cameroonian_seq_freq[seq] for seq in all_sequences]
}
df = pd.DataFrame(data)

# Normalize frequencies per 1,000 sequences
df["Standard French (per 1k)"] = df["Standard French"] / sum(standard_seq_freq.values()) * 1000
df["Cameroonian French (per 1k)"] = df["Cameroonian French"] / sum(cameroonian_seq_freq.values()) * 1000

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [None]:
# Absolute gap between the normalised rates; adds a 'Difference' column to df.
df["Difference"] = abs(df["Standard French (per 1k)"] - df["Cameroonian French (per 1k)"])
# The ten sequences whose rates differ most between the varieties.
top_differences = df.nlargest(10, "Difference")

plt.figure(figsize=(12, 6))
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version.
sns.barplot(
    x="Difference",
    y="Sequence",
    data=top_differences,
    palette="coolwarm"
)
plt.title("Top 10 POS Sequence Frequency Differences (Standard vs. Cameroonian French)")
plt.xlabel("Absolute Difference (per 1k sequences)")
plt.ylabel("POS Sequence")
plt.show()

In [None]:
from scipy.stats import chi2_contingency

# Per-sequence 2x2 chi-square tests: does a POS sequence make up a different
# share of all sequences in the two varieties?
results = []

total_standard = sum(standard_seq_freq.values())
total_cameroonian = sum(cameroonian_seq_freq.values())

for seq in all_sequences:
    a = standard_seq_freq.get(seq, 0)      # this sequence, Standard French
    c = cameroonian_seq_freq.get(seq, 0)   # this sequence, Cameroonian French
    b = total_standard - a                 # all other sequences, Standard French
    d = total_cameroonian - c              # all other sequences, Cameroonian French

    # Rows = variety, columns = (this sequence, every other sequence).
    contingency = [[a, b], [c, d]]
    chi2, p, dof, expected = chi2_contingency(contingency)

    results.append({
        "Sequence": " -> ".join(seq),
        "Standard Count": a,
        "Cameroonian Count": c,
        "Chi2": chi2,
        "p-value": p
    })

chi2_df = pd.DataFrame(results)
# These p-values are not yet adjusted for the number of tests performed.
significant_df = chi2_df[chi2_df["p-value"] < 0.05].sort_values("p-value")

print(significant_df)

In [None]:
from statsmodels.stats.multitest import multipletests

# Adjust the per-sequence p-values with statsmodels' 'fdr_bh' method and keep
# the sequences whose corrected value stays below 0.05.
pvals = chi2_df["p-value"]
_, corrected_pvals, _, _ = multipletests(pvals, method='fdr_bh')
chi2_df["corrected p-value"] = corrected_pvals
significant_corrected = chi2_df[chi2_df["corrected p-value"] < 0.05].sort_values("corrected p-value")

print(significant_corrected.head(30))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 8))

# Number of sequences to display, taken from the top of significant_corrected
# (which is sorted by ascending corrected p-value).
top_n = 40

plot_top = significant_corrected.head(top_n)

sns.barplot(
    data=plot_top,
    x="corrected p-value",
    y="Sequence",
    palette="viridis_r"
)

# The bars show the 'fdr_bh'-corrected values, not the raw p-values, so the
# axis label and title say so explicitly.
plt.title(f"Top {len(plot_top)} POS Sequences by Corrected p-value")
plt.xlabel("Corrected p-value (FDR-BH, log scale)")
plt.ylabel("POS Sequence")
plt.xscale("log")
plt.xticks(fontsize=18)
plt.yticks(fontsize=14)
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()



In [None]:
# Reshape to long form: one row per (sequence, variety) pair ...
pivot_data = df.melt(
    id_vars="Sequence",
    value_vars=["Standard French (per 1k)", "Cameroonian French (per 1k)"],
    var_name="Variety",
    value_name="Frequency"
)

# ... and back to a Sequence x Variety matrix for the heatmap.
heatmap_data = pivot_data.pivot_table(
    index="Sequence",
    columns="Variety",
    values="Frequency",
    aggfunc='first'
)
# Keep only sequences that exceed 10 per 1k in at least one variety.
heatmap_data = heatmap_data.loc[heatmap_data.max(axis=1) > 10]
plt.figure(figsize=(14, 16))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".1f",
    cmap="viridis",
    cbar_kws={'label': 'Frequency per 1k Sequences'}
)
plt.title("POS Sequence Frequency Heatmap")
plt.ylabel("POS Sequence")
plt.xlabel("Variety")
plt.yticks(rotation=0)
plt.show()

In [None]:
def get_dependency_vectors(df):
    """Encode each sentence as a string of dependency-structure tokens.

    Every spaCy token becomes ``"<dep>_<head POS>_<POS>"``; the tokens of one
    sentence are joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'lemmatized_sentence' column of strings.

    Returns
    -------
    list of str
        One encoded string per input sentence, in input order.
    """
    sentences = list(df['lemmatized_sentence'])
    # nlp.pipe batches the texts and yields docs in input order; this avoids
    # the per-call overhead of invoking nlp() once per sentence.
    return [
        ' '.join(f"{token.dep_}_{token.head.pos_}_{token.pos_}" for token in doc)
        for doc in nlp.pipe(sentences)
    ]

# Store the encoded dependency structure next to each sentence (new column on
# both source frames).
standard_french_df['dependency_structure'] = get_dependency_vectors(standard_french_df)
cameroonian_french_df['dependency_structure'] = get_dependency_vectors(cameroonian_french_df)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Tokens look like "nsubj_VERB_NOUN". Split on whitespace only and keep the
# case: the default token pattern would break labels containing ':' into
# pieces and the default lowercasing would alter the tags.
vectorizer = CountVectorizer(lowercase=False, token_pattern=r"\S+")

# Learn ONE vocabulary over both corpora. Fitting the vectorizer separately
# per corpus gives each matrix its own column meaning, which makes the two
# varieties incomparable in everything that follows.
all_structures = pd.concat(
    [standard_french_df['dependency_structure'], cameroonian_french_df['dependency_structure']],
    ignore_index=True,
)
X_all = vectorizer.fit_transform(all_structures)
n_standard = len(standard_french_df)
X_standard = X_all[:n_standard]
X_cameroon = X_all[n_standard:]

# Likewise fit a single PCA on the pooled data and project each variety with
# it, so both live in the same reduced space. n_components is capped because
# PCA cannot return more components than min(n_samples, n_features).
dense_all = X_all.toarray()
pca = PCA(n_components=min(50, *dense_all.shape), random_state=0)
pca.fit(dense_all)
X_pca_standard = pca.transform(dense_all[:n_standard])
X_pca_cameroon = pca.transform(dense_all[n_standard:])

In [None]:
from sklearn.cluster import KMeans

# Cluster each variety separately, using the same number of clusters and the
# same seed. Cluster ids are therefore not aligned between the two varieties.
n_clusters = 20
kmeans_standard = KMeans(n_clusters=n_clusters, random_state=0)
standard_french_df['cluster'] = kmeans_standard.fit_predict(X_pca_standard)

kmeans_cameroon = KMeans(n_clusters=n_clusters, random_state=0)
cameroonian_french_df['cluster'] = kmeans_cameroon.fit_predict(X_pca_cameroon)

print(standard_french_df[['lemmatized_sentence', 'cluster']].head())
print(cameroonian_french_df[['lemmatized_sentence', 'cluster']].head())


In [None]:
import numpy as np  # np.vstack is used below; the notebook otherwise imports numpy only in later cells

# Joint 2-D projection: stack both varieties, fit one PCA, then split the
# result back by row position. The later per-variety plotting cells re-assign
# X_2d_standard and X_2d_cameroon from their own PCA fits.
pca_2d = PCA(n_components=2)
X_pca_combined = np.vstack([X_pca_standard, X_pca_cameroon])
X_2d_combined = pca_2d.fit_transform(X_pca_combined)

X_2d_standard = X_2d_combined[:len(X_pca_standard)]
X_2d_cameroon = X_2d_combined[len(X_pca_standard):]

In [None]:
# Re-fit a 2-D PCA on the Standard French projection only; this replaces the
# X_2d_standard computed from the stacked data in the previous cell.
pca_2d = PCA(n_components=2)
X_2d_standard = pca_2d.fit_transform(X_pca_standard)

plt.figure(figsize=(10, 6))
# One scatter call per cluster so each gets its own colour and legend entry.
for cluster in range(n_clusters):
    plt.scatter(X_2d_standard[standard_french_df['cluster'] == cluster, 0], X_2d_standard[standard_french_df['cluster'] == cluster, 1], label=f'Cluster {cluster}')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.title('Sentence Clusters based on Dependency Structure for Standard French')
plt.show()

In [None]:
import numpy as np
import pandas as pd

# Wrap the 2-D coordinates in a frame so quantiles are computed per component.
df_standard = pd.DataFrame(X_2d_standard, columns=['PCA1', 'PCA2'])

# Fences at 1.5 * IQR beyond the first and third quartiles, per component.
Q1 = df_standard.quantile(0.25)
Q3 = df_standard.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Drop a point if it lies outside the fences on either component.
filtered_df_standard = df_standard[~((df_standard < lower_bound) | (df_standard > upper_bound)).any(axis=1)]
# The positional labels of df_standard are used to look up clusters, which
# assumes standard_french_df still has its default 0..n-1 index — TODO confirm.
filtered_clusters = standard_french_df.loc[filtered_df_standard.index, 'cluster']


In [None]:
import matplotlib.pyplot as plt

# Same cluster scatter as before, restricted to the points that passed the
# IQR filter.
plt.figure(figsize=(10, 6))
for cluster in range(n_clusters):
    # Boolean mask is aligned on the shared index of the two filtered objects.
    cluster_points = filtered_df_standard[filtered_clusters == cluster]
    plt.scatter(cluster_points['PCA1'], cluster_points['PCA2'], label=f'Cluster {cluster}', alpha=0.7)

plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.title('Sentence Clusters (Without Outliers) for Standard French')
plt.show()


In [None]:
# Re-fit a 2-D PCA on the Cameroonian French projection only; pca_2d and
# X_2d_cameroon are overwritten here.
pca_2d = PCA(n_components=2)
X_2d_cameroon = pca_2d.fit_transform(X_pca_cameroon)

plt.figure(figsize=(10, 6))
# One scatter call per cluster so each gets its own colour and legend entry.
for cluster in range(n_clusters):
    plt.scatter(X_2d_cameroon[cameroonian_french_df['cluster'] == cluster, 0], X_2d_cameroon[cameroonian_french_df['cluster'] == cluster, 1], label=f'Cluster {cluster}')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.title('Sentence Clusters based on Dependency Structure for Cameroonian French')
plt.show()

In [None]:
def prepare_chi_square_data(freq_dict1, freq_dict2):
    """Build a contingency table from two frequency mappings.

    Returns a list with one ``[count_in_first, count_in_second]`` row for each
    key present in either mapping; a key missing from one mapping counts as 0.
    Rows follow set iteration order, so their order is not fixed.
    """
    categories = set(freq_dict1.keys()).union(set(freq_dict2.keys()))
    table = []
    for category in categories:
        first_count = freq_dict1.get(category, 0)
        second_count = freq_dict2.get(category, 0)
        table.append([first_count, second_count])
    return table

# Omnibus test over the whole POS tag distribution (categories x 2 varieties).
# `chi2` and `p` are reused below and hold the dependency result afterwards.
pos_data = prepare_chi_square_data(standard_pos_freq, cameroonian_pos_freq)
chi2, p, _, _ = chi2_contingency(pos_data)
print(f"POS Tag Chi-square test: chi2={chi2}, p-value={p}")

# Same test over the dependency label distribution.
dep_data = prepare_chi_square_data(standard_dep_freq, cameroonian_dep_freq)
chi2, p, _, _ = chi2_contingency(dep_data)
print(f"Dependency Chi-square test: chi2={chi2}, p-value={p}")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cluster centres of the two independently fitted KMeans models.
centroids_standard = kmeans_standard.cluster_centers_
centroids_cameroon = kmeans_cameroon.cluster_centers_

# Rows = Standard French clusters, columns = Cameroonian French clusters.
# NOTE(review): the centroids live in the spaces of X_pca_standard and
# X_pca_cameroon; cosine similarity across them is only meaningful if both
# were projected into the same feature space — confirm.
similarity_matrix = cosine_similarity(centroids_standard, centroids_cameroon)
# For every Standard cluster, the index of the most similar Cameroonian one;
# several Standard clusters may map to the same Cameroonian cluster.
closest_clusters = np.argmax(similarity_matrix, axis=1)

for i, match in enumerate(closest_clusters):
    print(f"Standard French Cluster {i} ↔ Cameroonian French Cluster {match} (Similarity: {similarity_matrix[i, match]:.2f})")


In [None]:
# Hard-coded id of the Standard French cluster to inspect.
matched_cluster = 11

# Get sentences from matched clusters
standard_sentences = standard_french_df[standard_french_df['cluster'] == matched_cluster]['lemmatized_sentence']
# closest_clusters maps the Standard cluster id to its best Cameroonian match.
cameroonian_sentences = cameroonian_french_df[cameroonian_french_df['cluster'] == closest_clusters[matched_cluster]]['lemmatized_sentence']

print("Standard French Sentences:")
print(standard_sentences.head())

print("\nCameroonian French Sentences:")
print(cameroonian_sentences.head())


In [None]:
import matplotlib.pyplot as plt

# Overlay the selected Standard cluster and its matched Cameroonian cluster.
plt.figure(figsize=(10, 6))

# Standard points: first two columns of X_pca_standard.
plt.scatter(X_pca_standard[standard_french_df['cluster'] == matched_cluster, 0],
            X_pca_standard[standard_french_df['cluster'] == matched_cluster, 1],
            color='blue', label=f'Standard French Cluster {matched_cluster}')

# Cameroonian points: coordinates from a separate 2-D PCA fitted here.
# NOTE(review): the two point sets therefore come from different projections
# although they share one pair of axes — confirm this comparison is intended.
X_pca_cameroon_2d = PCA(n_components=2).fit_transform(X_pca_cameroon)  # Reduce to 2D
plt.scatter(X_pca_cameroon_2d[cameroonian_french_df['cluster'] == closest_clusters[matched_cluster], 0],
            X_pca_cameroon_2d[cameroonian_french_df['cluster'] == closest_clusters[matched_cluster], 1],
            color='red', label=f'Cameroonian French Cluster {closest_clusters[matched_cluster]}')

plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.title(f'Matching Clusters: Standard vs Cameroonian French')
plt.show()


# Embeddings

In [None]:
import pandas as pd

# Reload the parsed sentences; `df_standard` is re-bound here to the CSV contents.
df_standard = pd.read_csv("standard_french_parsed.csv")
df_cameroonian = pd.read_csv("cameroonian_french_parsed.csv")


def _write_sentences(sentences, path):
    """Write one whitespace-normalised sentence per line as UTF-8 plain text."""
    with open(path, "w", encoding="utf-8") as handle:
        for sentence in sentences.dropna().astype(str):
            # Collapsing whitespace also removes embedded newlines, so every
            # sentence occupies exactly one line.
            handle.write(" ".join(sentence.split()) + "\n")


# The .txt files are the fastText training input and must be plain text.
# Series.to_csv would wrap any sentence containing a comma or a quote
# character in CSV quoting, injecting stray '"' characters into the corpus.
_write_sentences(df_standard["sentence"], "standard_french_sentences.txt")
_write_sentences(df_cameroonian["sentence"], "cameroonian_french_sentences.txt")

print("Text files created successfully!")

In [None]:
import os
import fasttext

# Report whether model files are already present in the working directory.
# Informational only: the next cell trains and overwrites them regardless.
print("Standard French Model Exists:", os.path.exists("standard_french_fasttext.bin"))
print("Cameroonian French Model Exists:", os.path.exists("cameroonian_french_fasttext.bin"))


In [None]:
# Train one skip-gram model per variety (300 dimensions, 10 epochs) on the
# exported sentence files and save each model to disk.
model_std = fasttext.train_unsupervised("standard_french_sentences.txt", model="skipgram", dim=300, epoch=10)
model_std.save_model("standard_french_fasttext.bin")

model_cmr = fasttext.train_unsupervised("cameroonian_french_sentences.txt", model="skipgram", dim=300, epoch=10)
model_cmr.save_model("cameroonian_french_fasttext.bin")

print("FastText models trained and saved!")

In [None]:
def get_fasttext_embedding(text):
    """Average the fastText vectors of the known words in ``text``.

    Reads the module-level model ``ft``. ``text`` is split on whitespace and
    words for which ``word in ft`` is false are ignored.

    Returns
    -------
    list of float
        Element-wise mean of the retained word vectors, or 300 zeros when no
        word is retained.
    """
    known_vectors = []
    for word in text.split():
        if word in ft:
            known_vectors.append(ft.get_word_vector(word))
    if not known_vectors:
        return np.zeros(300).tolist()
    return np.mean(known_vectors, axis=0).tolist()

# `ft` is the global read by get_fasttext_embedding, so it is re-pointed to
# the matching model before each corpus is embedded.
# NOTE(review): no cell visible in this notebook creates a "text" column on
# these frames — confirm it exists in the source CSVs.
ft = fasttext.load_model('standard_french_fasttext.bin')
standard_french_df["embedding"] = standard_french_df["text"].apply(get_fasttext_embedding)
ft = fasttext.load_model('cameroonian_french_fasttext.bin')
cameroonian_french_df["embedding"] = cameroonian_french_df["text"].apply(get_fasttext_embedding)

In [None]:
# Load both saved models under separate names.
ft_std = fasttext.load_model("standard_french_fasttext.bin")
ft_cmr = fasttext.load_model("cameroonian_french_fasttext.bin")

# Spot check: print the first 10 components of one word's vector in each model.
word = "bonjour"
vec_std = ft_std.get_word_vector(word)
vec_cmr = ft_cmr.get_word_vector(word)

print(f"Vector for '{word}' (Standard French):", vec_std[:10])
print(f"Vector for '{word}' (Cameroonian French):", vec_cmr[:10])


In [None]:
import numpy as np

from functools import lru_cache


@lru_cache(maxsize=4)
def _model_vocabulary(model):
    """Return the model's vocabulary as a frozenset, built once per model object."""
    return frozenset(model.words)


def get_sentence_vector(model, sentence):
    """Generate a dense vector representation of a sentence.

    The sentence is split on whitespace and the vectors of the words found in
    the model's vocabulary (``model.words``) are averaged.

    Parameters
    ----------
    model : object
        Must expose ``words`` and ``get_word_vector(word)``. When it also
        exposes ``get_dimension()``, that value sizes the zero fallback.
    sentence : str
        Whitespace-separated text.

    Returns
    -------
    numpy.ndarray
        Mean word vector, or a zero vector when no word is in the vocabulary.
    """
    # Membership tests against model.words (a list) scan the whole vocabulary
    # for every word; a cached set makes each lookup constant-time.
    vocabulary = _model_vocabulary(model)
    vectors = [model.get_word_vector(word) for word in sentence.split() if word in vocabulary]

    if vectors:
        return np.mean(vectors, axis=0)

    # Size the fallback from the model rather than assuming 300 dimensions;
    # 300 remains the default for objects that do not report a dimension.
    get_dimension = getattr(model, "get_dimension", None)
    return np.zeros(get_dimension() if callable(get_dimension) else 300)

# Embed each parsed sentence with the model trained on its own variety.
# str(x) turns non-string cells (for example missing values) into text first.
df_standard["embedding"] = df_standard["sentence"].apply(lambda x: get_sentence_vector(ft_std, str(x)))
df_cameroonian["embedding"] = df_cameroonian["sentence"].apply(lambda x: get_sentence_vector(ft_cmr, str(x)))

print("Sentence embeddings generated!")


In [None]:
# Write the frames, including the 'embedding' column, to CSV.
# NOTE(review): the embeddings are NumPy arrays, which to_csv stores as their
# text representation — confirm downstream readers can parse that format.
df_standard.to_csv("UPDATED_standard_french_with_embeddings.csv", index=False)
df_cameroonian.to_csv("UPDATED_cameroonian_french_with_embeddings.csv", index=False)

print("Embeddings saved to CSV!")
