In [None]:
# Mount Google Drive so the CSV files under /content/drive/MyDrive are
# readable by the cells below. This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): spacy is imported here but not used in any visible cell.
import spacy
import pandas as pd

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ---------------------------------------------------------------------------
# LSTM baseline: Standard French (label 0) vs. Cameroonian French (label 1)
# ---------------------------------------------------------------------------

standard_french_df = pd.read_csv('/content/drive/MyDrive/Thesis/data/UPDATED_standard_french_with_embeddings.csv')
cameroonian_french_df = pd.read_csv('/content/drive/MyDrive/Thesis/data/UPDATED_cameroonian_french_with_embeddings.csv')

standard_french_df["label"] = 0
cameroonian_french_df["label"] = 1

# Merge both corpora and shuffle once with a fixed seed.
df = pd.concat([standard_french_df, cameroonian_french_df]).sample(frac=1, random_state=42).reset_index(drop=True)

# .copy() so the column assignments below act on an independent frame.
df = df[['sentence', 'label']].copy()
# Fill missing sentences BEFORE casting: astype(str) turns NaN into the
# literal string "nan", after which fillna("") has nothing left to fill.
df["sentence"] = df["sentence"].fillna("").astype(str)
df["label"] = df["label"].astype(int)

# Hold out 20% as the final test set (stratified on the label).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["sentence"], df["label"], test_size=0.2, stratify=df["label"], random_state=42
)

# Carve a validation split out of the training portion. Early stopping and
# best-weight restoration are driven by this split, so the test set is never
# used for model selection and the reported test accuracy stays unbiased.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, stratify=train_labels, random_state=42
)

VOCAB_SIZE = 20000          # keep only the most frequent tokens
MAX_SEQUENCE_LENGTH = 300   # every sequence is padded/truncated to this length
EMBEDDING_DIM = 128

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, lower=True)
tokenizer.fit_on_texts(train_texts)

X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=MAX_SEQUENCE_LENGTH)
X_val = pad_sequences(tokenizer.texts_to_sequences(val_texts), maxlen=MAX_SEQUENCE_LENGTH)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=MAX_SEQUENCE_LENGTH)

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

model = Sequential([
    # An explicit Input layer replaces the deprecated `input_length` argument
    # of Embedding and lets model.summary() report concrete shapes.
    tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32"),
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    BatchNormalization(),
    LSTM(32, dropout=0.2),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

# Stop once validation loss has not improved for 3 epochs and roll back to
# the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_val, y_val),
                    verbose=1, callbacks=[early_stopping])

# The test split is touched only here. predict() returns shape (n, 1);
# ravel() flattens it so y_pred lines up with the 1-D y_test.
y_pred = (model.predict(X_test) > 0.5).astype("int32").ravel()
accuracy = accuracy_score(y_test, y_pred)
print(f"LSTM Accuracy: {accuracy:.4f}")


In [None]:
from sklearn.metrics import classification_report

# Reuses y_test and y_pred left in the kernel by the LSTM training cell;
# re-run that cell first if they are missing or stale.

report = classification_report(y_test, y_pred, digits=4)
print("Classification Report:\n", report)


In [None]:
import pandas as pd

# Reload both corpora and rebuild the labelled, shuffled frame used by the
# TextVectorization-based models below.
standard_french_df = pd.read_csv('/content/drive/MyDrive/Thesis/data/UPDATED_standard_french_with_embeddings.csv')
cameroonian_french_df = pd.read_csv('/content/drive/MyDrive/Thesis/data/UPDATED_cameroonian_french_with_embeddings.csv')

# 0 = Standard French, 1 = Cameroonian French.
standard_french_df["label"] = 0
cameroonian_french_df["label"] = 1

df = pd.concat([standard_french_df, cameroonian_french_df]).sample(frac=1, random_state=42).reset_index(drop=True)

# .copy() so the column assignments below act on an independent frame.
df = df[['sentence', 'label']].copy()

# Fill missing sentences BEFORE casting: astype(str) turns NaN into the
# literal string "nan", after which fillna("") has nothing left to fill.
df["sentence"] = df["sentence"].fillna("").astype(str)

df["label"] = df["label"].astype(int)


In [None]:
import tensorflow as tf

def dataframe_to_tf_dataset(df, batch_size=32, shuffle=True, seed=None):
    """Build a batched ``tf.data.Dataset`` of (sentence, label) pairs.

    Args:
        df: DataFrame with a "sentence" column and a "label" column.
        batch_size: Number of examples per batch.
        shuffle: When True, shuffle the examples using a buffer that spans
            the whole frame.
        seed: Optional integer seed forwarded to ``Dataset.shuffle`` so the
            shuffle order can be reproduced. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        A ``tf.data.Dataset`` yielding ``(sentence_batch, label_batch)``
        tuples, with prefetching enabled.
    """
    dataset = tf.data.Dataset.from_tensor_slices((df["sentence"].values, df["label"].values))
    # shuffle() rejects a zero-sized buffer, so skip it for an empty frame.
    if shuffle and len(df) > 0:
        dataset = dataset.shuffle(buffer_size=len(df), seed=seed)
    # tf.data.AUTOTUNE is the stable name for the former experimental alias.
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

from sklearn.model_selection import train_test_split

# Stratified 80/20 split on the label, with a fixed seed.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Wrap each split in a batched tf.data.Dataset. The training split uses the
# helper's default shuffling; the test split keeps its row order.
train_dataset = dataframe_to_tf_dataset(train_df)
test_dataset = dataframe_to_tf_dataset(test_df, shuffle=False)


In [None]:
from tensorflow.keras.layers import TextVectorization

# Vocabulary cap and fixed output length for the TextVectorization layer.
# Note MAX_LEN (200) differs from MAX_SEQUENCE_LENGTH (300) used by the
# Tokenizer-based LSTM cell.
VOCAB_SIZE = 20000
MAX_LEN = 200

# Maps raw strings to integer token ids, one row of MAX_LEN ids per input.
vectorizer = TextVectorization(max_tokens=VOCAB_SIZE, output_mode='int', output_sequence_length=MAX_LEN)

# Learn the vocabulary from the training sentences only.
text_ds = train_df["sentence"].astype(str).values
vectorizer.adapt(text_ds)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

# Bag-of-embeddings baseline: raw string -> token ids -> mean embedding -> MLP.
model = Sequential([
    # The model consumes raw strings of shape (batch, 1). Declaring that
    # input explicitly builds the model correctly; the previous
    # model.build(input_shape=(None, MAX_LEN)) described the vectorizer's
    # OUTPUT shape, not the string input the first layer actually receives.
    tf.keras.Input(shape=(1,), dtype="string"),
    vectorizer,
    # mask_zero=True so padding positions are ignored by the pooling layer.
    Embedding(input_dim=VOCAB_SIZE, output_dim=128, mask_zero=True),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


In [None]:
from tensorflow.keras.models import Sequential
# LSTM is imported here so this cell no longer depends on the first LSTM cell
# having been run earlier in the session.
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, LSTM

# Stacked-LSTM classifier on top of the TextVectorization layer. This rebinds
# `model`, replacing the average-pooling model defined in the previous cell.
model = Sequential([
    # The model consumes raw strings of shape (batch, 1). Declaring that
    # input explicitly builds the model correctly; the previous
    # model.build(input_shape=(None, MAX_LEN)) described the vectorizer's
    # OUTPUT shape, not the string input the first layer actually receives.
    tf.keras.Input(shape=(1,), dtype="string"),
    vectorizer,
    # mask_zero=True so the LSTMs skip padding positions.
    Embedding(input_dim=VOCAB_SIZE, output_dim=128, mask_zero=True),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
def vectorize_text(text, label):
    """Vectorize a text tensor with the module-level ``vectorizer``.

    A trailing axis is appended to ``text`` before it is passed to the
    vectorizer; ``label`` is returned untouched.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    # NOTE(review): not called by any visible cell; the models embed the
    # vectorizer as their first layer instead.
    expanded = tf.expand_dims(text, axis=-1)
    token_ids = vectorizer(expanded)
    return token_ids, label


def _ensure_feature_axis(text, label):
    """Give a (batch,) text tensor a trailing axis; pass others through.

    The rank check makes this cell safe to re-run: a batch that is already
    (batch, 1) is returned unchanged instead of growing to (batch, 1, 1).
    """
    if text.shape.rank == 1:
        text = tf.expand_dims(text, -1)
    return text, label


# Reshape text batches from (batch,) to (batch, 1) for the models' string input.
train_dataset = train_dataset.map(_ensure_feature_axis)
test_dataset = test_dataset.map(_ensure_feature_axis)


In [None]:
# Sanity check: pull a single batch and print the text and label shapes.
for text_batch, label_batch in train_dataset.take(1):
    print("Text batch shape:", text_batch.shape)
    print("Label batch shape:", label_batch.shape)


In [None]:
# Adapt the vectorizer on the training sentences.
# NOTE(review): this repeats the adapt() call made when the vectorizer was
# created, on the same train_df column — it looks redundant; confirm before
# removing.
text_samples = train_df["sentence"].astype(str).values
vectorizer.adapt(text_samples)

In [None]:
# Train for 10 epochs. The test dataset is passed as validation data, so the
# per-epoch "val_*" metrics are measured on the test split.
history = model.fit(train_dataset, validation_data=test_dataset, epochs=10)

In [None]:
# Final loss and accuracy on the test dataset. This rebinds the global
# `accuracy` set by the earlier LSTM cell.
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Accuracy: {accuracy:.4f}")


# Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF + logistic regression baseline on the same two corpora.
file_path = '/content/drive/MyDrive/Thesis/data/UPDATED_standard_french_with_embeddings.csv'
standard_french_df = pd.read_csv(file_path)
file_path = '/content/drive/MyDrive/Thesis/data/UPDATED_cameroonian_french_with_embeddings.csv'
cameroonian_french_df = pd.read_csv(file_path)

# 0 = Standard French, 1 = Cameroonian French.
standard_french_df["label"] = 0
cameroonian_french_df["label"] = 1

df = pd.concat([standard_french_df, cameroonian_french_df]).sample(frac=1, random_state=42).reset_index(drop=True)

# .copy() so the column assignment below acts on an independent frame.
df = df[['sentence', 'label']].copy()
# Fill missing sentences BEFORE casting: astype(str) turns NaN into the
# literal string "nan", after which fillna("") has nothing left to fill.
df["sentence"] = df["sentence"].fillna("").astype(str)

# Same stratified 80/20 split and seed as the neural-network cells.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["sentence"], df["label"], test_size=0.2, stratify=df["label"], random_state=42
)

# Unigram + bigram TF-IDF features, capped at 20,000 terms, fitted on the
# training texts only.
# NOTE(review): this rebinds `vectorizer`, which the Keras cells above use
# for their TextVectorization layer; re-run that cell before rebuilding
# those models.
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, train_labels)

y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(test_labels, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(test_labels, y_pred))