# Initialization

In [None]:
# Colab-only: mount Google Drive under /content/drive so later cells can use files stored there.
from google.colab import drive
drive.mount('/content/drive')

# Imports

In [None]:
import pandas as pd
import os
import logging
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dense, Activation, Dropout, LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam, SGD, RMSprop

import numpy as np
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.utils import to_categorical
from keras.metrics import Precision, Recall, CategoricalAccuracy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve
import pickle

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import EvalPrediction

In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/Keras/sklearn that point at real problems.
warnings.filterwarnings('ignore')

# Constants

In [None]:
# Project root; the empty string makes every path below relative to the current working directory.
# NOTE(review): later cells run `%cd`, which changes what these relative paths resolve to.
DIR = ''

# Input data files.
DATA_DIR = DIR + 'data/'
RAW_DATA = DATA_DIR + 'FirstNames.xlsx'  # raw spreadsheet, read with pd.read_excel
CHAR_MAP_DATA = DATA_DIR + 'char_data.json'  # JSON with 'valid_chars' and 'char_mappings'
ROBUST_DATA = DATA_DIR + 'robust_data.csv'  # cleaned, de-duplicated dataset written at the end of pre-processing

# Root folder for training artefacts (checkpoints, pickled histories, figures).
LOG_DIR = DIR + 'logs/'

# One output folder per character-level Bi-LSTM experiment.
LSTM_BINARY_LOG_DIR = LOG_DIR + 'lstm_binary/'
LSTM_MULTICLASS_LOG_DIR = LOG_DIR + 'lstm_multiclass/'
LSTM_MULTICLASS_FREQ_LOG_DIR = LOG_DIR + 'lstm_multiclass_freq/'

# One output folder per FaBERT fine-tuning experiment.
FABERT_BINARY_DNN_LOG_DIR = LOG_DIR + 'fabert_dnn_binary/'
FABERT_MULTICLASS_DNN_LOG_DIR = LOG_DIR + 'fabert_dnn_multiclass/'

# Load Dataset

In [None]:
# Read the raw names spreadsheet into a DataFrame and display it.
df = pd.read_excel(RAW_DATA)
df

# Exploratory Data Analysis (EDA)

## Pre-Processing

In [None]:
def unique_values(df):
  """Print the distinct values found in each column of ``df``."""
  per_column_uniques = df.apply(lambda column: column.unique())
  print(per_column_uniques)

In [None]:
unique_values(df)

### Refining

In [None]:
# Give the transliterated Persian column headers English names.
df = df.rename(columns={'Naam': 'Name', 'Pesar': 'isMale', 'Dokhtar': 'isFemale'})

In [None]:
# Translate the Persian frequency labels in 'Rate' to English.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on `df['Rate']`: that chained in-place form operates
# on an intermediate object and, with pandas Copy-on-Write, never updates `df`.
df['Rate'] = df['Rate'].replace({
    'بسيار نادر': 'Very Rare',
    'معمولي': 'Common',
    'پركاربرد': 'Frequently Used'
})

In [None]:
def determine_gender(row):
    """Turn one row's ``isFemale`` / ``isMale`` flags into a gender label.

    Returns 'Neutral' when both flags equal 1, 'Female' or 'Male' when exactly
    one does, and 'Missing' when neither does.
    """
    female_flag = row['isFemale'] == 1
    male_flag = row['isMale'] == 1

    if female_flag and male_flag:
        return 'Neutral'
    if female_flag:
        return 'Female'
    if male_flag:
        return 'Male'
    return 'Missing'

In [None]:
# Empty cells become 0 so the `== 1` flag checks in determine_gender compare against a number.
# NOTE(review): fillna(0) is applied to every column of df, not only isMale/isFemale.
df.fillna(0, inplace=True)
# Collapse the two flag columns into a single label per row.
df['Gender'] = df.apply(determine_gender, axis=1)

In [None]:
df.drop(columns=['isMale', 'isFemale'], inplace=True)

In [None]:
unique_values(df)

In [None]:
df

### Text Refinement

In [None]:
# Alphabet of the raw data: every distinct character that occurs in any name, sorted.
vocab_chars = sorted(set(''.join(df['Name'])))
len(vocab_chars)

#### Char Mapping Table

In [None]:
# with open(CHAR_MAP_DATA, 'w', encoding='utf-8') as f:
#     json.dump({'valid_chars': valid_chars, 'char_mappings': char_mappings}, f, ensure_ascii=False, indent=4)

In [None]:
# Load the character tables stored as JSON next to the dataset.
with open(CHAR_MAP_DATA, 'r', encoding='utf-8') as f:
    data = json.load(f)

valid_chars = data['valid_chars']
char_mappings = data['char_mappings']

# str.translate looks characters up by code point, so key the table by ord().
translation_table = {ord(source_char): replacement for source_char, replacement in char_mappings.items()}

#### Handling Whitespaces

In [None]:
def remove_zero_width_characters(text: str) -> str:
    """Delete U+200C, U+200B, U+FE0F and U+FEFF from ``text`` and return the result."""
    for invisible_char in ("\u200c", "\u200b", "\ufe0f", "\ufeff"):
        text = text.replace(invisible_char, "")
    return text

def replace_consecutive_whitespace(text: str) -> str:
    """Collapse each run of whitespace characters in ``text`` into one space."""
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)

#### Refinement Pipeline

In [None]:
def refine_name(text: str) -> str:
    """Normalise one name: strip invisible characters, map characters through
    ``translation_table``, squeeze whitespace runs and trim both ends.

    Missing values (as judged by ``pd.isna``) are returned untouched.
    """
    if pd.isna(text):
        return text

    without_invisibles = remove_zero_width_characters(text)
    mapped = without_invisibles.translate(translation_table)
    return replace_consecutive_whitespace(mapped).strip()

In [None]:
df['Name'] = df['Name'].apply(refine_name)

In [None]:
# Alphabet after refinement: distinct characters left in the cleaned names, sorted.
translated_chars = sorted(set(''.join(df['Name'])))
len(translated_chars)

In [None]:
print(translated_chars)

### Duplicate Data

#### Exploring Duplicates

In [None]:
# Rows that are repeated verbatim: same Name, same Rate and same Gender (all copies kept).
is_repeated_row = df.duplicated(subset=['Name', 'Rate', 'Gender'], keep=False)
duplicates_same_attributes = df[is_repeated_row]
duplicates_same_attributes.head()

In [None]:
# (Name, Rate) pairs that were recorded with at least two different Gender values.
diff_gender = (
    df.groupby(['Name', 'Rate'])
      .filter(lambda rows: rows['Gender'].nunique() >= 2)
)
diff_gender.head()

In [None]:
# (Name, Gender) pairs that were recorded with at least two different Rate values.
diff_rate = (
    df.groupby(['Name', 'Gender'])
      .filter(lambda rows: rows['Rate'].nunique() >= 2)
)
diff_rate.head()

In [None]:
# Names whose rows disagree on both Rate and Gender.
diff_name_only = df.groupby('Name').filter(
    lambda rows: rows['Rate'].nunique() > 1 and rows['Gender'].nunique() > 1
)
diff_name_only.head()

#### Handling Duplicates

In [None]:
# Numeric rank for each Rate label; a larger number wins when duplicate names are merged.
rate_priority = {
    'Frequently Used': 3,
    'Common': 2,
    'Very Rare': 1
}

In [None]:
# Rank every row by how common the name is, so the most frequent variant sorts first per name.
df['Rate_Priority'] = df['Rate'].map(rate_priority)

df_sorted = df.sort_values(by=['Name', 'Rate_Priority'], ascending=[True, False])

# A name recorded with more than one gender label is relabelled 'Neutral'.
# This uses a vectorised transform instead of groupby('Name').apply(fn) with a
# function that mutates its group: that pattern relies on the grouping column
# being handed to `fn` (deprecated, then removed, in newer pandas — which would
# drop 'Name' and break the drop_duplicates below) and on in-place mutation of
# the group, which pandas does not support.
df_final = df_sorted.copy()
has_conflicting_gender = df_final.groupby('Name')['Gender'].transform('nunique') > 1
df_final.loc[has_conflicting_gender, 'Gender'] = 'Neutral'

# Keep one row per name: the first one, i.e. the highest Rate_Priority thanks to the sort above.
df_final_unique = (
    df_final
    .drop_duplicates(subset='Name', keep='first')
    .drop(columns=['Rate_Priority'])
    .reset_index(drop=True)
)

In [None]:
df[df['Name']=='عرفان']

In [None]:
df_final_unique[df_final_unique['Name']=='عرفان']

In [None]:
print(df.shape)
print(df_final_unique.shape)

In [None]:
df = df_final_unique

In [None]:
df.to_csv(ROBUST_DATA, index=False)

## Distributions

In [None]:
def plot_distribution(df: pd.DataFrame, column: str, palette='viridis', title=None, xlabel=None, ylabel='Count'):
    """Show a count plot of ``column`` with the count written above every bar.

    ``title`` and ``xlabel`` are applied only when truthy; ``ylabel`` is always set.
    """
    plt.figure(figsize=(8, 6))
    axes = sns.countplot(data=df, x=column, palette=palette, hue=column, legend=False)

    # Annotate each bar with its integer height, centred just above the top edge.
    for bar in axes.patches:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        axes.text(bar_centre, bar_height + 0.1, int(bar_height), ha="center", va="bottom")

    if title:
        plt.title(title)
    if xlabel:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
plot_distribution(df, 'Gender', title='Gender Distribution', xlabel='Gender')

In [None]:
plot_distribution(df, 'Rate', title='Rate Distribution', xlabel='Rate')

In [None]:
def plot_combined_distribution(df: pd.DataFrame, x_column: str, hue_column: str, palette='viridis', title=None, xlabel=None, ylabel='Count'):
    """Show a count plot of ``x_column`` split by ``hue_column``, with the count above every bar.

    ``title`` and ``xlabel`` are applied only when truthy; ``ylabel`` is always set.
    """
    plt.figure(figsize=(10, 6))
    axes = sns.countplot(data=df, x=x_column, hue=hue_column, palette=palette)

    # Annotate each bar with its integer height, centred just above the top edge.
    for bar in axes.patches:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        axes.text(bar_centre, bar_height + 0.1, int(bar_height), ha="center", va="bottom")

    if title:
        plt.title(title)
    if xlabel:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
plot_combined_distribution(df, 'Gender', 'Rate', title='Gender and Rate Distribution', xlabel='Gender')

In [None]:
# Character count of every name, then how many names share each length (ordered by length).
# NOTE: this column is added after ROBUST_DATA was written, so it is not part of the saved CSV.
df['NameLength'] = df['Name'].apply(len)
length_counts = df['NameLength'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
bars = plt.bar(length_counts.index, length_counts.values)

# Write the frequency on top of each bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, int(yval), ha='center', va='bottom')

# One tick per observed length.
plt.xticks(length_counts.index)
plt.xlabel('Length of Name')
plt.ylabel('Frequency')
plt.title('Frequency of Name Lengths')

plt.show()

# Load Dataset

In [None]:
# Reload the cleaned dataset written by the pre-processing section, so modelling can start from here.
df = pd.read_csv(ROBUST_DATA)
df

# Models

## Load Model

In [None]:
from keras.models import load_model

# Restore the checkpoint that the binary Bi-LSTM training cell saves as 'best_model.keras'.
# NOTE(review): the file only exists after that training cell has run at least once, and the
# F1Score metric the model is compiled with is defined in a later cell and not passed here —
# confirm this cell works on a fresh kernel. `model` is also reassigned by the training cells.
model = load_model(os.path.join(LSTM_BINARY_LOG_DIR, 'best_model.keras'), safe_mode=False)

## 1. Character-level Bi-LSTM Models

In [None]:
def encode_data(df, max_len=20):
    """Encode the 'Gender' labels and the characters of every 'Name' as integers.

    Args:
        df: frame with 'Name' (strings) and 'Gender' columns.
        max_len: width of every encoded row; shorter names are right-padded
            with 0 and longer names are clipped to this many characters.

    Returns:
        ``(X, y, label_encoder, char_to_int)``: the padded integer matrix, the
        encoded labels, the fitted LabelEncoder, and the character-to-index map
        (indices start at 1; 0 is the padding value).
    """
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['Gender'])

    char_set = sorted(set(''.join(df['Name'])))
    char_to_int = {c: i + 1 for i, c in enumerate(char_set)}  # 0 is reserved for padding

    rows = []
    for name in df['Name']:
        # Clip before encoding: without this a name longer than max_len gets a
        # negative pad count, so its row is wider than the others and the
        # resulting array is ragged.
        codes = [char_to_int.get(char, 0) for char in name[:max_len]]
        rows.append(codes + [0] * (max_len - len(codes)))
    X = np.array(rows)

    return X, y, label_encoder, char_to_int

In [None]:
def split_data(df, split_frequently_used=False):
    """Split ``df`` 80/20 into train and test parts separately for each 'Rate' category.

    Returns ``(train_very_rare, test_very_rare, train_common, test_common, ...)``
    followed by either the train/test halves of the 'Frequently Used' rows
    (``split_frequently_used=True``) or those rows unsplit (default). Every
    split uses ``test_size=0.2`` and ``random_state=42``.
    """
    def rows_with_rate(label):
        return df[df['Rate'] == label]

    rare_train, rare_test = train_test_split(rows_with_rate('Very Rare'), test_size=0.2, random_state=42)
    common_train, common_test = train_test_split(rows_with_rate('Common'), test_size=0.2, random_state=42)
    frequent = rows_with_rate('Frequently Used')

    if not split_frequently_used:
        return rare_train, rare_test, common_train, common_test, frequent

    frequent_train, frequent_test = train_test_split(frequent, test_size=0.2, random_state=42)
    return rare_train, rare_test, common_train, common_test, frequent_train, frequent_test

In [None]:
class F1Score(tf.keras.metrics.Metric):
    """Streaming F1 score computed from an internal Precision and Recall metric."""

    def __init__(self, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        self.precision = Precision(name='precision')
        self.recall = Recall(name='recall')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Forward the batch to both underlying metrics."""
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        """Return 2 * P * R / (P + R + epsilon); epsilon avoids division by zero."""
        precision = self.precision.result()
        recall = self.recall.result()
        return 2 * ((precision * recall) / (precision + recall + tf.keras.backend.epsilon()))

    def reset_state(self):
        """Clear both underlying metrics.

        ``reset_state`` is the hook Keras calls between epochs; the previous
        code only overrode the legacy ``reset_states`` name.
        """
        self.precision.reset_state()
        self.recall.reset_state()

    def reset_states(self):
        """Legacy spelling kept for callers that still use it."""
        self.reset_state()

In [None]:
def build_model(input_dim, output_dim, max_len, is_binary=True):
    """Build and compile the character-level Bi-LSTM classifier.

    Args:
        input_dim: vocabulary size passed to the Embedding layer.
        output_dim: number of classes; only used for the softmax head when
            ``is_binary`` is False.
        max_len: length of the input sequences.
        is_binary: True for a single sigmoid unit trained with binary
            cross-entropy, False for a softmax head with categorical cross-entropy.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer; accuracy, precision,
        recall and F1 metrics).
    """
    if is_binary:
        head_units, head_activation, loss_name = 1, 'sigmoid', 'binary_crossentropy'
    else:
        head_units, head_activation, loss_name = output_dim, 'softmax', 'categorical_crossentropy'

    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=32, input_length=max_len),
        Bidirectional(
            LSTM(64, return_sequences=True),
            backward_layer=LSTM(64, return_sequences=True, go_backwards=True),
        ),
        Dropout(0.2),
        Bidirectional(LSTM(64)),
        Dropout(0.2),
        Dense(32, activation='relu', activity_regularizer=l2(0.002)),
        Dropout(0.2),
        Dense(8, activation='relu', activity_regularizer=l2(0.002)),
        Dropout(0.2),
        Dense(head_units, activation=head_activation, activity_regularizer=l2(0.002)),
    ])

    # The binary head uses Keras' string shortcut; the softmax head needs the categorical variant.
    accuracy_metric = 'accuracy' if is_binary else CategoricalAccuracy(name='accuracy')

    model.compile(
        loss=loss_name,
        optimizer='adam',
        metrics=[accuracy_metric, Precision(name='precision'), Recall(name='recall'), F1Score()],
    )
    return model

In [None]:
def train_model(model, X_train, y_train, X_test, y_test, model_path, epochs=50, batch_size=32):
    """Fit ``model`` and return the Keras History object.

    ``(X_test, y_test)`` is used as validation data. The best model by
    ``val_accuracy`` is checkpointed to ``model_path``; the learning rate is
    multiplied by 0.7 after 10 epochs without ``val_f1_score`` improving.
    """
    callbacks = [
        ModelCheckpoint(model_path, monitor='val_accuracy', mode='max', verbose=1, save_best_only=True),
        ReduceLROnPlateau(monitor='val_f1_score', factor=0.7, patience=10, verbose=1, min_delta=1e-4, mode='max'),
    ]

    return model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
    )

In [None]:
def save_model_plot(model, filename='model_architecture.png'):
    """Render the architecture of ``model`` to ``filename`` inside the module-level ``LOG_FOLDER``."""
    destination = os.path.join(LOG_FOLDER, filename)
    plot_model(model, to_file=destination, show_shapes=True, show_layer_names=True)
    print(f"Model architecture saved to {destination}")

In [None]:
def plot_metrics(history, metrics, title=None):
    """Plot train and validation curves for each metric and save the figure.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value lists (e.g. the return value of ``model.fit``).
        metrics: metric names to plot; ``val_<name>`` supplies the validation curve.
        title: optional figure-level title.

    The figure is saved as ``training_metrics.png`` inside the module-level
    ``LOG_FOLDER`` and then shown.
    """
    # Two columns. At least two rows so that up to four metrics keep the former
    # 2x2 layout; further rows are added instead of failing on a fifth metric.
    n_cols = 2
    n_rows = max(2, -(-len(metrics) // n_cols))
    plt.figure(figsize=(15, 5 * n_rows))

    for i, metric in enumerate(metrics):
        plt.subplot(n_rows, n_cols, i + 1)

        metric_values = history.history.get(metric, [])
        val_metric_values = history.history.get(f'val_{metric}', [])

        if metric_values:
            plt.plot(metric_values, label=f'Train {metric}')

        if val_metric_values:
            plt.plot(val_metric_values, label=f'Val {metric}')

        plt.title(f'{metric.capitalize()} Over Epochs')
        plt.xlabel('Epochs')
        plt.ylabel(metric.capitalize())
        # Only ask for a legend when a curve was drawn; an empty axes has nothing to list.
        if metric_values or val_metric_values:
            plt.legend()
        plt.grid()

    plt.tight_layout()

    if title:
        plt.suptitle(title, fontsize=16)
        # Leave room at the top so the figure title does not overlap the subplot titles.
        plt.subplots_adjust(top=0.9)

    plt_path = os.path.join(LOG_FOLDER, 'training_metrics.png')
    plt.savefig(plt_path)
    plt.show()

def predict_and_evaluate(model, new_name, char_to_int, max_len, label_encoder):
    """Predict and print the gender for a single name with the character-level model.

    Args:
        model: object with a ``predict`` method returning an array of shape
            ``(1, n_outputs)``.
        new_name: the name to classify.
        char_to_int: character-to-index map; unknown characters map to 0.
        max_len: input width of the model; the name is padded with 0 or
            clipped to this length.
        label_encoder: used to decode the arg-max class for multi-output models.

    For a single-output model the score is thresholded: above 0.65 is 'Male',
    below 0.35 is 'Female', anything in between is 'Neutral'.
    NOTE(review): this assumes the positive class (label 1) is 'Male' — confirm
    against ``label_encoder.classes_``.
    """
    # Clip to max_len so an over-long name cannot produce a row wider than the model input.
    codes = [char_to_int.get(char, 0) for char in new_name[:max_len]]
    new_X = np.array([codes + [0] * (max_len - len(codes))])

    prediction = model.predict(new_X)
    print(f"Model Prediction: {prediction}")
    if len(prediction[0]) > 1:  # Multi-class classification
        predicted_class_index = np.argmax(prediction, axis=1)
        predicted_gender = label_encoder.inverse_transform(predicted_class_index)[0]
    else:  # Binary classification
        score = float(prediction[0][0])
        if score > 0.65:
            predicted_gender = 'Male'
        elif score < 0.35:
            predicted_gender = 'Female'
        else:
            predicted_gender = 'Neutral'

    print(f'The predicted gender for {new_name} is {predicted_gender}')

def test_random_samples(data_split, split_name, char_to_int, model, label_encoder, max_len):
    print(f"\nTesting 5 random samples from {split_name}:\n")
    sample_data = data_split.sample(5)

    for index, row in sample_data.iterrows():
        name = row['Name']
        actual_gender = row['Gender']
        encoded_name = np.array([[char_to_int.get(char, 0) for char in name] + [0] * (max_len - len(name))])

        prediction = model.predict(encoded_name, verbose=0)

        if len(prediction[0]) > 1:  # Multi-class classification
            predicted_class_index = np.argmax(prediction, axis=1)
            predicted_gender = label_encoder.inverse_transform(predicted_class_index)[0]
        else:  # Binary classification
            if prediction > 0.65:
                predicted_gender = 'Male'
            elif prediction < 0.35:
                predicted_gender = 'Female'
            else:
                predicted_gender = 'Neutral'

        actual_gender_decoded = label_encoder.inverse_transform([label_encoder.transform([actual_gender])[0]])[0]

        print(f"Name: {name}, Actual: {actual_gender_decoded}, Predicted: {predicted_gender}")

def evaluate_and_plot_accuracy(data_splits, split_names, char_to_int, model, label_encoder, max_len):
    """Count correct and wrong predictions per split and hand the totals to ``plot_accuracy_results``.

    Multi-output predictions are decoded with arg-max; single-output
    predictions are thresholded at 0.5.
    """
    results = {}

    for split_name, data_split in zip(split_names, data_splits):
        # Encode every name as character indices, right-padded with 0 up to max_len.
        encoded_rows = [
            [char_to_int.get(char, 0) for char in name] + [0] * (max_len - len(name))
            for name in data_split['Name']
        ]
        X_split = np.array(encoded_rows)
        y_true = label_encoder.transform(data_split['Gender'])

        predictions = model.predict(X_split, verbose=0)
        if len(predictions[0]) > 1:
            predicted_classes = np.argmax(predictions, axis=1)
        else:
            predicted_classes = (predictions > 0.5).astype(int).flatten()

        n_correct = (predicted_classes == y_true).sum()
        n_wrong = (predicted_classes != y_true).sum()

        results[split_name] = {
            'correct': n_correct,
            'wrong': n_wrong,
            'accuracy': n_correct / (n_correct + n_wrong),
        }

    plot_accuracy_results(results)

def plot_accuracy_results(results):
    """Draw grouped correct/wrong bars per split, annotate each group with its accuracy, and save the figure.

    ``results`` maps a split name to a dict with 'correct', 'wrong' and
    'accuracy'. The figure is saved as ``accuracy_results.png`` inside the
    module-level ``LOG_FOLDER`` and then shown.
    """
    split_names = list(results)
    correct_counts, wrong_counts, accuracies = [], [], []
    for split in split_names:
        stats = results[split]
        correct_counts.append(stats['correct'])
        wrong_counts.append(stats['wrong'])
        accuracies.append(stats['accuracy'])

    positions = np.arange(len(split_names))
    bar_width = 0.35

    fig, ax = plt.subplots(figsize=(12, 7))

    ax.bar(positions - bar_width/2, correct_counts, bar_width, label='Correct', color='g')
    ax.bar(positions + bar_width/2, wrong_counts, bar_width, label='Wrong', color='r')

    # Place the accuracy just above the taller bar of each pair.
    for pos, n_correct, n_wrong, accuracy in zip(positions, correct_counts, wrong_counts, accuracies):
        ax.text(pos, max(n_correct, n_wrong) + 0.5, f'Acc: {accuracy:.2f}', ha='center', fontsize=12)

    ax.set_xlabel('Rate Category')
    ax.set_ylabel('Number of Predictions')
    ax.set_title('Correct and Wrong Predictions by Rate')
    ax.set_xticks(positions)
    ax.set_xticklabels(split_names)
    ax.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(LOG_FOLDER, 'accuracy_results.png'))
    plt.show()

def evaluate_confusion_matrix(data_splits, split_names, char_to_int, model, label_encoder, max_len):
    """Pool predictions over all splits and plot one confusion matrix.

    Args:
        data_splits: frames with 'Name' and 'Gender' columns.
        split_names: names paired with ``data_splits`` (only used for iteration).
        char_to_int: character-to-index map; unknown characters map to 0.
        model: object with a ``predict(X, verbose=0)`` method.
        label_encoder: fitted encoder; its ``classes_`` label the matrix axes.
        max_len: width every encoded name is padded to.

    Multi-output predictions are decoded with arg-max; single-output
    predictions are thresholded at 0.5.
    """
    all_y_true = []
    all_y_pred = []

    for data_split, split_name in zip(data_splits, split_names):
        X_split = np.array([[char_to_int.get(char, 0) for char in name] + [0] * (max_len - len(name)) for name in data_split['Name']])
        y_true = label_encoder.transform(data_split['Gender'])

        predictions = model.predict(X_split, verbose=0)
        predicted_class_indices = np.argmax(predictions, axis=1) if len(predictions[0]) > 1 else (predictions > 0.5).astype(int).flatten()

        all_y_true.extend(y_true)
        all_y_pred.extend(predicted_class_indices)

    # Pin the label set to every encoded class. Without `labels`, a class that
    # occurs in neither y_true nor y_pred is omitted, the matrix becomes smaller
    # than classes_, and building the DataFrame below fails.
    class_indices = np.arange(len(label_encoder.classes_))
    conf_matrix = confusion_matrix(all_y_true, all_y_pred, labels=class_indices)
    conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)

    plot_confusion_matrix(conf_matrix_df)

def plot_confusion_matrix(conf_matrix_df):
    """Draw ``conf_matrix_df`` as an annotated heatmap, save it as ``confusion_matrix.png`` in ``LOG_FOLDER`` and show it."""
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix_df, annot=True, cmap='Blues', fmt='g')

    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    plt.savefig(os.path.join(LOG_FOLDER, 'confusion_matrix.png'))
    plt.show()

### a. Binary Classification

#### Train

In [None]:
# Output folder for this experiment. The plotting/saving helpers read the global LOG_FOLDER
# when they are called, so this assignment decides where their files end up.
LOG_FOLDER = LSTM_BINARY_LOG_DIR

os.makedirs(LOG_FOLDER, exist_ok=True)

In [None]:
# Binary task: train on the names that are not labelled 'Neutral'.
# The subset gets its own name instead of overwriting `df`; reassigning `df`
# here removed the 'Neutral' rows for every later cell, so the multi-class
# section that follows was silently trained on the binary subset as well.
df_binary = df[df['Gender'] != 'Neutral']

max_len = 20
X, y, label_encoder, char_to_int = encode_data(df_binary, max_len)
# Two distinct labels -> sigmoid head with integer targets; otherwise softmax with one-hot targets.
is_binary = len(np.unique(y)) == 2

# Data Splitting
train_very_rare, test_very_rare, train_common, test_common, train_frequently_used, test_frequently_used = split_data(df_binary, split_frequently_used=True)

# Combine Train and Test Sets
train_data = pd.concat([train_very_rare, train_common, train_frequently_used])
test_data = pd.concat([test_very_rare, test_common, test_frequently_used])

# Prepare Training and Testing Data
X_train = np.array([[char_to_int[char] for char in name] + [0] * (max_len - len(name)) for name in train_data['Name']])
y_train = to_categorical(label_encoder.transform(train_data['Gender'])) if not is_binary else label_encoder.transform(train_data['Gender'])
X_test = np.array([[char_to_int[char] for char in name] + [0] * (max_len - len(name)) for name in test_data['Name']])
y_test = to_categorical(label_encoder.transform(test_data['Gender'])) if not is_binary else label_encoder.transform(test_data['Gender'])

# Build and Train Model
input_dim = len(char_to_int) + 1  # +1 for the padding index 0
output_dim = len(label_encoder.classes_)
model = build_model(input_dim, output_dim, max_len, is_binary)
history = train_model(model, X_train, y_train, X_test, y_test, model_path=os.path.join(LOG_FOLDER, 'best_model.keras'), epochs=100, batch_size=150)

# Persist the per-epoch metrics next to the checkpoint.
with open(os.path.join(LOG_FOLDER, 'model.history'), 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

In [None]:
save_model_plot(model)

#### Evaluation

In [None]:
plot_metrics(history, ['accuracy', 'precision', 'recall', 'f1_score'], title='Training and Validation Metrics')

In [None]:
data_splits = [train_very_rare, test_very_rare, train_common, test_common, train_frequently_used, test_frequently_used]
split_names = ["Train Very Rare", "Test Very Rare", "Train Common", "Test Common", "Train Frequently Used", "Test Frequently Used"]

In [None]:
evaluate_and_plot_accuracy(data_splits, split_names, char_to_int, model, label_encoder, max_len)

In [None]:
evaluate_confusion_matrix(data_splits, split_names, char_to_int, model, label_encoder, max_len)

#### Test

In [None]:
# The samples come from the held-out 'Common' split, so label the output accordingly
# (the header previously said "Train Common").
test_random_samples(test_common, "Test Common", char_to_int, model, label_encoder, max_len)

In [None]:
predict_and_evaluate(model, 'آرمین', char_to_int, max_len, label_encoder)

### b. Multi-Class Classification


#### Train

In [None]:
# Switch the global output folder to the multi-class experiment; the plotting/saving helpers
# read LOG_FOLDER when they are called.
LOG_FOLDER = LSTM_MULTICLASS_LOG_DIR

os.makedirs(LOG_FOLDER, exist_ok=True)

In [None]:
max_len = 20
X, y, label_encoder, char_to_int = encode_data(df, max_len)
# Two distinct labels -> integer targets for a sigmoid head; otherwise one-hot targets for softmax.
is_binary = len(np.unique(y)) == 2

# Split each Rate category 80/20, then pool the parts.
(
    train_very_rare, test_very_rare,
    train_common, test_common,
    train_frequently_used, test_frequently_used,
) = split_data(df, split_frequently_used=True)

train_data = pd.concat([train_very_rare, train_common, train_frequently_used])
test_data = pd.concat([test_very_rare, test_common, test_frequently_used])

# Encode names as character indices, right-padded with 0 up to max_len.
X_train = np.array([
    [char_to_int[char] for char in name] + [0] * (max_len - len(name))
    for name in train_data['Name']
])
y_train = label_encoder.transform(train_data['Gender'])
if not is_binary:
    y_train = to_categorical(y_train)

X_test = np.array([
    [char_to_int[char] for char in name] + [0] * (max_len - len(name))
    for name in test_data['Name']
])
y_test = label_encoder.transform(test_data['Gender'])
if not is_binary:
    y_test = to_categorical(y_test)

# Build and train; the vocabulary size includes one extra slot for the padding index.
input_dim = len(char_to_int) + 1
output_dim = len(label_encoder.classes_)
model = build_model(input_dim, output_dim, max_len, is_binary)
history = train_model(
    model, X_train, y_train, X_test, y_test,
    model_path=LOG_FOLDER + 'best_model.keras',
    epochs=100,
    batch_size=100,
)

# Persist the per-epoch metrics next to the checkpoint.
with open(os.path.join(LOG_FOLDER, 'model.history'), 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

In [None]:
save_model_plot(model)

#### Evaluation

In [None]:
plot_metrics(history, ['accuracy', 'precision', 'recall', 'f1_score'], title='Training and Validation Metrics')

In [None]:
data_splits = [train_very_rare, test_very_rare, train_common, test_common, train_frequently_used, test_frequently_used]
split_names = ["Train Very Rare", "Test Very Rare", "Train Common", "Test Common", "Train Frequently Used", "Test Frequently Used"]

In [None]:
evaluate_and_plot_accuracy(data_splits, split_names, char_to_int, model, label_encoder, max_len)

In [None]:
evaluate_confusion_matrix(data_splits, split_names, char_to_int, model, label_encoder, max_len)

#### Test

In [None]:
test_random_samples(test_common, "Test Common", char_to_int, model, label_encoder, max_len)

In [None]:
predict_and_evaluate(model, 'رضا', char_to_int, max_len, label_encoder)

## Hazm Based Model

### Installations

In [None]:
!sudo apt-get install -y asciidoc megatools

In [None]:
%cd '/content/drive/MyDrive/Work/Utils'

In [None]:
!curl 'https://megatools.megous.com/builds/megatools-1.10.3.tar.gz' | tar xz

In [None]:
%cd /content/drive/MyDrive/Work/Utils/megatools-1.10.3/
!./configure; make; make install

In [None]:
!megadl --path '/content/drive/MyDrive/Work/Utils' 'https://mega.nz/file/GqZUlbpS#XRYP5FHbPK2LnLZ8IExrhrw3ZQ-jclNSVCz59uEhrxY'

In [None]:
%cd '/content/drive/MyDrive/Work/Utils'
!unzip fasttext_model.zip -d '/content/drive/MyDrive/Work/Utils'

In [None]:
!ls

## FaBERT Based Model

#### ML Models

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Load the FaBERT tokenizer and masked-LM model from the Hugging Face hub.
# NOTE: this rebinds the global `model`, which until here referred to the Keras Bi-LSTM.
tokenizer = AutoTokenizer.from_pretrained("sbunlp/fabert")
model = AutoModelForMaskedLM.from_pretrained("sbunlp/fabert")

# Example name used to demonstrate the embedding pipeline
name = "علی"

# Tokenize the name into PyTorch tensors
tokens = tokenizer(name, return_tensors='pt')

print(tokens)

# Forward pass without gradient tracking, asking the model to return every layer's hidden states
with torch.no_grad():
    outputs = model(**tokens, output_hidden_states=True)

# Tuple of hidden states returned by the model
hidden_states = outputs.hidden_states

# Last entry of that tuple, i.e. the final layer's token embeddings
embeddings = hidden_states[-1]

# Mean over dim=1 (the token positions produced by the tokenizer) gives one vector for the name.
# NOTE(review): the mean covers every position in the tokenizer output — presumably including
# any special tokens it adds; confirm that is intended.
name_embedding = embeddings.mean(dim=1).squeeze()

print(name_embedding)

In [None]:
print(len(name_embedding))

In [None]:
# Keep only rows whose Gender is not 'Neutral', then pull names and labels out as parallel lists.
# NOTE: this reassigns the global `df`; later cells see the filtered frame.
df = df[df['Gender'] != 'Neutral']
names = df['Name'].tolist()
genders = df['Gender'].tolist()

def get_name_embedding(name):
    """Return one NumPy vector for ``name``: the mean of the last hidden layer over the token axis.

    Uses the module-level ``tokenizer`` and ``model``.
    """
    encoded = tokenizer(name, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded, output_hidden_states=True)
    last_layer = model_output.hidden_states[-1]
    return last_layer.mean(dim=1).squeeze().numpy()

# Embed every name (one model forward pass per name); tqdm shows progress.
embeddings = []
for name in tqdm(names, desc="Generating embeddings"):
    embedding = get_name_embedding(name)
    embeddings.append(embedding)

# Convert gender labels to numeric
# NOTE(review): any label other than 'Male'/'Female' raises KeyError here; determine_gender
# can also produce 'Missing' — confirm no such rows remain.
label_mapping = {'Male': 0, 'Female': 1}  # Adjust according to your dataset
genders = [label_mapping[gender] for gender in genders]

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(embeddings, genders, test_size=0.2, random_state=42)

# Train a Logistic Regression classifier on the embeddings
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Score the held-out 20%
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nAccuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

In [None]:
def predict_gender(name):
    """Classify ``name`` with the global ``clf`` on its FaBERT embedding; returns the label text or "Unknown"."""
    features = get_name_embedding(name)
    predicted_code = clf.predict([features])[0]
    code_to_label = {code: label for label, code in label_mapping.items()}
    return code_to_label.get(predicted_code, "Unknown")

input_name = "ایمانه"
predicted_gender = predict_gender(input_name)
print(f"The predicted gender for the name '{input_name}' is: {predicted_gender}")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

In [None]:
# Same 80/20 split and seed as the logistic-regression cell.
X_train, X_test, y_train, y_test = train_test_split(embeddings, genders, test_size=0.2, random_state=42)

# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42)
}

# The loop variables are deliberately not called `name` / `clf`: predict_gender()
# reads the global `clf`, and rebinding it here silently switched that function
# to whichever classifier happened to be trained last in this loop.
for clf_name, candidate in classifiers.items():
    print(f"\nTraining {clf_name}...")
    candidate.fit(X_train, y_train)

    y_pred = candidate.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"{clf_name} Accuracy: {accuracy:.4f}")
    print(f"{clf_name} Classification Report:")
    print(report)

In [None]:
from sklearn.ensemble import VotingClassifier

# Fresh, unfitted estimators for the ensemble; SVC needs probability=True for soft voting.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
}

# Soft voting combines the estimators' predicted class probabilities.
voting_clf = VotingClassifier(estimators=list(classifiers.items()), voting='soft')
voting_clf.fit(X_train, y_train)

y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Voting Classifier Accuracy: {accuracy:.4f}")
print("Voting Classifier Classification Report:")
print(report)


#### DNN Model

In [None]:
def encode_data(df, max_len=20):
    """Encode the 'Gender' labels and the characters of every 'Name' as integers.

    NOTE: this repeats the ``encode_data`` defined in the Bi-LSTM section; the
    two bodies are kept identical.

    Args:
        df: frame with 'Name' (strings) and 'Gender' columns.
        max_len: width of every encoded row; shorter names are right-padded
            with 0 and longer names are clipped to this many characters.

    Returns:
        ``(X, y, label_encoder, char_to_int)``: the padded integer matrix, the
        encoded labels, the fitted LabelEncoder, and the character-to-index map
        (indices start at 1; 0 is the padding value).
    """
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['Gender'])

    char_set = sorted(set(''.join(df['Name'])))
    char_to_int = {c: i + 1 for i, c in enumerate(char_set)}  # 0 is reserved for padding

    rows = []
    for name in df['Name']:
        # Clip before encoding: without this a name longer than max_len gets a
        # negative pad count, so its row is wider than the others and the
        # resulting array is ragged.
        codes = [char_to_int.get(char, 0) for char in name[:max_len]]
        rows.append(codes + [0] * (max_len - len(codes)))
    X = np.array(rows)

    return X, y, label_encoder, char_to_int

def build_bert_model(num_labels=2):
    """Load the "sbunlp/fabert" checkpoint with a sequence-classification head of ``num_labels`` outputs."""
    return AutoModelForSequenceClassification.from_pretrained("sbunlp/fabert", num_labels=num_labels)

def compute_metrics(eval_pred: EvalPrediction):
    """Compute accuracy plus weighted precision, recall and F1 from a (predictions, labels) pair.

    Predictions are reduced to class indices with arg-max over axis 1. The
    returned dict has the keys 'accuracy', 'precision', 'recall', 'f1', in that order.
    """
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)

    scores = {'accuracy': accuracy_score(labels, preds)}
    for key, scorer in (('precision', precision_score), ('recall', recall_score), ('f1', f1_score)):
        scores[key] = scorer(labels, preds, average='weighted')
    return scores

def train_model_with_train_metrics(
    model,
    train_dataset,
    val_dataset,
    tokenizer,
    model_path,
    log_folder,
    epochs=100,
    batch_size=100,
):
    """Fine-tune ``model`` with the Hugging Face Trainer, then save the model, tokenizer and log history.

    Args:
        model: sequence-classification model to train.
        train_dataset / val_dataset: datasets passed to the Trainer; the
            validation set is evaluated once per epoch.
        tokenizer: saved alongside the model in ``model_path``.
        model_path: directory for ``save_pretrained``.
        log_folder: Trainer output directory; ``model.history`` is pickled here.
        epochs: number of training epochs.
        batch_size: per-device batch size for both training and evaluation.

    Returns:
        ``(trainer, log_history, train_metrics_history)``: three values.
    """
    os.makedirs(log_folder, exist_ok=True)

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=log_folder,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            evaluation_strategy="epoch",
            logging_dir=os.path.join(log_folder, "logs"),
            logging_steps=1,
            save_total_limit=1,
            load_best_model_at_end=False,
            save_strategy="no",
        ),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Score the training set once, after training has finished.
    train_outputs = trainer.predict(train_dataset)
    final_train_scores = compute_metrics(
        EvalPrediction(predictions=train_outputs.predictions, label_ids=train_outputs.label_ids)
    )
    train_accuracy, train_precision, train_recall, train_f1 = final_train_scores.values()

    # NOTE(review): the same post-training values are repeated for every epoch
    # number, so this "history" is flat by construction, not a per-epoch measurement.
    train_metrics_history = []
    for epoch_number in range(1, epochs + 1):
        train_metrics_history.append({
            "epoch": epoch_number,
            "train_accuracy": train_accuracy,
            "train_precision": train_precision,
            "train_recall": train_recall,
            "train_f1": train_f1,
        })

    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

    log_history = trainer.state.log_history
    with open(os.path.join(log_folder, "model.history"), "wb") as file_pi:
        pickle.dump(log_history, file_pi)

    print(f"Training history saved in {log_folder}")

    return trainer, log_history, train_metrics_history

In [None]:
# Output folder for the binary FaBERT fine-tuning run.
LOG_FOLDER = FABERT_BINARY_DNN_LOG_DIR

os.makedirs(LOG_FOLDER, exist_ok=True)

# Drop 'Neutral' names for the binary task (reassigns the global `df`).
df = df[df['Gender'] != 'Neutral']

In [None]:
max_len = 20
# encode_data supplies the fitted label_encoder and the encoded labels `y`;
# its character-level X / char_to_int are not used by the tokenizer-based inputs below.
X, y, label_encoder, char_to_int = encode_data(df, max_len)
is_binary = len(np.unique(y)) == 2

# Split each Rate category 80/20.
train_very_rare, test_very_rare, train_common, test_common, train_frequently_used, test_frequently_used = split_data(df, split_frequently_used=True)

# Combine Train and Test Sets
train_data = pd.concat([train_very_rare, train_common, train_frequently_used])
test_data = pd.concat([test_very_rare, test_common, test_frequently_used])

# Load the tokenizer and BERT model
# NOTE: rebinds the globals `tokenizer` and `model` (previously the masked-LM model).
tokenizer = AutoTokenizer.from_pretrained("sbunlp/fabert")
model = AutoModelForSequenceClassification.from_pretrained("sbunlp/fabert", num_labels=2 if is_binary else len(label_encoder.classes_))

# Prepare training and testing data using BERT tokenizer
# (truncated to max_len tokens, padded, returned as PyTorch tensors)
train_encodings = tokenizer(train_data['Name'].tolist(), truncation=True, padding=True, max_length=max_len, return_tensors='pt')
test_encodings = tokenizer(test_data['Name'].tolist(), truncation=True, padding=True, max_length=max_len, return_tensors='pt')

# Token ids, attention masks and integer labels for the training split.
X_train = train_encodings['input_ids']
attention_mask_train = train_encodings['attention_mask']
y_train = torch.tensor(label_encoder.transform(train_data['Gender']))

# Same for the test split.
X_test = test_encodings['input_ids']
attention_mask_test = test_encodings['attention_mask']
y_test = torch.tensor(label_encoder.transform(test_data['Gender']))

In [None]:
class GenderDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name (e.g. ``'input_ids'``) to an
            indexable collection with one entry per sample. Values may be
            tensors or plain Python sequences.
        labels: indexable collection of labels, one per sample. May be a
            tensor or a plain Python sequence.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a UserWarning on every call;
        ``clone().detach()`` is the recommended way to copy a tensor, so that
        path is used when the value is already a tensor.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field for sample ``idx`` plus ``'labels'``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the length of ``labels``."""
        return len(self.labels)

# Create the datasets
# Note: the validation dataset is built from the test split's encodings and labels.
train_dataset = GenderDataset(encodings=train_encodings, labels=y_train)
val_dataset = GenderDataset(encodings=test_encodings, labels=y_test)

In [None]:
# Unpack all three return values: the trainer, its log history, and the
# per-epoch training metrics. `train_metrics_history` is consumed later by
# plot_metrics; unpacking into only two names raises ValueError.
trainer, history, train_metrics_history = train_model_with_train_metrics(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    tokenizer=tokenizer,
    model_path=os.path.join(LOG_FOLDER, 'best_model'),
    log_folder=LOG_FOLDER,
    epochs=5,
    batch_size=256
)

In [None]:
def predict_and_evaluate(model, tokenizer, new_name, label_encoder, max_len=128):
    """Predict the gender label for a single name, print it and return it.

    Args:
        model: sequence-classification model, called as
            ``model(input_ids, attention_mask=...)``; its output must expose
            ``.logits``.
        tokenizer: callable returning ``'input_ids'`` and ``'attention_mask'``
            tensors for a list of strings.
        new_name: the name to classify.
        label_encoder: object whose ``inverse_transform`` maps class indices
            back to label values.
        max_len: passed to the tokenizer as ``max_length`` for truncation.

    Returns:
        The decoded label of the highest-scoring class.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    encodings = tokenizer([new_name], truncation=True, padding=True, max_length=max_len, return_tensors='pt')
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    # Run the forward pass in eval mode so dropout does not perturb the
    # prediction, then restore whatever mode the caller had the model in.
    was_training = getattr(model, 'training', False)
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits
    finally:
        if was_training:
            model.train()

    predicted_class_index = torch.argmax(logits, dim=1).item()

    predicted_gender = label_encoder.inverse_transform([predicted_class_index])[0]
    print(f'The predicted gender for {new_name} is {predicted_gender}')
    return predicted_gender

In [None]:
def plot_metrics(history, train_metrics_history):
    """Plot loss, accuracy, precision, recall and F1 curves for train and eval.

    Args:
        history: iterable of log dicts. An entry containing ``'eval_loss'`` is
            read as an evaluation log and must also hold ``'epoch'``,
            ``'eval_accuracy'``, ``'eval_precision'``, ``'eval_recall'`` and
            ``'eval_f1'``. Any other entry containing ``'loss'`` is read as a
            training-loss log and must hold ``'epoch'``.
        train_metrics_history: iterable of dicts with the keys ``'epoch'``,
            ``'train_accuracy'``, ``'train_precision'``, ``'train_recall'``
            and ``'train_f1'``.

    Draws a 2x2 figure (loss, accuracy, precision, recall) followed by a
    separate F1 figure; returns nothing.
    """
    eval_keys = ('eval_loss', 'eval_accuracy', 'eval_precision', 'eval_recall', 'eval_f1')
    train_keys = ('train_accuracy', 'train_precision', 'train_recall', 'train_f1')

    # Single pass over the log history, separating training-loss entries
    # from evaluation entries.
    loss_epochs, loss_values = [], []
    eval_epochs = []
    eval_series = {key: [] for key in eval_keys}
    for entry in history:
        if 'eval_loss' in entry:
            eval_epochs.append(entry['epoch'])
            for key in eval_keys:
                eval_series[key].append(entry[key])
        elif 'loss' in entry:
            loss_epochs.append(entry['epoch'])
            loss_values.append(entry['loss'])

    # Per-epoch training metrics all share one epoch axis.
    metric_epochs = []
    train_series = {key: [] for key in train_keys}
    for record in train_metrics_history:
        metric_epochs.append(record['epoch'])
        for key in train_keys:
            train_series[key].append(record[key])

    # Plot the metrics
    plt.figure(figsize=(18, 10))

    plt.subplot(2, 2, 1)
    plt.plot(loss_epochs, loss_values, label='Train Loss', marker='o')
    plt.plot(eval_epochs, eval_series['eval_loss'], label='Eval Loss', marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training and Evaluation Loss')

    # (subplot index, eval key, train key, eval label, train label, y label, title)
    panels = (
        (2, 'eval_accuracy', 'train_accuracy', 'Eval Accuracy', 'Train Accuracy',
         'Accuracy', 'Train and Evaluation Accuracy'),
        (3, 'eval_precision', 'train_precision', 'Eval Precision', 'Train Precision',
         'Precision', 'Train and Evaluation Precision'),
        (4, 'eval_recall', 'train_recall', 'Eval Recall', 'Train Recall',
         'Recall', 'Train and Evaluation Recall'),
    )
    for position, eval_key, train_key, eval_label, train_label, y_label, title in panels:
        plt.subplot(2, 2, position)
        plt.plot(eval_epochs, eval_series[eval_key], label=eval_label, marker='o')
        plt.plot(metric_epochs, train_series[train_key], label=train_label, marker='o')
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(title)

    plt.tight_layout()
    plt.show()

    # Plot F1 scores separately
    plt.figure(figsize=(8, 5))
    plt.plot(eval_epochs, eval_series['eval_f1'], label='Eval F1 Score', marker='o')
    plt.plot(metric_epochs, train_series['train_f1'], label='Train F1 Score', marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('F1 Score')
    plt.legend()
    plt.title('Train and Evaluation F1 Score')
    plt.show()


In [None]:
# Draw the loss / accuracy / precision / recall / F1 curves for the run above.
plot_metrics(
    history=history,
    train_metrics_history=train_metrics_history,
)

In [None]:
new_name = "شهاب"
# Tokenize with the same max_len used for the training encodings rather than
# the function's default of 128, so inference truncation matches training.
predict_and_evaluate(model, tokenizer, new_name, label_encoder, max_len=max_len)