# **Gender Guesser From Ethiopian Names**

## Import libraries

In [None]:
import pandas as pd
import string
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from google.colab import files

## Upload the names CSV file

In [None]:
# Colab-only step: prompt for a local file to upload. The CSV chosen here is
# read back by relative path ('names-gender.csv') in the next cell.
uploaded = files.upload()

Saving names-gender.csv to names-gender.csv


In [None]:
# Load the uploaded name/gender table into a DataFrame
data = pd.read_csv('names-gender.csv')

# Preview the first rows (shown as the cell's output)
data.head()

Unnamed: 0,Name,Gender
0,ABDURAHIM,Male
1,ABEL,Male
2,ABEL,Male
3,ABEL,Male
4,ABEL,Male


## **Extract features**

In [None]:
# Define functions for feature extraction
def count_vowels(name):
    """Return how many characters of *name* are vowels (a, e, i, o, u), ignoring case."""
    vowel_chars = 'aeiou'
    total = 0
    for letter in name.lower():
        if letter in vowel_chars:
            total += 1
    return total

def count_consonants(name):
    """Return how many characters of *name* are ASCII letters other than vowels, ignoring case."""
    vowel_chars = 'aeiou'
    lowered = name.lower()
    # Keep only ASCII letters that are not vowels; digits, spaces and
    # punctuation are skipped by the ascii_lowercase membership test.
    consonant_hits = [
        letter
        for letter in lowered
        if letter in string.ascii_lowercase and letter not in vowel_chars
    ]
    return len(consonant_hits)

def extract_features(name):
    """
    Build the feature dictionary describing a single name.

    Surrounding whitespace is stripped before any feature is computed.
    Missing values (``None`` or a float NaN, e.g. what pandas produces for an
    empty CSV cell) are treated as an empty name instead of raising, and any
    other non-string value is converted with ``str()``.

    :param name: The name to describe.
    :return: Dictionary with the keys ``first_letter``, ``last_letter``
        (lower-cased, ``''`` for an empty name), ``length``, ``num_vowels``,
        ``num_consonants``, ``ends_with_female_suffix`` and
        ``ends_with_male_suffix`` (the last two are 0/1 flags).
    """
    # NaN is the only float that is not equal to itself.
    if name is None or (isinstance(name, float) and name != name):
        name = ''
    name = str(name).strip()

    # Lower-case once for the suffix checks.
    lowered = name.lower()

    # NOTE: 't' already matches every name ending in 'let' or 'wit'; the longer
    # entries are kept so the suffix list stays exactly as originally authored.
    female_suffixes = ('t', 'let', 'wit', 'ltu', 'ch', "sh", "yehu",)
    male_suffixes = ('e', 'o', 'sus', 'sa', 'neh')

    features = {
        'first_letter': name[0].lower() if len(name) > 0 else '',
        'last_letter': name[-1].lower() if len(name) > 0 else '',
        'length': len(name),
        'num_vowels': count_vowels(name),
        'num_consonants': count_consonants(name),
        'ends_with_female_suffix': 1 if lowered.endswith(female_suffixes) else 0,
        'ends_with_male_suffix': 1 if lowered.endswith(male_suffixes) else 0
    }
    return features


In [None]:
# Extract features from names: one feature dictionary per row, kept as a
# Series of dicts (this Series is what the DictVectorizer consumes later)
features = data['Name'].apply(extract_features)

# Convert feature dictionaries to a DataFrame (one column per feature key);
# used here only for inspection
features_df = pd.DataFrame(features.tolist())

# Encode the target labels as integers.
# Any Gender value that is not a key of label_map (including missing values)
# maps to NaN and is then filled with 2, i.e. it is treated as 'Neutral'.
label_map = {'Male': 0, 'Female': 1, 'Neutral': 2}
data['Gender_Code'] = data['Gender'].map(label_map).fillna(2).astype(int)

# Display the features
features_df.head()


Unnamed: 0,first_letter,last_letter,length,num_vowels,num_consonants,ends_with_female_suffix,ends_with_male_suffix
0,a,m,9,4,5,0,0
1,a,l,4,2,2,0,0
2,a,l,4,2,2,0,0
3,a,l,4,2,2,0,0
4,a,l,4,2,2,0,0


In [None]:
# Initialize DictVectorizer; sparse=False makes it return a dense array
# instead of a SciPy sparse matrix
vec = DictVectorizer(sparse=False)

# Fit and transform the feature dictionaries.
# The vectorizer is fitted on every row (before the train/test split), and the
# same fitted `vec` is reused later to encode names at prediction time.
features_vectorized = vec.fit_transform(features)


In [None]:
# Define the target variable (integer gender codes built from label_map)
y = data['Gender_Code']

# Split the data (80% training, 20% testing); the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(features_vectorized, y, test_size=0.2, random_state=42)


In [None]:
# Initialize the classifier with an explicit iteration cap of 1000 for the solver
model = LogisticRegression(max_iter=1000)

# Train the model on the training split only
model.fit(X_train, y_train)


## View metrics

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the held-out split: predict first, then report overall accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Lookup from numeric code back to the gender string, for readable report rows
label_map_inverse = dict(zip(label_map.values(), label_map.keys()))

# Report only the classes that occur in the truth or the predictions, so the
# `labels` and `target_names` arguments stay aligned one-to-one
unique_labels = sorted(set(y_test).union(y_pred))
target_names = [label_map_inverse[code] for code in unique_labels]

# Per-class precision / recall / F1
report = classification_report(
    y_test,
    y_pred,
    labels=unique_labels,
    target_names=target_names,
)
print("Classification Report:")
print(report)

Model Accuracy: 0.70
Classification Report:
              precision    recall  f1-score   support

        Male       0.71      0.73      0.72     74544
      Female       0.69      0.66      0.67     65734

    accuracy                           0.70    140278
   macro avg       0.70      0.70      0.70    140278
weighted avg       0.70      0.70      0.70    140278



## **main method**

In [None]:
def guess_gender(names, model, vectorizer, label_map_inverse):
    """
    Predicts the gender for a collection of names.

    :param names: Iterable of names to predict (list, tuple, generator, ...).
    :param model: Trained scikit-learn model.
    :param vectorizer: Fitted DictVectorizer.
    :param label_map_inverse: Dictionary to map numerical labels back to gender.
    :return: Dictionary mapping names to predicted genders. Labels missing
        from ``label_map_inverse`` are reported as "Unknown". Duplicate names
        collapse to a single key. An empty input yields an empty dictionary.
    """
    # The names are walked twice (feature extraction and the final zip), so a
    # one-shot iterator must be materialized first or the result would be empty.
    names = list(names)

    # Nothing to predict: return early instead of handing an empty batch to the
    # vectorizer and the model.
    if not names:
        return {}

    features = [extract_features(name) for name in names]
    features_vectorized = vectorizer.transform(features)
    predictions = model.predict(features_vectorized)
    predicted_genders = [label_map_inverse.get(label, "Unknown") for label in predictions]
    return dict(zip(names, predicted_genders))


In [None]:
# Reverse lookup: numerical label -> gender string (inverse of label_map)
label_map_inverse = dict(zip(label_map.values(), label_map.keys()))


## **Test here**

In [94]:
# List of names to predict (letter case does not matter: the extracted
# features are computed on lower-cased text)
test_names = ['danait','gebreyesus','bontu','chala','saliya','habtamua', 'faiza', 'melat','degaga','kelbessa', 'abebech','miniyahil','nardos','betty','Biniam','nazrawit','selemon']

# Predict genders using the trained model and the already-fitted vectorizer
predictions = guess_gender(test_names, model, vec, label_map_inverse)

# Display the predictions, one "name: gender" line per name
for name, gender in predictions.items():
    print(f"{name}: {gender}")


danait: Female
gebreyesus: Male
bontu: Female
chala: Male
saliya: Female
habtamua: Female
faiza: Female
melat: Female
degaga: Male
kelbessa: Male
abebech: Female
miniyahil: Male
nardos: Male
betty: Female
Biniam: Male
nazrawit: Female
selemon: Male
