In [9]:
import pandas as pd

# Load the labelled names and drop every row that has a missing value.
# Chained, non-mutating calls are used instead of `inplace=True`.
df = pd.read_csv('../data/mm_names.csv').dropna()

# Normalise case only. An earlier version chained `.replace(' ', '_')` here,
# but that is Series.replace, which matches whole cell values rather than
# substrings, so it never changed the spaces inside a name. Spaces are kept
# deliberately: the word-level unigram/bigram vectorizer used for training
# splits names on them, and names passed at prediction time are
# space-separated too.
df['Name'] = df['Name'].str.lower()

# Encode the target as integers: Male -> 0, Female -> 1.
# Fail fast on any other label instead of letting `map` turn it into NaN.
gender_codes = {'Male': 0, 'Female': 1}
unexpected_labels = set(df['Gender']) - set(gender_codes)
if unexpected_labels:
    raise ValueError(f"Unmapped Gender labels: {unexpected_labels!r}")
df['Gender'] = df['Gender'].map(gender_codes)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Hold out 20% of the names for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['Name'], df['Gender'], test_size=0.2, random_state=42
)

# Model: n-gram counts -> TF-IDF weighting -> multinomial naive Bayes.
pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),  # Include unigrams and bigrams
    ('tfidf', TfidfTransformer()),  # Apply TF-IDF transformation
    ('clf', MultinomialNB(alpha=0.1)),  # alpha is the smoothing parameter
])

# Fit the pipeline on the training split only.
pipeline.fit(X_train, y_train)

# Evaluate on the held-out names.
# NOTE(review): the accuracy recorded for this cell (~0.48) is below what
# always predicting the majority class would give on a two-class target --
# the features or labels probably need revisiting; TODO confirm.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

# Predict the gender of a single name. The result gets its own variable so
# that `y_pred` keeps holding the test-set predictions.
name = "May May"  # Replace with the name you want to predict
gender_names = {0: 'male', 1: 'female'}  # inverse of the 0/1 target encoding
predicted_label = pipeline.predict([name])[0]
gender = gender_names[int(predicted_label)]

print("Predicted gender:", gender)

Accuracy Score: 0.48262910798122066
Predicted gender: female
