In [1]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

In [15]:
df = pd.read_csv("../data/name_gender.csv")

In [16]:
df.head()

Unnamed: 0,name,gender,prob
0,Aaban,M,1.0
1,Aabha,F,1.0
2,Aabid,M,1.0
3,Aabriella,F,1.0
4,Aada,F,1.0


In [24]:
features = df["name"]
labels = df["gender"]

In [25]:
# Preprocess and split your data
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)
print(
    f"X_train shape: {X_train.shape}\nX_test shape: {X_test.shape}\ny_train shape: {y_train.shape}\ny_test shape: {y_test.shape}"
)

X_train shape: (76020,)
X_test shape: (19005,)
y_train shape: (76020,)
y_test shape: (19005,)


In [26]:
# Encode labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [27]:
# Feature extraction
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 3))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [28]:
# Initialize models
models = {
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    "SVR": SVC(kernel="linear", probability=True),  # Since we're doing classification
    "Gradient Boosting": GradientBoostingClassifier(),
}

In [29]:
# Check for shape mismatch
assert X_train_vectorized.shape[0] == len(y_train_encoded), "Mismatched sample sizes"

In [30]:
# Train and evaluate models
for name, model in models.items():
    model.fit(X_train_vectorized, y_train_encoded)
    scores = cross_val_score(model, X_test_vectorized, y_test_encoded, cv=5)
    print(f"{name} Accuracy: {scores.mean()}")

Naive Bayes Accuracy: 0.7591160220994475
Random Forest Accuracy: 0.827466456195738
SVR Accuracy: 0.8635096027361222
Gradient Boosting Accuracy: 0.7675874769797422


In [31]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid for each model
param_grid_nb = {"alpha": [0.01, 0.1, 1, 10, 100]}

param_grid_rf = {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
}

param_grid_svc = {
    "C": [0.1, 1, 10, 100],
    "gamma": ["scale", "auto"],
    "kernel": ["linear", "rbf", "poly"],
}

param_grid_gb = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 4, 5],
}

# Initialize RandomizedSearchCV for each model
random_search_cv_nb = RandomizedSearchCV(
    MultinomialNB(), param_grid_nb, cv=5, scoring="accuracy"
)
random_search_cv_rf = RandomizedSearchCV(
    RandomForestClassifier(), param_grid_rf, cv=5, scoring="accuracy"
)
random_search_cv_svc = RandomizedSearchCV(
    SVC(), param_grid_svc, cv=5, scoring="accuracy"
)
random_search_cv_gb = RandomizedSearchCV(
    GradientBoostingClassifier(), param_grid_gb, cv=5, scoring="accuracy"
)

# Perform hyperparameter tuning and fit the models
random_search_cv_nb.fit(X_train_vectorized, y_train_encoded)
random_search_cv_rf.fit(X_train_vectorized, y_train_encoded)
random_search_cv_svc.fit(X_train_vectorized, y_train_encoded)
random_search_cv_gb.fit(X_train_vectorized, y_train_encoded)

# Get the best parameters and the best score for each model
best_params_nb = random_search_cv_nb.best_params_
best_score_nb = random_search_cv_nb.best_score_

best_params_rf = random_search_cv_rf.best_params_
best_score_rf = random_search_cv_rf.best_score_

best_params_svc = random_search_cv_svc.best_params_
best_score_svc = random_search_cv_svc.best_score_

best_params_gb = random_search_cv_gb.best_params_
best_score_gb = random_search_cv_gb.best_score_

# Print the best parameters and the best score for each model
print(f"Naive Bayes Best Params: {best_params_nb}, Best Score: {best_score_nb}")
print(f"Random Forest Best Params: {best_params_rf}, Best Score: {best_score_rf}")
print(f"SVC Best Params: {best_params_svc}, Best Score: {best_score_svc}")
print(f"Gradient Boosting Best Params: {best_params_gb}, Best Score: {best_score_gb}")

# Use the best estimator for further predictions
best_model_nb = random_search_cv_nb.best_estimator_
best_model_rf = random_search_cv_rf.best_estimator_
best_model_svc = random_search_cv_svc.best_estimator_
best_model_gb = random_search_cv_gb.best_estimator_

5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "d:\DEV WORK\Data Science Library\ML-For-Beginners\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\DEV WORK\Data Science Library\ML-For-Beginners\.venv\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "d:\DEV WORK\Data Science Library\ML-For-Beginners\.venv\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "d:\DEV WORK\Data Science Library\ML-For-Beginners\.venv\li