In [1]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
df = pd.read_csv("../data/name_gender.csv")

In [4]:
df.head()

Unnamed: 0,name,gender,prob
0,Aaban,M,1.0
1,Aabha,F,1.0
2,Aabid,M,1.0
3,Aabriella,F,1.0
4,Aada,F,1.0


In [5]:
features = df["name"]
labels = df["gender"]

In [6]:
# Preprocess and split your data
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)
print(
    f"X_train shape: {X_train.shape}\nX_test shape: {X_test.shape}\ny_train shape: {y_train.shape}\ny_test shape: {y_test.shape}"
)

X_train shape: (76020,)
X_test shape: (19005,)
y_train shape: (76020,)
y_test shape: (19005,)


In [7]:
# Encode labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [8]:
# Feature extraction
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 3))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [9]:
# Initialize models
models = {
    # "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    # "SVR": SVC(kernel="linear", probability=True),  # Since we're doing classification
    # "Gradient Boosting": GradientBoostingClassifier(),
}

In [10]:
# Check for shape mismatch
assert X_train_vectorized.shape[0] == len(y_train_encoded), "Mismatched sample sizes"

In [11]:
# Train and evaluate models
for name, model in models.items():
    model.fit(X_train_vectorized, y_train_encoded)
    scores = cross_val_score(model, X_test_vectorized, y_test_encoded, cv=5)
    print(f"{name} Accuracy: {scores.mean()}")

Random Forest Accuracy: 0.8242567745330176


In [15]:
import pickle

# Assuming 'model' is your trained model
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)


In [16]:
with open('model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)


In [27]:
data = pd.read_csv("../data/dataset.csv")
data.shape

(35001, 32)

In [28]:
data['gender'].value_counts()

gender
Male       18095
Female     16906
Name: count, dtype: int64

In [29]:
# rename Male to M and Female to F
data.columns

Index(['user_id', 'age', 'education', 'gender', 'name', 'country', 'music',
       'artist_name', 'featured_artists', 'genre', 'plays',
       'artiste_popularity', 'audio_popularity', 'music_acousticness',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre', 'release_date', 'explicit', 'duration', 'music_id',
       'id_artists', 'followers'],
      dtype='object')

In [30]:
predictions = loaded_model.predict(data['name'])

ValueError: could not convert string to float: 'Danielle'

In [None]:
# from sklearn.model_selection import RandomizedSearchCV

# # Define the parameter grid for each model
# param_grid_nb = {"alpha": [0.01, 0.1, 1, 10, 100]}

# param_grid_rf = {
#     "n_estimators": [10, 50, 100, 200],
#     "max_depth": [None, 10, 20, 30],
#     "min_samples_split": [2, 5, 10],
#     "min_samples_leaf": [1, 2, 4],
#     "max_features": ["sqrt", "log2"],
# }

# param_grid_svc = {
#     "C": [0.1, 1, 10, 100],
#     "gamma": ["scale", "auto"],
#     "kernel": ["linear", "rbf", "poly"],
# }

# param_grid_gb = {
#     "n_estimators": [100, 200, 300],
#     "learning_rate": [0.01, 0.1, 0.2],
#     "max_depth": [3, 4, 5],
# }

# # Initialize RandomizedSearchCV for each model
# random_search_cv_nb = RandomizedSearchCV(
#     MultinomialNB(), param_grid_nb, cv=5, scoring="accuracy"
# )
# random_search_cv_rf = RandomizedSearchCV(
#     RandomForestClassifier(), param_grid_rf, cv=5, scoring="accuracy"
# )
# random_search_cv_svc = RandomizedSearchCV(
#     SVC(), param_grid_svc, cv=5, scoring="accuracy"
# )
# random_search_cv_gb = RandomizedSearchCV(
#     GradientBoostingClassifier(), param_grid_gb, cv=5, scoring="accuracy"
# )

# # Perform hyperparameter tuning and fit the models
# random_search_cv_nb.fit(X_train_vectorized, y_train_encoded)
# random_search_cv_rf.fit(X_train_vectorized, y_train_encoded)
# random_search_cv_svc.fit(X_train_vectorized, y_train_encoded)
# random_search_cv_gb.fit(X_train_vectorized, y_train_encoded)

# # Get the best parameters and the best score for each model
# best_params_nb = random_search_cv_nb.best_params_
# best_score_nb = random_search_cv_nb.best_score_

# best_params_rf = random_search_cv_rf.best_params_
# best_score_rf = random_search_cv_rf.best_score_

# best_params_svc = random_search_cv_svc.best_params_
# best_score_svc = random_search_cv_svc.best_score_

# best_params_gb = random_search_cv_gb.best_params_
# best_score_gb = random_search_cv_gb.best_score_

# # Print the best parameters and the best score for each model
# print(f"Naive Bayes Best Params: {best_params_nb}, Best Score: {best_score_nb}")
# print(f"Random Forest Best Params: {best_params_rf}, Best Score: {best_score_rf}")
# print(f"SVC Best Params: {best_params_svc}, Best Score: {best_score_svc}")
# print(f"Gradient Boosting Best Params: {best_params_gb}, Best Score: {best_score_gb}")

# # Use the best estimator for further predictions
# best_model_nb = random_search_cv_nb.best_estimator_
# best_model_rf = random_search_cv_rf.best_estimator_
# best_model_svc = random_search_cv_svc.best_estimator_
# best_model_gb = random_search_cv_gb.best_estimator_

In [14]:
data = pd.read_csv("../data/dataset.csv")

In [31]:
# Predict on new data using the trained SVC model
new_features = data["name"]
new_features_vectorized = vectorizer.transform(new_features)
new_predictions_encoded = model.predict(new_features_vectorized)


In [32]:
# Decode the predictions back to the original labels
new_predictions = le.inverse_transform(new_predictions_encoded)


In [33]:
#%%
# Replace existing gender column with newly predicted one
data["gender"] = new_predictions


In [34]:
data["gender"].value_counts()

gender
F    17524
M    17477
Name: count, dtype: int64

In [35]:
data.to_csv("../data/dataset_v2.csv", index=False)

In [None]:
# Save the updated DataFrame to a new CSV file
# new_df.to_csv("data/new_dataset.csv", index=False)
# Print out the first few rows to verify
df.head()

Unnamed: 0,user_id,age,education,gender,name,country,music,artist_name,featured_artists,genre,...,tempo,time_signature,track_genre,release_date,explicit,duration,music_id,id_artists,followers,ratings
0,83811,16,Undergraduate,F,Danielle,Urban,Bank Account,21 Savage,"Birdy, Zoé",Dark Trap,...,147.482666,5,Unknown,2017-09-23,True,3.67,2fQrGHiQOvpL9UgPvtYy6G,spotify:artist:1URnnhqYAYcrqrcwql10ft,440898,1.979877
1,83811,16,Undergraduate,F,Danielle,Urban,Mo Money Mo Problems (feat. Mase & Puff Daddy)...,The Notorious B.I.G.,LUDMILLA,Underground Rap,...,104.536,4,hardcore,1997-03-04,False,4.29,4INDiWSKvqSKDEu7mh8HFz,spotify:artist:5me0Irg2ANcsgc93uaYrpb,849749,4.446776
2,83811,16,Undergraduate,F,Danielle,Urban,Little Talks,Of Monsters and Men,"Ninho, Snoop Dogg, Russ, Paramore",Unknown,...,102.961,4,folk,2012-01-01,True,4.44,2ihCaVdNZmnHZWt0fvAM7B,spotify:artist:4dwdTW1Lfiq0cM8nBAqIIz,733052,4.193334
3,13397,17,Middle School,M,Angel,Non-Urban,Wherever I Go,OneRepublic,"Keith Urban, DJ Khaled, NIKI, MF DOOM",Unknown,...,99.961,4,piano,2016-12-02,True,2.83,46jLy47W8rkf8rEX04gMKB,spotify:artist:5Pwc4xIPtQLFEnJriah9YJ,766179,3.311571
4,13397,17,Middle School,M,Angel,Non-Urban,Hands To Myself,Selena Gomez,"SAINt JHN, David Bisbal, will.i.am",Unknown,...,84.918633,3,Unknown,2015-10-09,True,3.34,3CJvmtWw2bJsudbAC5uCQk,spotify:artist:0C8ZW7ezQVs4URX5aX7Kqx,399591,1.0


In [None]:
# df = pd.read_csv('data/dataset.csv')

In [None]:
# # Apply a log transformation to reduce skewness in the 'plays' column
# df["log_plays"] = np.log1p(df["plays"])
# # Initialize the MinMaxScaler to scale between 1 and 5
# scaler = MinMaxScaler(feature_range=(1, 5))

# # Fit and transform the 'log_plays' data
# df["ratings"] = scaler.fit_transform(df[["log_plays"]])

# # Drop the 'log_plays' column as it's no longer needed
# df.drop("log_plays", axis=1, inplace=True)

# # Handle missing values
# df["featured_artists"] = df["featured_artists"].fillna("None")
# df["genre"] = df["genre"].fillna("Unknown")

In [None]:
# df.to_csv("data/music_data.csv", index=False)