In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import glob
import pandas as pd

# Load every comma-separated text file under data/ and keep only the name and
# gender columns of the first MAX_ROWS records.
#
# glob.glob() returns matches in arbitrary, filesystem-dependent order, so the
# paths are sorted: otherwise the row slice below could select a different
# subset of records on each machine or run.
MAX_ROWS = 100000

text_files = sorted(glob.glob('data/*.txt'))
if not text_files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("No input files matched 'data/*.txt'")

# `names=` is supplied, so the first line of each file is read as data rather
# than as a header row. `usecols` skips parsing the columns that are dropped
# anyway.
column_names = ['Al', 'Gender', 'Dob', 'Name', 'Age']
dfs = [
    pd.read_csv(file, sep=',', names=column_names, usecols=['Name', 'Gender'])
    for file in text_files
]

combined_df = pd.concat(dfs, ignore_index=True)
combined_df = combined_df[['Name', 'Gender']]
combined_df = combined_df[:MAX_ROWS]
print(combined_df)

           Name Gender
0          Mary      F
1         Annie      F
2          Anna      F
3      Margaret      F
4         Helen      F
...         ...    ...
99995    Hallie      F
99996     Jamie      F
99997     Jayda      F
99998     Jessa      F
99999   Johanna      F

[100000 rows x 2 columns]


In [3]:
# Train a Gaussian Naive Bayes classifier that maps a name to a gender label,
# using token counts of the name as features, then report hold-out accuracy.
x = combined_df['Name']
y = combined_df['Gender']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# The vocabulary is learned from the training split only; the test split is
# merely transformed with it. Both matrices are converted to dense arrays
# before being handed to the estimator.
vectorizer = CountVectorizer()
X_train_dense = vectorizer.fit_transform(X_train).toarray()
X_test_dense = vectorizer.transform(X_test).toarray()

# `model` is the bare estimator (not a pipeline): callers must vectorize and
# densify their input with `vectorizer` before calling `model.predict`.
model = GaussianNB()
model.fit(X_train_dense, y_train)

y_pred = model.predict(X_test_dense)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9414


In [7]:
# Spot-check the fitted classifier on a few hand-picked names.
new_names = ['Moses', 'Emily', 'Michael']

# Encode the names with the already-fitted vectorizer and convert the result
# to a dense array before predicting.
new_names_sparse = vectorizer.transform(new_names).toarray()
predictions = model.predict(new_names_sparse)

# Report one line per name, pairing each input with its predicted label.
for idx, name in enumerate(new_names):
    print(f"Predicted gender for {name}: {predictions[idx]}")


Predicted gender for Moses: M
Predicted gender for Emily: F
Predicted gender for Michael: M


In [8]:
from flask import Flask, request, jsonify
# Flask application object; the /predict route below is registered on it.
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict_gender():
    """Predict a gender label for each name in a JSON POST body.

    The request body is expected to be a JSON object of the form
    ``{"names": ["Emily", "Michael"]}``.

    Returns:
        On success, a JSON list of ``{"name": ..., "gender": ...}`` objects in
        the same order as the submitted names (an empty list when ``names`` is
        missing or empty). When the body is not a JSON object, or ``names`` is
        not a list of strings, a JSON ``{"error": ...}`` object with HTTP
        status 400.
    """
    data = request.get_json()
    # A body such as `null` or `[...]` parses as JSON but has no `.get`;
    # reject it explicitly instead of failing with an AttributeError (HTTP 500).
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    names = data.get('names', [])
    # The vectorizer needs an iterable of strings; anything else would raise
    # inside transform() and surface as an HTTP 500.
    if not isinstance(names, list) or not all(isinstance(name, str) for name in names):
        return jsonify({'error': "'names' must be a list of strings."}), 400

    # Nothing to classify: answer directly rather than passing a zero-row
    # array to the estimator, which rejects empty input.
    if not names:
        return jsonify([])

    names_dense = vectorizer.transform(names).toarray()
    predictions = model.predict(names_dense)

    # Cast each label to a plain str so it serializes as JSON regardless of
    # the scalar type the estimator returns.
    results = [
        {'name': name, 'gender': str(prediction)}
        for name, prediction in zip(names, predictions)
    ]

    return jsonify(results)
# Start Flask's built-in server when this code runs as the main module.
# Per the recorded output, it listens on http://127.0.0.1:5000 with debug mode
# off and keeps running until interrupted with CTRL+C.
if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [09/Apr/2024 00:48:51] "POST /predict HTTP/1.1" 200 -


['Success', 'Emily', 'Michael']
