#### Import and preprocess data

In [160]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
import joblib

encoder = OneHotEncoder()
le = LabelEncoder()

In [161]:
data = pd.read_csv('genders.csv')

# Assign 'Gender' column to y
y = data[["Gender"]]
X = data.drop('Gender', axis=1)
# Encode the 'Gender' column
y = le.fit_transform(y["Gender"])
X = encoder.fit_transform(X)
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


#### Do ML training splits

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Develop models

In [163]:
# Define models
model_1 = RandomForestClassifier()
model_2 = LogisticRegression()
model_3 = svm.SVC()
model_4 = MLPClassifier(max_iter=1000)


model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)
model_3.fit(X_train, y_train)
model_4.fit(X_train, y_train)

In [164]:
# Evaluate the models
predictions_1 = model_1.predict(X_test)
print('Accuracy 1:', accuracy_score(y_test, predictions_1))

predictions_2 = model_2.predict(X_test)
print('Accuracy 2:', accuracy_score(y_test, predictions_2))

predictions_3 = model_3.predict(X_test)
print('Accuracy 3:', accuracy_score(y_test, predictions_3))

predictions_4 = model_4.predict(X_test)
print('Accuracy 4:', accuracy_score(y_test, predictions_4))

# Assuming 'model' is the machine learning model you have chosen
scores_1 = cross_val_score(model_1, X, y, cv=2)  # cv is the number of folds (k)
print('Cross-Validation Accuracy Scores 1', scores_1)

# Assuming 'model' is the machine learning model you have chosen
scores_2 = cross_val_score(model_2, X, y, cv=2)  # cv is the number of folds (k)
print('Cross-Validation Accuracy Scores 2', scores_2)

# Assuming 'model' is the machine learning model you have chosen
scores_3 = cross_val_score(model_3, X, y, cv=2)  # cv is the number of folds (k)
print('Cross-Validation Accuracy Scores 3', scores_3)

# Assuming 'model' is the machine learning model you have chosen
scores_4 = cross_val_score(model_4, X, y, cv=2)  # cv is the number of folds (k)
print('Cross-Validation Accuracy Scores 4', scores_4)

Accuracy 1: 0.5
Accuracy 2: 0.7142857142857143
Accuracy 3: 0.7142857142857143
Accuracy 4: 0.7142857142857143
Cross-Validation Accuracy Scores 1 [0.60606061 0.51515152]
Cross-Validation Accuracy Scores 2 [0.57575758 0.48484848]
Cross-Validation Accuracy Scores 3 [0.63636364 0.45454545]
Cross-Validation Accuracy Scores 4 [0.63636364 0.51515152]


In [165]:
# Step 7: Save the model
joblib.dump(model_2, 'model.pkl')
joblib.dump(le, 'le.pkl')
joblib.dump(encoder, 'encoder.pkl')

['encoder.pkl']