In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from utils import *
from gender_classifier_pipeline import GenderClassifierPipeline


# Single shared helper object for the whole notebook: later cells call its
# get_data_pipeline(), fine_tune_pipeline(), get_model_pipeline() and
# save_model_pipeline() methods. Constructed with no arguments.
classifier_pipeline = GenderClassifierPipeline()

In [7]:
# Load the working dataset, preferring a cached archive over rebuilding it.
# get_zip_if_exists is handed the current working directory; presumably it
# looks there for an archive previously written by save_df_to_zip and returns
# None when there is none -- confirm in utils.
zipped_dataset = get_zip_if_exists(os.getcwd())

if zipped_dataset is not None:
    dataset = load_df_from_zip(zipped_dataset)
else:
    # The raw CSV location is machine-specific. Allow it to be overridden via
    # the OKCUPID_PROFILES_CSV environment variable so the notebook can run on
    # other machines; the original path remains the fallback so the existing
    # setup keeps working unchanged.
    raw_csv_path = os.environ.get(
        'OKCUPID_PROFILES_CSV',
        'C:\\Users\\johnp\\Downloads\\profiles.csv',
    )

    # Keep the raw frame under its own name instead of overwriting `dataset`,
    # so `dataset` only ever refers to the transformed data.
    raw_profiles = pd.read_csv(raw_csv_path)
    dataset = classifier_pipeline.get_data_pipeline().fit_transform(raw_profiles)

    # Cache the transformed frame so later runs can take the branch above.
    save_df_to_zip(df=dataset, filename='ok_cupid')

In [17]:
# Features are the 'text' column, the classification target is the 'sex' column.
X = dataset['text']
y = dataset['sex']

# Split settings named explicitly so they are easy to find and tune.
TEST_SIZE = 0.2      # fraction of rows held out for evaluation
RANDOM_STATE = 42    # fixed seed so the split is reproducible

# Stratify on the target so the train and test sets keep the same class
# proportions as the full dataset; an unstratified split can skew the class
# balance between the two sets and distort the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

In [20]:
# Candidate hyper-parameter values passed to the tuning step.
# The 'model__' key prefix presumably targets a pipeline step named 'model'
# (scikit-learn's <step>__<param> convention) -- confirm in
# GenderClassifierPipeline.
distributions = dict(
    # 10, 20, ..., 100
    model__n_estimators=np.arange(10, 101, 10),
    # None followed by 5, 10, 15, 20
    model__max_depth=[None, *np.arange(5, 21, 5)],
    # 2 through 10
    model__min_samples_split=np.arange(2, 11),
    # 1 through 4
    model__min_samples_leaf=np.arange(1, 5),
)

# Run the search on the training split only; the result is used as the
# parameter set for the final model pipeline.
best_params = classifier_pipeline.fine_tune_pipeline(
    params=distributions,
    X_train=X_train,
    y_train=y_train,
)

In [21]:
# Build the model pipeline configured with the tuned parameters, fit it on the
# training split, then predict labels for the held-out test split.
model_pipeline = classifier_pipeline.get_model_pipeline(params=best_params)
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)

# Compare true vs. predicted test labels. evaluate_prediction is not defined in
# this notebook (presumably it comes from the utils star import); what it
# reports or returns is not visible here.
evaluate_prediction(y_test, y_pred)

In [23]:
# Persist the fitted pipeline under the name 'ok_cupid_gender_classifier'.
# The storage format and destination are decided inside save_model_pipeline
# and are not visible from this notebook.
classifier_pipeline.save_model_pipeline(model_pipeline=model_pipeline, 
                                        filename='ok_cupid_gender_classifier')