In [4]:
import pandas as pd
import joblib

## **Загрузка датасета**

Загрузим набор данных из файла `test_data.csv`.

In [8]:
# Read the test set from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='test_data.csv')

In [9]:
# Preview the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0,user_id,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,region_id,browser,os
0,c2802dadd33d8ae09bb366bdd41212ea,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,36e3f3,chrome,android
1,e5b1988db74527ec092f28b0bbfdaac9,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,8ccc01,chrome,android
2,6ef1eedbdb72554e53e69782066065c5,-7307,11682,9741,13564,13577,1200,10169,16461,-3932,3340,1fbfa5,chrome,android
3,7e057293ecae62985a327b7af51858ea,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,f66ff,chrome,android
4,a27bd7ce8828497823fa8d5d05e7bbf7,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,245864,chrome,android
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97497,6f107249e2712bbbb0a7f570ba9df354,17213,5295,11609,6110,6486,6537,13182,9876,4621,-13610,unknown,chrome,android
97498,ebd1057e99ffdc05159a04b603bf7853,11702,-4540,10118,-5,9443,3388,-2603,8941,2579,24818,36e3f3,chrome,android
97499,ee0e160fa1d9e81dc4c2282264c5008e,-3467,13781,5375,8672,11148,6967,16862,14771,-1637,6310,33ed7a,yandex,android
97500,0467ee1c793c11e62d912f5ac89437d7,13459,-6413,13160,13187,5498,3875,13246,10259,-1630,-14647,other,chrome,android


## **Предсказание пола пользователей**

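Для начала загрузим обученную модель и кодировщик. Файлы `encoder.joblib`, `model.joblib` и `random_forest_from_scratch.py` должны находиться в рабочей директории; если их нет, раскомментируйте ячейки с `wget` ниже.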

In [20]:
# Optional: uncomment to download the encoder artifact into the working directory.
# ! wget https://raw.githubusercontent.com/kasim-04/user-gender-prediction/main/models/encoder.joblib

In [21]:
# Optional: uncomment to download the model artifact into the working directory.
# ! wget https://raw.githubusercontent.com/kasim-04/user-gender-prediction/main/models/model.joblib

In [22]:
# Optional: uncomment to download the module with the from-scratch random forest code.
# NOTE(review): presumably this module must be importable for joblib to
# unpickle model.joblib — confirm.
# ! wget https://raw.githubusercontent.com/kasim-04/user-gender-prediction/main/src/random_forest_from_scratch.py

In [23]:
# Restore the encoder and the classifier from the working directory.
# NOTE: joblib files are pickle-based, so only load artifacts from a trusted source.
encoder = joblib.load(filename='encoder.joblib')
classifier = joblib.load(filename='model.joblib')

Для агрегации предсказаний по пользователю напишем функцию `predict`.

In [24]:
def predict(model, users, X, threshold=0.5):
    """
    Predict one binary class per user by aggregating sample probabilities with the median.

    Parameters
    ----------
    model : object
        A trained classifier implementing the `predict_proba(X)` method.
        The result may be one-dimensional (one positive-class probability
        per sample) or two-dimensional, in which case the last column is
        taken as the positive-class probability (assumes the scikit-learn
        column layout for binary classifiers).
    users : array-like of shape (n_samples,)
        User identifiers corresponding to each sample.
    X : array-like of shape (n_samples, n_features)
        Feature matrix passed to `model.predict_proba`.
    threshold : float, optional, default=0.5
        A user is assigned class 1 when the median probability of their
        samples is greater than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        One row per distinct user, ordered by `user_id`, with the columns:
        - `user_id`: the user identifier.
        - `target`: the predicted binary class (0 or 1) based on the median probability.
    """
    y_prob = model.predict_proba(X)

    # A 2-D result cannot be stored in a single DataFrame column, so reduce
    # it to the positive-class column first. 1-D results pass through as is.
    if getattr(y_prob, 'ndim', 1) == 2:
        y_prob = y_prob[:, -1]

    results = pd.DataFrame({'user_id': users, 'target': y_prob})

    # Collapse the per-sample probabilities into one median value per user.
    results = results.groupby('user_id')['target'].median().reset_index()

    # Vectorised thresholding; the boundary value itself maps to class 1.
    results['target'] = (results['target'] >= threshold).astype('int64')

    return results

Подготовим данные для предсказания.

In [26]:
# Keep the identifiers aside for the per-user aggregation; the remaining
# columns form the feature matrix handed to the model.
users = df['user_id']
X = df.drop('user_id', axis='columns')

In [27]:
# Encode the categorical features with the pre-fitted encoder.
# The input is read from the untouched `df` rather than from `X`, so re-running
# this cell encodes the raw values again instead of the already-encoded ones.
categorical_cols = ['region_id', 'browser', 'os']
X[categorical_cols] = encoder.transform(df[categorical_cols])

Выполним предсказание пола.

In [28]:
# Aggregate the per-sample probabilities into one prediction per user,
# using the function's default threshold.
results_df = predict(model=classifier, users=users, X=X)

In [29]:
# Preview the per-user predictions; the rendered output elides the middle rows.
results_df

Unnamed: 0,user_id,target
0,000098f8aa8b68a125148caff0a02827,1
1,0000eae7d06e8c6033731d1c1ba0c382,0
2,0000f7fa001a8d49b8e83b91a1215e17,1
3,0001c39409954fe4b4148a23154fa905,0
4,00029eeaef3671876a5d119acff7e792,0
...,...,...
84995,fffbf68a27011fbe76a7a856649c888f,1
84996,fffbf8879e9a31b944237fc24895aead,0
84997,fffc4301db1d1760389be359a13bc740,1
84998,fffd7f18a1234ea36ad7ceeeb50cf7ab,0


Сохраним результаты.

In [30]:
# Write the per-user predictions to disk without the positional index column.
results_df.to_csv(path_or_buf='results.csv', index=False)