1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)



**Прогнозирования смены профессии**

In [2]:
#Загрузим библиотеки и посмотрим на данные
import pandas as pd
import numpy as np

df = pd.read_csv("./aug_train.csv")
df.head(3)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0


enrollee_id: уникальный идентификатор кандидата

city: Код города

city_development_index: индекс развития города (в масштабе)

gender: Пол кандидата

relvent_experience: Соответствующий опыт кандидата

enrolled_university: Тип зачисленных университетских курсов, если таковые имеются

education_level: Уровень образования кандидата

major_discipline: Обучение основной дисциплине кандидата

experience: Кандидатский общий стаж в годах

company_size: Количество сотрудников в компании текущего работодателя

company_type: Тип текущего работодателя

last_new_job: разница в годах между предыдущей работой и текущей работой

training_hours: завершенные часы обучения

target: 0 - Не ищу смены работы, 1 - Ищу смены работы

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [4]:
df['target'].value_counts()

0.0    14381
1.0     4777
Name: target, dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder

# I do this manually to explicitly tell the model that a better education & experience serves well as a trustworthy input.

# However, later we wil see the feature importanes report in SHAP and notice interesting results.
experience_dict = {'Has relevent experience' : 1,
             'No relevent experience': 0}

education_dict = {'Graduate' : 2,
             'Masters' : 1,
             'Phd' : 0}

enrollment_dict = {'no_enrollment' : 2,
             'Full time course' : 1,
             'Part time course' : 0}

gender_dict = {'Male' : 2,
             'Female' : 1,
             'Other' : 0}

discipline_dict = {'STEM' : 5,
             'Humanities' : 4,
             'Business Degree' : 3,
             'Other' : 2,
             'No Major' : 1,
             'Arts' : 0 }

company_dict = {'Pvt Ltd' : 5,
             'Funded Startup' : 4,
             'Public Sector' : 3,
             'Early Stage Startup' : 2,
             'NGO' : 1,
             'Other' : 0 }



le = LabelEncoder()
df['gender'] = df['gender'].map(gender_dict)
df['relevent_experience'] = df['relevent_experience'].map(experience_dict)
df['education_level'] = df['education_level'].map(education_dict)
df['enrolled_university'] = df['enrolled_university'].map(enrollment_dict)
df['major_discipline'] = df['major_discipline'].map(discipline_dict)
df['experience'] = le.fit_transform(df['experience'].astype(str))
df['company_size'] = le.fit_transform(df['company_size'].astype(str))
df['company_type'] = df['company_type'].map(company_dict)
df['last_new_job'] = le.fit_transform(df['last_new_job'].astype(str))


In [6]:
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,2.0,1,2.0,2.0,5.0,21,8,,0,36,1.0
1,29725,city_40,0.776,2.0,0,2.0,2.0,5.0,6,4,5.0,4,47,0.0
2,11561,city_21,0.624,,0,1.0,2.0,5.0,15,8,,6,83,0.0
3,33241,city_115,0.789,,0,,2.0,3.0,20,8,5.0,6,52,1.0
4,666,city_162,0.767,2.0,1,2.0,1.0,5.0,21,4,4.0,3,8,0.0


In [7]:
#заполним пробелы
for column in ['gender','enrolled_university','education_level','major_discipline', 'experience', 'company_size', 'company_type','last_new_job']:
    df[column].fillna(df[column].mode()[0], inplace=True)

In [8]:
df.isna().sum()

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

In [9]:
#удалим 
df = df.drop(columns=["enrollee_id"])
df = df.drop(columns=["city"])

In [10]:
df.dtypes

city_development_index    float64
gender                    float64
relevent_experience         int64
enrolled_university       float64
education_level           float64
major_discipline          float64
experience                  int32
company_size                int32
company_type              float64
last_new_job                int32
training_hours              int64
target                    float64
dtype: object

In [11]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df, df['target'], test_size=0.3, random_state=0)

In [12]:
result = []

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, classification_report
random_forest_model = RandomForestClassifier(max_depth=2, random_state= 59)
random_forest_model.fit(X_train, y_train)
y_pred_random_forest = random_forest_model.predict(X_test)
cm_random_forest = confusion_matrix(y_pred_random_forest,y_test)
fscore = f1_score(y_test,y_pred_random_forest)
acc_random_forest = accuracy_score(y_test, y_pred_random_forest)
result.append(acc_random_forest)

print("RESULTS :")
print("Random Forest Model Accuracy : ",round(acc_random_forest,2))
print("Random Forest Model F1-score : ",round(fscore,2))
print("Classification Report :",classification_report(y_test,y_pred_random_forest))
print('\n')
print('\n')

RESULTS :
Random Forest Model Accuracy :  0.99
Random Forest Model F1-score :  0.98
Classification Report :               precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      4297
         1.0       1.00      0.97      0.98      1451

    accuracy                           0.99      5748
   macro avg       0.99      0.98      0.99      5748
weighted avg       0.99      0.99      0.99      5748







In [40]:
itog = pd.DataFrame({'metod': ['RandomForest'], 'precision': [0.99],
                     'recall': [0.98],
                    'f1-score': [0.99]})
itog

Unnamed: 0,metod,precision,recall,f1-score
0,RandomForest,0.99,0.98,0.99


In [16]:
df['target'].value_counts()

0.0    14381
1.0     4777
Name: target, dtype: int64

In [20]:
#создадим копию df
df_random = df

In [21]:
#удалим стороки со значением в target = 1
df_random = df_random.drop(np.where(df['target'] == 1)[0])

In [22]:
df_random['target'].value_counts()

0.0    14381
Name: target, dtype: int64

In [23]:
df_y = df

In [24]:
df_y = df_y.drop(np.where(df['target'] == 0)[0])

In [25]:
df_y['target'].value_counts()

1.0    4777
Name: target, dtype: int64

In [27]:
X_train, X_test, y_train, y_test = train_test_split(df_random[:4777], df_y['target'], random_state=0)

In [28]:
result_n = []

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, classification_report
random_forest_model = RandomForestClassifier(max_depth=2, random_state= 59)
random_forest_model.fit(X_train, y_train)
y_pred_random_forest = random_forest_model.predict(X_test)
cm_random_forest = confusion_matrix(y_pred_random_forest,y_test)
fscore = f1_score(y_test,y_pred_random_forest)
acc_random_forest = accuracy_score(y_test, y_pred_random_forest)
result_n.append(acc_random_forest)

print("RESULTS :")
print("Random Forest Model Accuracy : ",round(acc_random_forest,2))
print("Random Forest Model F1-score : ",round(fscore,2))
print("Classification Report :",classification_report(y_test,y_pred_random_forest))
print('\n')
print('\n')

RESULTS :
Random Forest Model Accuracy :  1.0
Random Forest Model F1-score :  1.0
Classification Report :               precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      1195

    accuracy                           1.00      1195
   macro avg       1.00      1.00      1.00      1195
weighted avg       1.00      1.00      1.00      1195







In [42]:
itog = itog.append({'metod': ['RandomForest'], 'precision': [1.00],
                     'recall': [1.00],
                    'f1-score': [1.00]}, ignore_index=True)
itog

Unnamed: 0,metod,precision,recall,f1-score
0,RandomForest,0.99,0.98,0.99
1,[RandomForest],[1.0],[1.0],[1.0]
