# Отработка модели #

In [2]:
import pickle

import pandas as pd
import numpy as np

##### pip install category_encoders
import category_encoders as ce

##### pip install scikit-learn
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

##### pip install imblearn
from imblearn.over_sampling import SMOTE

In [3]:
# Load the raw questionnaire export; the file uses '^' as the field separator.
ini_df = pd.read_csv('data/anketa_new.csv', sep='^')
# Summarise column dtypes and non-null counts of the freshly loaded frame.
ini_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   date             322 non-null    object
 1   id               322 non-null    object
 2   utc              322 non-null    object
 3   steck            321 non-null    object
 4   spec             322 non-null    object
 5   role             322 non-null    object
 6   role_in          322 non-null    object
 7   hour_per_week    322 non-null    object
 8   other_courses    322 non-null    object
 9   time_of_studies  322 non-null    object
 10  notes            99 non-null     object
 11  language         205 non-null    object
 12  in_chat          322 non-null    object
 13  out              31 non-null     object
dtypes: object(14)
memory usage: 35.3+ KB


In [5]:
# Inspect the raw target column before recoding (value_counts skips NaN by default).
ini_df['out'].value_counts()

out
Выбыл    31
Name: count, dtype: int64

In [6]:
# Recode the target to 0/1: 1 for rows marked 'Выбыл', 0 for everything else
# (including missing values). Accepting an already-recoded 1 as well keeps the
# cell idempotent: re-running it no longer resets every label to 0.
ini_df['out'] = ini_df['out'].isin(['Выбыл', 1]).astype(int)

In [7]:
# Check the class balance of the target after recoding it to 0/1.
ini_df['out'].value_counts()

out
0    291
1     31
Name: count, dtype: int64

In [8]:
# Re-check dtypes and non-null counts after the target column was recoded.
ini_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   date             322 non-null    object
 1   id               322 non-null    object
 2   utc              322 non-null    object
 3   steck            321 non-null    object
 4   spec             322 non-null    object
 5   role             322 non-null    object
 6   role_in          322 non-null    object
 7   hour_per_week    322 non-null    object
 8   other_courses    322 non-null    object
 9   time_of_studies  322 non-null    object
 10  notes            99 non-null     object
 11  language         205 non-null    object
 12  in_chat          322 non-null    object
 13  out              322 non-null    int64 
dtypes: int64(1), object(13)
memory usage: 35.3+ KB


### Дата-время ###

In [9]:
def to_predict(row):
    """Turn the raw questionnaire frame into a resampled feature matrix.

    Steps performed, in order:
      1. parse ``date`` and derive weekday / hour / period-of-day features;
      2. build 0/1 flags for each known programming language;
      3. normalise ``role`` / ``role_in`` to coarse labels and flag whether
         they agree (``compar_role``);
      4. encode ``hour_per_week`` as an ordinal (``time_par_week``);
      5. one-hot encode weekday and period, binary-encode the roles, and
         drop every remaining object column;
      6. make a stratified 80/20 split and oversample the training part
         with SMOTE.

    Args:
        row: DataFrame holding at least the columns ``date``, ``language``,
            ``role``, ``role_in``, ``hour_per_week``, ``notes`` and ``out``.
            The frame is copied and is not modified.

    Returns:
        Tuple ``(X_train_s, y_train_s)``: the SMOTE-resampled training
        features and target.

    NOTE(review): despite the name, the result is the oversampled *training*
    split, and the binary encoder is re-fitted on the data passed in.
    Confirm this is what the downstream ``predict_proba`` call should see.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    data = row.copy()

    # --- Date / time features ---
    data['date'] = pd.to_datetime(data['date'], dayfirst=True)
    data['day_name'] = data['date'].dt.day_name()
    data['day_num'] = data['date'].dt.day_of_week
    data['hour_of_day'] = data['date'].dt.hour

    # Rows whose date is missing get a fixed weekday name and median numerics.
    to_fill = {
        'day_name': 'Friday',
        'day_num': data['day_num'].median(),
        'hour_of_day': data['hour_of_day'].median()
        }
    data = data.fillna(to_fill)

    def get_period(x):
        """Map an hour of the day to a coarse period label."""
        # Lower bounds are inclusive so that hours 11 and 17 no longer fall
        # through to 'night' (the original used strict '<' on both sides).
        # NOTE(review): a model fitted with the old binning should be refitted.
        if 0 < x < 11:
            return 'morning'
        if 11 <= x < 17:
            return 'day'
        if 17 <= x < 23:
            return 'evening'
        return 'night'

    data['period_of_day'] = data['hour_of_day'].apply(get_period)

    # --- Programming languages ---

    # fillna returns a new Series; the original discarded the result.
    data['language'] = data['language'].fillna('нуль')

    def get_java(x):
        """Lower-case the answer and map an answer that is exactly 'js' to 'java'."""
        lowered = str(x).lower()
        return 'java' if lowered == 'js' else lowered

    data['language'] = data['language'].apply(get_java)

    languages = [
        'python',
        'js',
        'java',
        'c#',
        'golang',
        'php',
        'c++',
        'flutter',
        'qa',
        'sql'
        ]

    # One 0/1 column per language: plain substring match on the normalised
    # answer (regex=False so that 'c++' is taken literally).
    for lang in languages:
        data[lang] = data['language'].str.contains(lang, regex=False).astype(int)

    # --- Roles ---

    # Ordered (keywords, label) rules; the first rule with a matching keyword
    # wins, so the order below is significant.
    role_rules = (
        (('back', 'бэк'), 'backend'),
        (('front', 'фронт'), 'frontend'),
        (('аналит', 'analys', 'ba'), 'аналитик'),
        (('ds', 'scient'), 'data scientist'),
        # 'ux' must be lower-case: it is matched against a lower-cased string.
        (('дизайн', 'design', 'ux'), 'дизайнер'),
        (('project', 'проект'), 'project manager'),
        (('android',), 'android'),
        (('ios',), 'ios'),
        (('full', 'develop', 'разраб', 'программ'), 'fullstack'),
        (('админ',), 'системный администратор'),
        (('dev',), 'devops'),
        (('qa', 'тест', 'test'), 'тестировщик'),
    )

    def get_role(x):
        """Collapse a free-text role description into a coarse role label."""
        text = str(x).lower()
        for keywords, label in role_rules:
            if any(keyword in text for keyword in keywords):
                return label
        return 'other'

    data['role_in_new'] = data['role_in'].apply(get_role)
    data['test_role'] = data['role'].apply(get_role)

    # 1 when both normalised roles agree, else 0 (labels are already lower-case).
    data['compar_role'] = (data['test_role'] == data['role_in_new']).astype(int)

    # --- Weekly practice time ---

    def get_time(x):
        """Encode the weekly-hours answer as 0/1/2; None if no option matches."""
        if '10+ часов' in x:
            return 0
        if '20+ часов' in x:
            return 1
        if 'готов работать 25/8' in x:
            return 2

    data['time_par_week'] = data['hour_per_week'].apply(get_time)

    # --- Cleaning and encoding ---

    df = pd.get_dummies(data, columns=['day_name', 'period_of_day'])
    df = df.drop(['hour_of_day', 'qa', 'role_in', 'notes'], axis=1)

    # Binary-encode the raw role columns and append the resulting bit columns.
    bin_encoder = ce.BinaryEncoder(cols=['role', 'role_in'])
    role_bits = bin_encoder.fit_transform(data[['role', 'role_in']])
    df = pd.concat([df, role_bits], axis=1)

    # Drop every remaining free-text (object) column.
    object_columns = [s for s in df.columns if df[s].dtypes == 'object']
    df = df.drop(columns=object_columns)

    X = df.drop(['out', 'date'], axis=1)
    y = df['out']

    # Only the training part of the split is used below.
    X_train, _, y_train, _ = model_selection.train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )

    # Oversample the training part with SMOTE.
    sm = SMOTE(random_state=42)
    X_train_s, y_train_s = sm.fit_resample(X_train, y_train)

    return X_train_s, y_train_s

In [10]:
# Build the feature matrix and target from the loaded questionnaire frame.
X, y = to_predict(ini_df)

  data['date'] = pd.to_datetime(data['date'], dayfirst=True)


In [11]:
# Load the previously fitted model from disk.
# SECURITY: pickle.load can execute arbitrary code — only load files from a trusted source.
# NOTE(review): the recorded output links to scikit-learn's model-persistence
# limitations page, which suggests a library-version mismatch warning — confirm
# the pickle was produced with the installed scikit-learn version.
with open('data/test_pipe.pkl', 'rb') as pkl_file:
    test_model = pickle.load(pkl_file)
test_model

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [12]:
# Probability of the positive class (column 1 of predict_proba) for every row of X.
# NOTE(review): the recorded run fails with a ValueError because the model was
# fitted with features 'ja' and 'sum_out' that X does not contain — the feature
# engineering used here must be brought in line with the one used at fit time.
predict = pd.DataFrame(test_model.predict_proba(X)[:, 1])

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- ja
- sum_out
