# Отработка модели #

In [1]:
import pickle

import sqlalchemy as sa
from sqlalchemy.orm import declarative_base, Session

import pandas as pd
import numpy as np

##### pip install category_encoders
import category_encoders as ce

##### pip install scikit-learn
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
##### from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

##### pip install imblearn
from imblearn.over_sampling import SMOTE

In [2]:
# Temporary SQLite database that receives the model output.
# The relative path means the `data/` directory must exist next to the notebook.
MODEL_DB_PATH = 'sqlite:///data/mod_1.sqlite3'
MODEL_ENGINE = sa.create_engine(MODEL_DB_PATH)

# Declarative base class that the ORM model of this notebook inherits from.
ModelBase = declarative_base()

class Model_Base(ModelBase):
    """ORM mapping of the ``Students`` table.

    Besides the surrogate key ``p_key`` the table declares thirteen text/string
    columns and one float column, ``fin_pred``.
    """

    __tablename__ = 'Students'

    # Surrogate auto-incremented primary key.
    p_key = sa.Column(sa.Integer, primary_key=True, autoincrement=True, unique=True, nullable=False)

    # Text / string columns, declared in the order they appear in the table.
    date = sa.Column(sa.String)
    id = sa.Column(sa.Text)
    utc = sa.Column(sa.String)
    steck = sa.Column(sa.Text)
    spec = sa.Column(sa.Text)
    role = sa.Column(sa.Text)
    role_in = sa.Column(sa.Text)
    hour_per_week = sa.Column(sa.Text)
    other_courses = sa.Column(sa.Text)
    time_of_studies = sa.Column(sa.Text)
    notes = sa.Column(sa.Text)
    language = sa.Column(sa.String)
    in_chat = sa.Column(sa.String)

    # Floating-point prediction value.
    fin_pred = sa.Column(sa.Float)

# Create the table on first use (a no-op when it already exists).
ModelBase.metadata.create_all(MODEL_ENGINE)

# Empty the table so that re-running the notebook does not accumulate rows.
# Only database errors are caught (a bare `except:` would also swallow
# KeyboardInterrupt and programming mistakes), and the cause is printed so a
# failure is not silently hidden behind a generic message.
try:
    with Session(MODEL_ENGINE) as s_:
        # Bulk delete of every row; the session holds no loaded objects, so no
        # in-session synchronisation is required.
        s_.query(Model_Base).delete(synchronize_session=False)
        s_.commit()
    print('Table cleared')
except sa.exc.SQLAlchemyError as err_:
    print('Something wrong:', err_)

Table cleared


In [3]:
# Load the raw input table; the file uses '^' as the field separator.
ini_df = pd.read_csv('data/anketa_new.csv', sep='^')
# Print column names, non-null counts and dtypes of the loaded frame.
ini_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   date             322 non-null    object
 1   id               322 non-null    object
 2   utc              322 non-null    object
 3   steck            321 non-null    object
 4   spec             322 non-null    object
 5   role             322 non-null    object
 6   role_in          322 non-null    object
 7   hour_per_week    322 non-null    object
 8   other_courses    322 non-null    object
 9   time_of_studies  322 non-null    object
 10  notes            99 non-null     object
 11  language         205 non-null    object
 12  in_chat          322 non-null    object
 13  out              31 non-null     object
dtypes: object(14)
memory usage: 35.3+ KB


In [4]:
# Remove the 'out' column before the frame is passed to the model.
# errors='ignore' keeps the cell idempotent: re-running it after the column has
# already been removed no longer raises KeyError.
ini_df = ini_df.drop(columns=['out'], errors='ignore')

In [5]:
# Re-inspect the frame after the column drop above.
ini_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   date             322 non-null    object
 1   id               322 non-null    object
 2   utc              322 non-null    object
 3   steck            321 non-null    object
 4   spec             322 non-null    object
 5   role             322 non-null    object
 6   role_in          322 non-null    object
 7   hour_per_week    322 non-null    object
 8   other_courses    322 non-null    object
 9   time_of_studies  322 non-null    object
 10  notes            99 non-null     object
 11  language         205 non-null    object
 12  in_chat          322 non-null    object
dtypes: object(13)
memory usage: 32.8+ KB


# Новая модель #

In [6]:
def prediction(row, model_dir='data'):
    """Score input rows with the pickled LR + RF -> decision-tree stack.

    Features are engineered from the raw columns, scaled for the logistic
    regression, and the positive-class probabilities of the logistic regression
    and the random forest (plus their thresholded classes) are fed to a decision
    tree. The tree's positive-class probability is returned as ``fin_pred``.

    Parameters
    ----------
    row : pandas.DataFrame
        Raw rows (despite the name, a whole frame). The columns ``date``,
        ``role``, ``role_in``, ``hour_per_week``, ``steck``, ``spec``, ``notes``,
        ``language`` and ``in_chat`` are accessed by name. The frame itself is
        not modified.
    model_dir : str, default 'data'
        Directory that contains ``scaler.pkl``, ``model_lr.pkl``,
        ``model_rf.pkl`` and ``model_dt.pkl``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``row`` (same index, same columns) with an additional
        ``fin_pred`` column.

    Notes
    -----
    The feature engineering below presumably has to stay identical to the code
    the pickled models were trained with, so several suspicious spots are kept
    as they are and only marked with ``NOTE(review)``. Fix them together with
    the training code and retrain.
    """
    import string  # function-scope import: the notebook has no module-level `import string`

    def load_pickle(file_name):
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(f'{model_dir}/{file_name}', 'rb') as pkl_file:
            return pickle.load(pkl_file)

    data = row.drop(['in_chat'], axis=1)  # drop the uninformative feature

    # --- Date and time -----------------------------------------------------
    data['date'] = pd.to_datetime(data['date'], dayfirst=True)
    data['day_name'] = data['date'].dt.day_name()
    data['day_num'] = data['date'].dt.day_of_week
    data['hour_of_day'] = data['date'].dt.hour

    to_fill = {
        'day_name': 'Friday',
        'day_num': data['day_num'].median(),
        'hour_of_day': data['hour_of_day'].median()
        }
    data = data.fillna(to_fill)

    def get_period(x):
        # NOTE(review): the strict comparisons send hours 11, 17 and 23 (and 0)
        # to 'night'; this looks unintended but is kept to match training.
        if 0 < x < 11:
            return 'morning'
        elif 11 < x < 17:
            return 'day'
        elif 17 < x < 23:
            return 'evening'
        else:
            return 'night'

    data['period_of_day'] = data['hour_of_day'].apply(get_period)

    # --- Programming language ------------------------------------------------
    # Fill the gaps. The original call discarded the result of fillna(), so the
    # fill never happened. The derived binary features are unaffected because
    # neither 'nan' nor the placeholder contains any language name.
    data['language'] = data['language'].fillna('нуль')

    def get_java(x):
        # NOTE(review): an exact 'js' answer is relabelled as 'java', so the
        # 'js' flag below is only set for longer answers that contain 'js'.
        if str(x).lower() == 'js':
            return 'java'
        else:
            return str(x).lower()

    data['language'] = data['language'].apply(get_java)

    languages = [
        'python',
        'js',
        'java',
        'c#',
        'golang',
        'php',
        'c++',
        'flutter',
        'qa',
        'sql'
        ]

    for lang in languages:  # one 0/1 substring flag per language
        data[lang] = data['language'].apply(lambda x: 1 if lang in str(x).lower() else 0)

    # --- Role ----------------------------------------------------------------
    # Ordered (keywords, label) rules; the first rule with a matching substring wins.
    # NOTE(review): 'UX' can never match because the text is lower-cased first.
    role_rules = (
        (('backend', 'back', 'бэк'), 'backend'),
        (('frontend', 'front', 'фронт'), 'frontend'),
        (('аналит', 'analys', 'ba'), 'аналитик'),
        (('ds', 'scient'), 'data scientist'),
        (('дизайн', 'design', 'UX'), 'дизайнер'),
        (('project', 'проект'), 'project manager'),
        (('android',), 'android'),
        (('ios',), 'ios'),
        (('full', 'develop', 'разраб', 'программ'), 'fullstack'),
        (('админ',), 'системный администратор'),
        (('dev',), 'devops'),
        (('qa', 'тест', 'test'), 'тестировщик'),
    )

    def get_role(x):
        text = str(x).lower()
        for keywords, label in role_rules:
            if any(keyword in text for keyword in keywords):
                return label
        return 'other'

    data['role_in_new'] = data['role_in'].apply(get_role)
    data['test_role'] = data['role'].apply(get_role)

    # 1 when both role answers normalise to the same label, else 0.
    # Every label returned by get_role is already lower-case, so a plain
    # comparison is equivalent to the former row-wise lower-cased one.
    data['compar_role'] = (data['test_role'] == data['role_in_new']).astype('int64')

    # --- Hours available for practice ----------------------------------------
    def get_time(x):
        # Returns None (a missing value in the feature) for any other answer.
        if '10+ часов' in x:
            return 0
        if '20+ часов' in x:
            return 1
        if 'готов работать 25/8' in x:
            return 2

    data['time_par_week'] = data['hour_per_week'].apply(get_time)

    # --- Free-text columns `steck` and `spec` ---------------------------------
    strip_table = str.maketrans('', '', string.punctuation + '\n')

    for text_col in ('steck', 'spec'):
        # Fill gaps, remove punctuation/newlines, then split into words.
        data[text_col] = (
            data[text_col]
            .fillna('unknown')
            .apply(lambda text: text.translate(strip_table).split())
        )

    def length(words):
        # NOTE(review): the original loop returned during its first iteration,
        # so the bucket depends on the character length of the FIRST word only,
        # and the ranges leave gaps (lengths 0, 1, 3 and 5 map to 0). Kept
        # unchanged to match training; an empty word list yields None.
        if not words:
            return None
        first_len = len(words[0])
        if 1 < first_len <= 2:
            return 1
        if 3 < first_len <= 4:
            return 2
        if 5 < first_len <= 7:
            return 3
        if first_len > 7:
            return 4
        return 0

    # Missing buckets become 0 for both columns (previously only steck_count
    # was filled, leaving a possible missing value in `spec`); the cast keeps
    # the columns numeric so they are not removed with the object columns below.
    data['steck_count'] = data['steck'].apply(length)
    data['spec'] = data['spec'].apply(length).fillna(0).astype('int64')
    data['steck_count'] = data['steck_count'].fillna(0).astype('int64')

    # --- Encoding and clean-up ------------------------------------------------
    # NOTE(review): the dummy columns and the binary encoder are derived from
    # the rows being scored, not from the training data, so the resulting
    # feature set only matches the pickled models when the same categories are
    # present - confirm against the training notebook.
    data = pd.get_dummies(data, columns=['day_name', 'period_of_day'])

    bin_encoder = ce.BinaryEncoder(cols=['role', 'role_in'])
    role_bits = bin_encoder.fit_transform(data[['role', 'role_in']])
    data = pd.concat([data, role_bits], axis=1)

    data = data.drop(['hour_of_day', 'qa', 'role_in', 'notes', 'date'], axis=1)

    # Whatever is still of object dtype is raw text and is not a model feature.
    object_columns = [s for s in data.columns if data[s].dtypes == 'object']
    data = data.drop(object_columns, axis=1)

    # --- Level-1 models ---------------------------------------------------------
    scaler = load_pickle('scaler.pkl')
    model_lr = load_pickle('model_lr.pkl')
    model_rf = load_pickle('model_rf.pkl')

    threshold_lr_rf = 0.45  # class cut-off applied to both level-1 probabilities

    data_s = scaler.transform(data)  # only the logistic regression gets scaled input

    y_lr_pred = pd.Series(model_lr.predict_proba(data_s)[:, 1])
    y_rf_pred = pd.Series(model_rf.predict_proba(data)[:, 1])

    # Column names and order are what the level-2 model is given.
    test = pd.DataFrame({
        'rf_pred': y_rf_pred,
        'rf_class': y_rf_pred.gt(threshold_lr_rf).astype('int64'),
        'lr_pred': y_lr_pred,
        'lr_class': y_lr_pred.gt(threshold_lr_rf).astype('int64')
        })

    # --- Level-2 model ----------------------------------------------------------
    model_dt = load_pickle('model_dt.pkl')
    y_dt_pred = model_dt.predict_proba(test)[:, 1]

    # Attach the prediction positionally. The former index-aligned concat plus a
    # hard-coded column rename produced misaligned rows for a non-default index
    # and mislabelled columns for a different column order.
    df = row.copy()
    df['fin_pred'] = y_dt_pred
    return df

In [7]:
# Score the whole input frame; the result carries the extra `fin_pred` column.
new_df = prediction(ini_df)

In [8]:
# Check that `fin_pred` was added and has no missing values.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             322 non-null    object 
 1   id               322 non-null    object 
 2   utc              322 non-null    object 
 3   steck            321 non-null    object 
 4   spec             322 non-null    object 
 5   role             322 non-null    object 
 6   role_in          322 non-null    object 
 7   hour_per_week    322 non-null    object 
 8   other_courses    322 non-null    object 
 9   time_of_studies  322 non-null    object 
 10  notes            99 non-null     object 
 11  language         205 non-null    object 
 12  in_chat          322 non-null    object 
 13  fin_pred         322 non-null    float64
dtypes: float64(1), object(13)
memory usage: 35.3+ KB


In [9]:
# Persist the scored rows in the Students table.
# The column list is taken from the ORM model (everything except the
# auto-generated key), so the insert cannot drift from the table definition.
columns_ = [col_.name for col_ in Model_Base.__table__.columns if col_.name != 'p_key']

# Convert the frame to plain Python records once instead of doing one `.iloc`
# lookup per cell. Missing values are turned into None so that they are written
# as NULL explicitly rather than passing float NaN to the driver.
frame_ = new_df[columns_]
records_ = frame_.astype(object).where(frame_.notna(), None).to_dict('records')

# NOTE: re-running this cell alone appends the same rows again; the table is
# only emptied by the clearing cell near the top of the notebook.
with Session(MODEL_ENGINE) as s_:
    s_.add_all(Model_Base(**record_) for record_ in records_)
    s_.commit()

In [10]:
# Sanity check: read every stored row back and print how many there are.
with Session(MODEL_ENGINE) as s_:
    out = s_.query(Model_Base).all()
    print(len(out))

322
