In [35]:
import numpy as np
import pandas as pd
import os
import re
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from catboost import CatBoostClassifier, CatBoostRegressor, CatBoost, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from pymorphy2 import MorphAnalyzer
from nltk import sent_tokenize, word_tokenize, regexp_tokenize


In [36]:
%%time
# Directory that holds train.csv / test.csv. Set the HH_DATA_DIR environment
# variable to run the notebook on another machine; without it the original
# location is used, so the default paths are unchanged.
DATA_DIR = os.environ.get('HH_DATA_DIR', 'E:/juniper_notebooks/Task2_HH/input')

# low_memory=False makes pandas read each file in one pass instead of in
# chunks, which avoids mixed-dtype columns at the cost of more memory.
train = pd.read_csv(DATA_DIR + '/train.csv', low_memory=False)
test = pd.read_csv(DATA_DIR + '/test.csv', low_memory=False)

Wall time: 1min 57s


In [37]:
%%time
# Analyzer shared by every call that does not pass its own `morph`;
# it is built once, when this cell is executed.
_DEFAULT_MORPH = MorphAnalyzer()
# token -> normal form, filled lazily. Only used together with _DEFAULT_MORPH,
# so results of a caller-supplied analyzer are never mixed in.
# The cache is unbounded; it grows with the number of distinct tokens seen.
_LEMMA_CACHE = {}


def tokenize_n_normalize(sent, pat=r"(?u)\b\w\w+\b", morph=_DEFAULT_MORPH):
    """Split a string into tokens and replace each token by its normal form.

    Tokens are the matches of ``pat`` found by ``regexp_tokenize``; the
    default pattern keeps runs of two or more word characters. For every
    token the first parse returned by ``morph.parse`` is taken and its
    ``normal_form`` is used.

    Parameters
    ----------
    sent : str
        Text to tokenize.
    pat : str
        Regular expression passed to ``regexp_tokenize``.
    morph : analyzer object
        Object exposing ``parse(token)``; defaults to one shared
        ``MorphAnalyzer`` instance.

    Returns
    -------
    list of str
        Normal forms, in the order the tokens occur in ``sent``.
    """
    tokens = regexp_tokenize(sent, pat)

    # A caller-supplied analyzer may give different answers, so bypass the cache.
    if morph is not _DEFAULT_MORPH:
        return [morph.parse(tok)[0].normal_form for tok in tokens]

    # The same words repeat constantly across documents; parsing each distinct
    # token only once gives the same result with far fewer analyzer calls.
    lemmas = []
    for tok in tokens:
        lemma = _LEMMA_CACHE.get(tok)
        if lemma is None:
            lemma = morph.parse(tok)[0].normal_form
            _LEMMA_CACHE[tok] = lemma
        lemmas.append(lemma)
    return lemmas

def text_encoding(train: pd.DataFrame, test: pd.DataFrame, f):
    """Turn one free-text column into TF-IDF matrices for train and test.

    The two frames are stacked (train rows first), missing values in column
    ``f`` become empty strings, and every text is rewritten as the
    space-joined output of ``tokenize_n_normalize``. A single
    ``TfidfVectorizer(min_df=0.002)`` is then fitted on the stacked column,
    so the vocabulary and IDF weights come from train and test together.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames that contain column ``f``. They are not modified.
    f : str
        Name of the text column to encode.

    Returns
    -------
    (train_matrix, test_matrix)
        Row slices of the fitted matrix: the first ``len(train)`` rows and
        the remaining rows.
    """
    n_train = len(train)
    stacked = pd.concat([train, test], sort=False).reset_index(drop=True)

    # Normalise every document to a string of lemmas before vectorizing.
    documents = stacked[f].fillna('').map(
        lambda raw: " ".join(tokenize_n_normalize(raw))
    )

    tfidf = TfidfVectorizer(min_df=0.002)
    word_matrix = tfidf.fit_transform(documents)

    # Undo the stacking: train rows come first, test rows follow.
    return word_matrix[:n_train], word_matrix[n_train:]

# Encode the 'responsibilities' text column; newtrain / newtest are the
# train and test row blocks of one TF-IDF matrix fitted on both frames.
newtrain, newtest = text_encoding(train, test, 'responsibilities')

Wall time: 1h 57min 38s
Parser   : 127 ms


In [38]:
def label_encoding(train: pd.DataFrame, test: pd.DataFrame, col_definition,
                   force_categorical: bool = False):
    """Encode a set of columns with one ``DictVectorizer`` shared by train and test.

    The frames are stacked (train rows first), missing values in the selected
    columns are replaced by ``''``, and each row is handed to ``DictVectorizer``
    as a ``{column: value}`` record.

    Gotcha: ``DictVectorizer`` only one-hot encodes *string* values. Numeric
    (and boolean) values are passed through as the feature's magnitude, so a
    numeric-coded categorical column ends up as a single numeric feature (plus
    a ``column=`` indicator for the rows that were missing). Pass
    ``force_categorical=True`` to cast every value to ``str`` first, which
    makes every selected column one-hot encoded.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames containing the columns to encode. They are not modified.
    col_definition : list of str
        Names of the columns to encode.
    force_categorical : bool, default False
        If True, values are converted to strings before vectorizing. The
        default keeps the previous behaviour.

    Returns
    -------
    (train_matrix, test_matrix)
        Row slices of the fitted matrix: the first ``len(train)`` rows and
        the remaining rows.
    """
    n_train = len(train)
    combined = pd.concat([train, test], sort=False).reset_index(drop=True)

    features = combined[col_definition].fillna('')
    if force_categorical:
        # Strings are the only values DictVectorizer expands to indicators.
        features = features.astype(str)

    enc = DictVectorizer()
    encoded = enc.fit_transform(features.to_dict('records'))

    # Undo the stacking: train rows come first, test rows follow.
    return encoded[:n_train], encoded[n_train:]

In [39]:
%time
categorical_cols = ['requirements_id_priority_category',
                    'is_uzbekistan_recruitment',
                    'source',
                    'industry',
                    'education_requirements_education_type',
                    'experience_requirements',
                    'id_hiring_organization',
                    'profession',
                    'region']

col_train, col_newtest = label_encoding(train, test, categorical_cols)

Wall time: 0 ns


In [40]:
# Put the text features and the categorical features side by side.
# format='csr' gives both results the same row-sliceable format; without it
# hstack returns COO, which supports neither row slicing nor indexing.
X_train = hstack((newtrain, col_train), format='csr')
X_test = hstack((newtest, col_newtest), format='csr')

In [41]:
# Make sure X_train is CSR so the row slices taken later
# (X_train[:num_t], X_train[-40000:]) are supported; hstack returns COO
# unless a format is requested.
X_train = X_train.tocsr()

In [1]:
%%time
# Only the first num_t rows of the training matrix are used for fitting.
num_t = 800000

# 90 % of those rows are fitted on; 10 % are held back as the eval set that
# drives early stopping and best-iteration selection.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_train[:num_t],
    train['mean_salary'][:num_t],
    test_size=0.10,
    random_state=0,
)
train_pool = Pool(data=X_train1, label=y_train1)
valid_pool = Pool(data=X_test1, label=y_test1)

# NOTE(review): learning_rate is very close to 1.0, which is unusually high
# for boosting — confirm this is intentional.
# NOTE(review): no loss_function is set here, so MAE is only the evaluation
# metric — confirm that the default objective is what is wanted.
catboost_params = dict(
    learning_rate=0.9999,
    eval_metric='MAE',
    iterations=20000,
    use_best_model=True,
    verbose=False,
    early_stopping_rounds=100,
    depth=6,
)
model = CatBoostRegressor(**catboost_params)
model.fit(train_pool, eval_set=valid_pool, plot=True)


NameError: name 'train_test_split' is not defined

In [43]:
# Score the model on the last 40000 training rows.
# NOTE(review): the model was fitted on rows taken from the first 800000, so
# this slice is only unseen data if the frame has at least 840000 rows — confirm.
holdout_rows = slice(-40000, None)
y_pred = model.predict(X_train[holdout_rows])
y_test = train['mean_salary'][holdout_rows]

In [44]:
# Mean absolute error of the predictions on the hold-out slice.
holdout_mae = mean_absolute_error(y_test, y_pred)
print(holdout_mae)

5172.396911516417


In [25]:
# 5547 1.5  -- NOTE(review): presumably an earlier result (MAE of about 5547) for a run with a setting of 1.5, likely learning_rate; confirm or remove.

In [45]:
# Predict for the test matrix. This overwrites the hold-out predictions
# stored in y_pred by the earlier cell.
y_pred = model.predict(X_test)

In [46]:
# Location of sample_submission.csv. Set the HH_DATA_DIR environment variable
# to run on another machine; without it the original location is used.
data_dir = os.environ.get('HH_DATA_DIR', 'E:/juniper_notebooks/Task2_HH/input')

# The sample file supplies the ids (as the index) and the expected layout;
# its mean_salary column is replaced by the model's predictions.
sub = pd.read_csv(data_dir + '/sample_submission.csv', index_col='id')
sub['mean_salary'] = y_pred
sub.to_csv('submission.csv')