In [None]:
!wget https://lodmedia.hb.bizmrg.com/case_files/802656/train_dataset_train.zip
!unzip ./train_dataset_train.zip
!pip install pymorphy2
!pip install implicit -U

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import dbscan
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier, RidgeClassifier, LogisticRegressionCV,Ridge,QuantileRegressor,PassiveAggressiveClassifier
from sklearn.ensemble import ExtraTreesRegressor,ExtraTreesClassifier,RandomForestClassifier,VotingClassifier,RandomForestRegressor,GradientBoostingClassifier,StackingRegressor,BaggingClassifier
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.svm import LinearSVC,LinearSVR,SVR
from sklearn.decomposition import TruncatedSVD,PCA,FactorAnalysis,IncrementalPCA,FastICA,KernelPCA,NMF,LatentDirichletAllocation
from sklearn.preprocessing import RobustScaler,QuantileTransformer,PowerTransformer,PolynomialFeatures,KBinsDiscretizer,StandardScaler,OneHotEncoder,OrdinalEncoder,FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline,FeatureUnion,TransformerMixin
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor,LocalOutlierFactor
from sklearn.model_selection import train_test_split,ShuffleSplit,StratifiedShuffleSplit,TimeSeriesSplit,GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.dummy import DummyRegressor,DummyClassifier
from sklearn import set_config
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,roc_auc_score,accuracy_score,f1_score,classification_report
from scipy.sparse import hstack,vstack
import pymorphy2
from implicit.als import AlternatingLeastSquares

In [None]:
# Load the data (all tables are ';'-separated and every listed column is read as text)
users = pd.read_csv(
    './train/users.csv', sep=';', index_col=None,
    dtype={'age': str, 'chb': str, 'chit_type': str, 'gender': str},
)

items_dtype = {'author': str, 'bbk': str, 'izd': str, 'sys_numb': str, 'title': str, 'year_izd': str}
try:
    items = pd.read_csv('../input/tumen1/items.csv', sep=';', index_col=None, dtype=items_dtype)
except FileNotFoundError:
    # The path above only exists when that external input directory is attached.
    # NOTE(review): fall back to the directory the other two tables are read from;
    # this assumes the downloaded archive also ships items.csv - TODO confirm.
    items = pd.read_csv('./train/items.csv', sep=';', index_col=None, dtype=items_dtype)

train_transactions = pd.read_csv(
    './train/train_transactions_extended.csv', sep=';', index_col=None,
    dtype={'chb': str, 'date_1': str, 'is_printed': str, 'is_real': str,
           'source': str, 'sys_numb': str, 'type': str},
)

In [None]:
# Preprocessing and lemmatization
# Cast the bbk codes to text and turn embedded line breaks into spaces.
items['bbk'] = (
    items['bbk']
    .astype('str')
    .str.replace('\n', ' ')
)
# Morphological analyzer used by lemmatize() below.
morph = pymorphy2.MorphAnalyzer()
# Memo of word -> normal form, so a word repeated across texts is parsed only once.
_lemma_cache = {}


def lemmatize(text):
    """Return ``text`` with every whitespace-separated word replaced by its normal form.

    Each word is analysed with the module-level ``morph`` analyzer and the
    normal form of the first parse is used. Results are memoised per word.

    Parameters
    ----------
    text : str
        Text to lemmatize. Any non-string value (for example the NaN that
        pandas uses for a missing cell) is treated as an empty text.

    Returns
    -------
    str
        The normal forms joined by single spaces; ``''`` for empty,
        whitespace-only or non-string input.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN, which has no .split().
        return ''

    lemmas = []
    for word in text.split():
        lemma = _lemma_cache.get(word)
        if lemma is None:
            lemma = morph.parse(word)[0].normal_form
            _lemma_cache[word] = lemma
        lemmas.append(lemma)

    return ' '.join(lemmas)
# Lemmatized copy of every title, stored next to the original column.
items['title_l'] = [lemmatize(title) for title in items['title'].tolist()]

In [None]:
# Extract bag-of-words features from the book metadata
def _token_item_matrix(texts, **vectorizer_kwargs):
    """Return a CSR matrix with one row per vocabulary token and one column per entry of ``texts``.

    ``texts`` is cast to str first (missing values become the literal text
    'nan'), then vectorized with ``CountVectorizer(max_df=0.1, min_df=4)``;
    ``vectorizer_kwargs`` are forwarded to the vectorizer. The shape is taken
    from the fitted vocabulary and the input length rather than hard-coded,
    so the cell keeps working when the data or the vocabulary changes.
    """
    vectorizer = CountVectorizer(max_df=0.1, min_df=4, **vectorizer_kwargs)
    return csr_matrix(vectorizer.fit_transform(texts.astype('str')).T)


# NOTE(review): column j of each matrix is row j of `items`; this lines up with the
# sys_numb index mapping only if items['sys_numb'] has no duplicates - TODO confirm.
sparse_title_l = _token_item_matrix(items.title_l)
sparse_author = _token_item_matrix(items.author)
sparse_izd = _token_item_matrix(items.izd)
# Raw string: same pattern as before, without the invalid '\w' escape in a normal literal.
sparse_bbk = _token_item_matrix(items.bbk, lowercase=False, token_pattern=r'[\w|=|-]+')
sparse_year_izd = _token_item_matrix(items.year_izd)

In [None]:
# Count the unique readers and build forward / reverse lookup dictionaries for chb and sys_numb
unique_chb = train_transactions['chb'].unique()
unique_sys_numb = items['sys_numb'].unique()

n_users = len(unique_chb)

# index -> identifier, in order of first appearance
mapping_index_chb = dict(enumerate(unique_chb))
mapping_index_sys_numb = dict(enumerate(unique_sys_numb))

# identifier -> index (inverse of the dictionaries above)
mapping_chb_index = {chb_number: index for index, chb_number in mapping_index_chb.items()}
mapping_sys_numb_index = {sys_number: index for index, sys_number in mapping_index_sys_numb.items()}

In [None]:
# Split the data into training and test sets
# Only the reader / book identifier pair is kept from the transactions.
interaction_columns = ['chb', 'sys_numb']
train_transactions = train_transactions[interaction_columns]
train_data, test_data = train_test_split(
    train_transactions,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Build the extended user-item sparse matrix
def df_to_sparse(df, users, chb_index=None, sys_numb_index=None):
    """Convert an interaction table into a user-by-item CSR count matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per interaction, with 'chb' (reader) and 'sys_numb' (item) columns.
    users : object
        Unused; kept so existing two-argument calls keep working.
    chb_index, sys_numb_index : dict, optional
        Identifier -> integer index lookups. Default to the module-level
        ``mapping_chb_index`` and ``mapping_sys_numb_index``. Each mapping is
        expected to assign the indices 0..len(mapping)-1.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(chb_index), len(sys_numb_index))`` holding one
        count per interaction row. The shape is given explicitly so every
        matrix covers the full index space, even when the highest-indexed
        readers or items are absent from ``df``.

    Raises
    ------
    KeyError
        If ``df`` contains an identifier missing from the corresponding mapping.
    """
    if chb_index is None:
        chb_index = mapping_chb_index
    if sys_numb_index is None:
        sys_numb_index = mapping_sys_numb_index

    rows = [chb_index[chb] for chb in df['chb']]
    cols = [sys_numb_index[sys_numb] for sys_numb in df['sys_numb']]
    data = [1] * len(rows)

    return csr_matrix((data, (rows, cols)), shape=(len(chb_index), len(sys_numb_index)))

train_data_sparse = df_to_sparse(train_data, users)
test_data_sparse = df_to_sparse(test_data, users)

# Append the item-feature rows (one row per token) beneath the user rows.
# The year matrix is named sparse_year_izd; the previous name sparse_yd was never defined.
item_feature_blocks = (sparse_title_l, sparse_author, sparse_izd, sparse_bbk, sparse_year_izd)
# format='csr' guarantees the result supports the row indexing used when recommending.
train_data_sparse = vstack((train_data_sparse,) + item_feature_blocks, format='csr')

In [None]:
# Train the model
# random_state is fixed (same seed as the train/test split) so that repeated runs
# start from the same factor initialisation.
model = AlternatingLeastSquares(factors=1000, regularization=2.0, iterations=5, random_state=0)
model.fit(train_data_sparse)

In [None]:
# Pick recommendations for every user
def get_recom(userid, n=20):
    """Return the sys_numb identifiers of the top-``n`` items recommended to one user.

    Parameters
    ----------
    userid : int
        Row index of the user in ``train_data_sparse``.
    n : int, optional
        Number of recommendations to request from the model (default 20).

    Returns
    -------
    numpy.ndarray
        Object array of sys_numb values in the order returned by
        ``model.recommend``; items the user already has in
        ``train_data_sparse`` are filtered out by the model.
    """
    item_indices, _scores = model.recommend(
        userid, train_data_sparse[userid], N=n, filter_already_liked_items=True
    )
    # Translate the model's item indices back to sys_numb identifiers.
    return np.array(
        [mapping_index_sys_numb[item_index] for item_index in item_indices],
        dtype=object,
    )

all_rec = []

# User indices are the keys of mapping_index_chb (built from the transactions), so
# iterate over exactly that range; the length of the `users` table may differ from it.
for userid in tqdm(range(len(mapping_index_chb))):
    user_chb = mapping_index_chb[userid]
    user_rec = get_recom(userid)
    # One [chb, sys_numb] pair per recommended item.
    all_rec.extend([user_chb, rec] for rec in user_rec)



In [None]:
# Save the final prediction
# One row per (reader, recommended item) pair, written as a ';'-separated file without the index.
solution = pd.DataFrame(data=all_rec, columns=["chb", "sys_numb"])
solution.to_csv("solution.csv", sep=';', index=False)