In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from spacy.lang.ru.stop_words import STOP_WORDS

In [2]:
# Read the three raw tables from ./mod_data. The order of the paths matches
# the order of the names being assigned on the left-hand side.
df_inter, df_items, df_users = map(
    pd.read_csv,
    (
        './mod_data/interactions.csv',
        './mod_data/items.csv',
        './mod_data/users.csv',
    ),
)

## Items encoding

In [3]:
# Peek at one randomly chosen item row to see the available columns.
df_items.sample(n=1)

Unnamed: 0,item_id,content_type,title,genres,age_rating,keywords
11737,5537,film,Сокровища планеты. Корсика. Сокровища острова,документальное,12,"2018, франция, сокровища, планеты, корсика, ос..."


In [4]:
# keywords extraction
def tokenize(line):
    """Split a ', '-separated string into cleaned tokens.

    Each piece is stripped of surrounding whitespace before filtering, so
    that padded duplicates (' word' vs 'word') collapse into one token and
    padded numbers ('2018 ') are still recognised as numeric.

    Dropped pieces:
      * empty strings,
      * purely numeric strings (e.g. years),
      * the literal 'nan' (presumably the string form a missing value takes
        after the ``astype('U')`` conversion applied by the callers).

    Parameters
    ----------
    line : str
        Raw comma-separated text of a single document.

    Returns
    -------
    list of str
        Remaining tokens in their original order.
    """
    stripped = (piece.strip() for piece in line.split(', '))
    return [
        word for word in stripped
        if word and not word.isnumeric() and word != 'nan'
    ]

# Keywords -> sparse TF-IDF matrix; the vocabulary is capped at 100 terms.
# Every cell is converted to a unicode string before vectorizing.
keywords_vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    token_pattern=None,  # unused because a custom tokenizer is supplied
    stop_words=list(STOP_WORDS),
    max_features=100,
)
X_keywords = keywords_vectorizer.fit_transform(
    df_items['keywords'].to_numpy().astype('U')
)

# Content type -> single binary column: 1 for 'film', 0 for anything else.
X_content_type = (
    df_items['content_type'].eq('film').astype(int).to_numpy().reshape(-1, 1)
)

# Age rating -> dense one-hot columns over the stringified rating values.
age_rating_encoder = OneHotEncoder()
X_age_rating = age_rating_encoder.fit_transform(
    df_items[['age_rating']].to_numpy().astype('U')
).toarray()

# Genres -> dense TF-IDF matrix over the comma-separated genre names
# (no vocabulary cap and no stop-word list here).
genres_vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
X_genres = genres_vectorizer.fit_transform(
    df_items['genres'].to_numpy().astype('U')
).toarray()

In [5]:
# Ten keywords with the largest TF-IDF weight summed over all items.
(
    pd.Series(
        data=np.squeeze(np.asarray(X_keywords.sum(axis=0))),
        index=keywords_vectorizer.get_feature_names_out(),
    )
    .sort_values(ascending=False)
    .head(10)
)

россия                     2646.475781
соединенные штаты          1650.497265
отношения                  1344.449214
франция                    1185.161904
сша                        1085.736725
ссср                        756.435969
любовь                      633.931426
дружба                      599.568120
соединенное королевство     495.680623
женщины                     490.716083
dtype: float64

In [6]:
# Assemble the item feature matrix column-wise, in this order:
# content-type flag, genre TF-IDF, one-hot age rating, keyword TF-IDF.
#
# X_keywords is the only block still stored as a scipy sparse matrix.
# It is densified with .toarray() (plain ndarray) rather than .todense(),
# because .todense() on a sparse matrix returns np.matrix, and np.hstack
# would then return np.matrix for the whole result. np.matrix is discouraged
# by NumPy and is inconsistent with the other blocks, which already use
# .toarray().
X_items = np.hstack([
    X_content_type,
    X_genres,
    X_age_rating,
    X_keywords.toarray()
])

In [7]:
# Sanity check: the feature matrix must have exactly one row per item,
# otherwise rows can no longer be matched back to df_items.
assert X_items.shape[0] == len(df_items), "X_items rows are misaligned with df_items"
X_items.shape

(15963, 202)

## Users encoding

In [8]:
# Peek at one randomly chosen user row to see the available columns.
df_users.sample(n=1)

Unnamed: 0,user_id,age,income,sex,kids_flg
484286,234815,age_18_24,income_40_60,Ж,1


In [10]:
# Age bucket -> dense one-hot columns over the stringified values.
age_encoder = OneHotEncoder()
X_age = age_encoder.fit_transform(
    df_users[['age']].to_numpy().astype('U')
).toarray()

# Income bucket -> dense one-hot columns over the stringified values.
income_encoder = OneHotEncoder()
X_income = income_encoder.fit_transform(
    df_users[['income']].to_numpy().astype('U')
).toarray()

# Sex -> single binary column: 1 where the value equals 'М', 0 for every
# other value (a missing value therefore also ends up as 0).
X_sex = df_users['sex'].eq('М').astype(int).to_numpy().reshape(-1, 1)

In [11]:
# Assemble the user feature matrix column-wise, in this order:
# one-hot age, one-hot income, sex flag, kids flag.
# All four blocks are 2-D, so they are joined along axis 1.
X_users = np.concatenate(
    (
        X_age,
        X_income,
        X_sex,
        df_users[['kids_flg']].to_numpy(),
    ),
    axis=1,
)

In [19]:
# Sanity check: the feature matrix must have exactly one row per user,
# otherwise rows can no longer be matched back to df_users.
assert X_users.shape[0] == len(df_users), "X_users rows are misaligned with df_users"
X_users.shape

(840197, 14)