# Data Preprocessing

At this stage, I use the [RecTools framework](https://github.com/MobileTeleSystems/RecTools) to convert the data to the appropriate [format](https://rectools.readthedocs.io/en/stable/api/rectools.dataset.dataset.Dataset.html#rectools.dataset.dataset.Dataset).

In [2]:
import pickle
import pandas as pd
from rectools import Columns
from rectools.dataset import Dataset

## Prepare data for RecTools format

In [3]:
# Directory prefixes used by the cells below. Each one ends with '/' because
# file names are appended to it by plain string concatenation.

# Source files (u.user, u.genre, u.item and the split files) are read from here.
data_raw_dir = '../data/raw/'

# Pickles for every split except 'ua' and 'ub' are written here.
data_interim_dir = '../data/interim/'

# Pickles for the 'ua' and 'ub' splits are written here.
data_benchmark_dir = '../benchmark/data/'

In [4]:
# Every split file is named 'u<split id>.<part>'; both parts are listed for
# each split id, with 'base' immediately followed by the matching 'test'.
split_ids = ('1', '2', '3', '4', '5', 'a', 'b')
split_parts = ('base', 'test')
data_filenames = [
    f'u{split_id}.{part}'
    for split_id in split_ids
    for part in split_parts
]
data_filenames

['u1.base',
 'u1.test',
 'u2.base',
 'u2.test',
 'u3.base',
 'u3.test',
 'u4.base',
 'u4.test',
 'u5.base',
 'u5.test',
 'ua.base',
 'ua.test',
 'ub.base',
 'ub.test']

User features

In [5]:
# Column labels for the pipe-separated u.user file, passed to read_csv via names=.
user_features_names = [Columns.User, 'age', 'gender', 'occupation', 'zip_code']

# Load the user table and discard the zip code, which is not used as a feature.
user_features = (
    pd.read_csv(data_raw_dir + 'u.user', sep='|', names=user_features_names)
    .drop(columns='zip_code')
)

# Scale age by the largest age in the table, so the maximum becomes 1.0.
user_features['age'] = user_features['age'] / user_features['age'].max()

# Every remaining column except the user id is a feature.
final_user_features_names = [
    column for column in user_features.columns if column != Columns.User
]

user_features.head()

Unnamed: 0,user_id,age,gender,occupation
0,1,0.328767,M,technician
1,2,0.726027,F,other
2,3,0.315068,M,writer
3,4,0.328767,M,technician
4,5,0.452055,F,other


Item features

In [6]:
# Genre lookup table: pipe-separated rows labelled 'genre' and 'genre_id'.
genres = pd.read_csv(
    data_raw_dir + 'u.genre',
    sep='|',
    names=['genre', 'genre_id'],
)
genre_names = genres.genre.unique()
item_dates = ['release_date', 'video_release_date']
# Column labels for the whole u.item file, passed to read_csv via names=:
# id, title, the two date columns, the IMDB URL, then one column per genre.
item_features_names = ['id', 'title', *item_dates, 'IMDB_URL', *genre_names]

# Only the item id and the per-genre columns are kept as features. The title,
# date and URL columns are skipped at read time via usecols, rather than being
# loaded, date-parsed and then dropped straight away.
item_features = pd.read_csv(
    data_raw_dir + 'u.item',
    encoding='latin-1',
    sep='|',
    names=item_features_names,
    usecols=['id', *genre_names],
)
final_item_features_names = genre_names
item_features.head()

Unnamed: 0,id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [7]:
# For every split file: load the interactions, pickle the raw interactions
# frame, build a RecTools Dataset with user and item features, and pickle it.
for filename in data_filenames:
    # The datetime column is converted explicitly below with unit='s', so
    # read_csv is not asked to parse it as date strings (no parse_dates).
    df = pd.read_csv(
        data_raw_dir + filename,
        sep='\t',
        names=[*Columns.Interactions],
    )
    df[Columns.Datetime] = pd.to_datetime(df[Columns.Datetime], unit='s')
    df[Columns.Weight] = df[Columns.Weight].astype(float)

    # 'ua.*' / 'ub.*' files go to the benchmark directory, all others to the
    # interim one. Both pickles of a split share the same directory.
    output_dir = data_benchmark_dir if filename[1] in ('a', 'b') else data_interim_dir

    with open(f'{output_dir}{filename}.df.pickle', 'wb') as pickle_file:
        pickle.dump(df, pickle_file)

    # Choose only those ids that are in the interactions df
    user_features_modified = user_features.loc[user_features[Columns.User].isin(df[Columns.User])].copy()
    item_features_modified = item_features.loc[item_features['id'].isin(df[Columns.Item])].copy()

    # Squeeze user features into long form: one frame per feature with the
    # columns (id, value, feature), stacked on top of each other.
    user_features_modified = pd.concat(
        [
            user_features_modified[[Columns.User, feature]]
            .rename(columns={Columns.User: 'id', feature: 'value'})
            .assign(feature=feature)
            for feature in final_user_features_names
        ]
    )

    dataset = Dataset.construct(
        df,
        user_features_df=user_features_modified,
        cat_user_features=['gender', 'occupation'],  # Will be one-hot encoded
        item_features_df=item_features_modified,
        make_dense_item_features=True,  # Since all features are numeric
    )

    with open(f'{output_dir}{filename}.pickle', 'wb') as pickle_file:
        pickle.dump(dataset, pickle_file)