In [21]:
%config Completer.use_jedi = False

In [22]:
import os
import sys
import time
import pandas as pd
import numpy as np
import scipy
from matplotlib import pyplot as plt
import pickle
from tqdm import tqdm
import datetime

from scripts.multiple_logging import setup_logger
from scripts.utils import convert_ids_to_ordered, MovingAverage
from scripts.mnap import compute_mnap
tqdm.pandas()

In [30]:
aspects = pd.read_csv('data/aspects.csv').set_index("aspect_id")
features = pd.read_csv('data/features.csv').set_index('feature_id')
organizations = pd.read_csv('data/organisations.csv').set_index('org_id')
reviews = pd.read_csv('data/reviews.csv')
rubrics = pd.read_csv('data/rubrics.csv').set_index('rubric_id')
test_users = pd.read_csv('data/test_users.csv').set_index('user_id')
users = pd.read_csv('data/users.csv').set_index('user_id')

  exec(code_obj, self.user_global_ns, self.user_ns)


#### Preprocessing the reviews

In [29]:
reviews[reviews.rating.notna()]

Unnamed: 0,user_id,org_id,rating,ts,aspects
0,16998268288908323644,7184895086928047809,2,105,
1,12235230637700316274,11420440322893824394,3,890,17
2,2706795762761414590,11993738663105455885,3,936,
3,2522006254806484630,18200844491365025705,3,1048,6
4,2035736119522953868,7143808482480365209,5,1185,
...,...,...,...,...,...
3640830,7482061050271556596,1018922217068317904,2,378,
3640831,7482061050271556596,1018922217068317904,2,378,
3640832,8928726821194342760,12046097390037935713,4,424,
3640833,13307262877386644281,4362229515870954070,5,618,


In [28]:
reviews

Unnamed: 0,user_id,org_id,rating,ts,aspects
0,16998268288908323644,7184895086928047809,2,105,
1,12235230637700316274,11420440322893824394,3,890,17
2,2706795762761414590,11993738663105455885,3,936,
3,2522006254806484630,18200844491365025705,3,1048,6
4,2035736119522953868,7143808482480365209,5,1185,
...,...,...,...,...,...
3640830,7482061050271556596,1018922217068317904,2,378,
3640831,7482061050271556596,1018922217068317904,2,378,
3640832,8928726821194342760,12046097390037935713,4,424,
3640833,13307262877386644281,4362229515870954070,5,618,


In [4]:
reviews = reviews[reviews.rating.notna()]
reviews['rating'] = reviews['rating'].apply(int)
reviews['org_id'] = reviews['org_id'].astype(str)
reviews['user_id'] = reviews['user_id'].astype(str)
organizations.index = organizations.index.astype(str)
users.index = users.index.astype(str)
test_users.index = test_users.index.astype(str)
rubrics.index = rubrics.index.astype(str)
aspects.index = aspects.index.astype(str)
### DONT FORGET TO CONVERT OTHER DATAFRAMES TO STR

In [5]:
rated = reviews.groupby('user_id').rating.count()
good_users = rated[rated > 10].index
rated_orgs = reviews.groupby('org_id').rating.count()
good_orgs = rated_orgs[rated_orgs > 10].index

In [6]:
cleaned_reviews = reviews.query('user_id in @good_users and org_id in @good_orgs')

In [7]:
users_ordered, orgs_ordered, reviews_ordered = convert_ids_to_ordered(users, organizations, reviews)
n_users = len(users_ordered)
n_orgs = len(orgs_ordered)

In [8]:
orgs_rubrics = orgs_ordered[['rubrics_id']]
for i in range(len(rubrics)):
    orgs_rubrics[rubrics.rubric_name.iloc[i]] = orgs_rubrics['rubrics_id'].apply(lambda x: str(rubrics.index[i]) in x.split())
orgs_rubrics = orgs_rubrics.drop('rubrics_id', axis=1)
orgs_rubrics = orgs_rubrics.join(orgs_ordered['ordered_id']).set_index('ordered_id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orgs_rubrics[rubrics.rubric_name.iloc[i]] = orgs_rubrics['rubrics_id'].apply(lambda x: str(rubrics.index[i]) in x.split())


In [9]:
orgs_features = orgs_ordered[['features_id']].astype(str)
for i in range(len(features)):
    orgs_features[features.feature_name.iloc[i]] = orgs_features['features_id'].apply(lambda x: str(features.index[i]) in x.split())
orgs_features = orgs_features.drop('features_id', axis=1)
orgs_features = orgs_features.join(orgs_ordered['ordered_id']).set_index('ordered_id')

In [10]:
validation_split_day = 1050
train_reviews = reviews_ordered[reviews_ordered.ts < validation_split_day]
test_reviews = reviews_ordered[reviews_ordered.ts >= validation_split_day]

#### Convert to sparce matrix

In [11]:
import scipy.sparse as sp
last_user = max(reviews_ordered.ordered_id_user)

def sp_indeces(df):
    df['user_index'] = df['ordered_id_user']
    df['org_index'] = df['ordered_id_org'] + last_user + 1
    

sp_indeces(train_reviews)
sp_indeces(test_reviews)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_index'] = df['ordered_id_user']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['org_index'] = df['ordered_id_org'] + last_user + 1


In [12]:
def make_sparse(df):
    length = len(df)
    coo = sp.coo_matrix(
        ([1] * 2 * length, 
        (
            list(range(length)) + list(range(length)),
            list(df['org_index']) + list(df['user_index'])
        )),
        shape=(length, n_users+n_orgs)
    )
    sparse_rubrics = sp.coo_matrix(df.join(orgs_rubrics, on='ordered_id_org').loc[:, 'Булочная, пекарня':].astype(float))
    sparse_features = sp.coo_matrix(df.join(orgs_features, on='ordered_id_org').loc[:, 'karaoke':].astype(float))
    return sp.hstack([coo, sparse_rubrics, sparse_features])
    
X_train = make_sparse(train_reviews)
X_test = make_sparse(test_reviews)
y_train = train_reviews['rating']
y_test = test_reviews['rating']

In [13]:
os.makedirs("sparse_data/", exist_ok=True)

In [14]:
import scipy.sparse as sp
sp.save_npz('sparse_data/X_train.npz', X_train)
sp.save_npz('sparse_data/X_test.npz', X_test)

In [15]:
y_train.to_csv('sparse_data/y_train.csv', index=False)
y_test.to_csv('sparse_data/y_test.csv', index=False)

## Generate data for submission run

In [16]:
test_users = test_users.join(users_ordered)
msk_mask = np.array(orgs_ordered.city == 'msk')

In [17]:
orgs_ordered['other_city'] = orgs_ordered['city'].apply(lambda x: 'spb' if x == 'msk' else 'msk')

In [18]:
submission_reviews = []
for i in tqdm(range(len(test_users) // 500 + 1)):
    submission_review_batch = test_users.iloc[i*500:(i+1)*500].merge(
        orgs_ordered[['other_city', 'ordered_id']], how='inner',
        left_on='city', right_on='other_city', suffixes=('_user', '_org'))
    sp_indeces(submission_review_batch)
    submission_reviews.append(make_sparse(submission_review_batch))

100%|██████████| 34/34 [18:41<00:00, 32.99s/it]


In [19]:
os.makedirs("sparse_data/submission_batches_large/", exist_ok=True)

In [20]:
for i in tqdm(range(len(submission_reviews))):
    sp.save_npz(f'sparse_data/submission_batches_large/batch{i}.npz', submission_reviews[i])

100%|██████████| 34/34 [29:37<00:00, 52.27s/it]
