In [59]:
import pandas as pd
import numpy as np
import inspect

In [60]:
# Input CSV file name; passed to proc.load_csv() in the data-loading cell.
FILENAME="df_condition.csv"

# Cleaning thresholds, read as module-level globals by clean_df().
# NOTE(review): the Data Cleaning cell reassigns all three (10 / 50 / 6), so the
# values below only apply while that cell has not been executed — confirm which
# set is intended and keep a single definition.
min_num_of_records = 5
min_trackable_name_count = 50
min_num_of_dates = 2

# NOTE(review): not referenced in the visible part of the notebook; presumably
# the number of top-K items used later — confirm.
NUM_TOP_K = 1000

In [93]:


class TestData:
    @staticmethod
    def check_user_existence(df, user_id):
        """
        Check if a user with the given user_id exists in the DataFrame.

        Args:
            df (DataFrame): The DataFrame to check.
            user_id (str): The user_id to check for.

        Returns:
            bool: True if the user exists, False otherwise.
        """
        return user_id in df['user_id'].values

    @staticmethod
    def test_user_concatenation(df_clean, test_df, train_df, test_id, min_num_of_records=1):
        """
        Test the concatenation of train and test dataframes for a user.

        Args:
            df_clean (DataFrame): The clean DataFrame.
            test_df (DataFrame): The test DataFrame.
            train_df (DataFrame): The train DataFrame.
            test_id (str): The user_id to test.
            min_num_of_records (int): The minimum number of records for a valid user.

        Returns:
            bool: True if the concatenation is correct, False otherwise.
        """
        def df_id(df, test_id):
            return df.loc[df["user_id"]==test_id]

        user = df_id(df_clean, test_id)
        user_test = df_id(test_df, test_id)
        user_train= df_id(train_df, test_id)

        concatenated_user = pd.concat([user_train, user_test]).sort_index()
        return user.sort_index().equals(concatenated_user) and len(user) > min_num_of_records
    
    @staticmethod
    def run_tests(df_clean, test_df, train_df, min_num_of_records=1):
        """
        Run all tests.

        Args:
            df_clean (DataFrame): The clean DataFrame.
            test_df (DataFrame): The test DataFrame.
            train_df (DataFrame): The train DataFrame.
            min_num_of_records (int): The minimum number of records for a valid user.

        Returns:
            bool: True if all tests pass, False otherwise.
        """
        test_ids=['QEVuQwEA++z6GMJgxyjYYw0jFdXeDw==', # user with a lot of rows
                'QEVuQwEAUGIfNYeSSYSHAiACdW4/EA==', # user with a 1 row
                'QEVuQwEAmoJdC6S79c4qmNNWxXJtFA==', # user with a 2 row
                'QEVuQwEAsBPVHou+Hpfq1GeehpjG6Q==', # user NaN
                'QEVuQwEAO18wayHaoaXcyWjKktyzYA=='] # usual user

        # Проверка существования пользователей
        for test_id in test_ids:
            assert TestData.check_user_existence(df_clean, test_id)

        # Тест конкатенации пользовательских данных
        assert TestData.test_user_concatenation(df_clean, test_df, train_df, test_ids[0], min_num_of_records)
        assert not TestData.test_user_concatenation(df_clean, test_df, train_df, 'nonexistent_user_id', min_num_of_records)

        return True


In [94]:
class ALSpreparation:
    """Helpers for preparing a user-item matrix for ALS."""

    @staticmethod
    def some_ids_preparation_to_als(user_item_matrix):
        """
        Build lookup tables between original labels and positional matrix ids.

        Declared as a static method so that it can be called both on the class
        and on an instance; without the decorator the instance call would pass
        the instance itself as ``user_item_matrix`` and fail.

        Args:
            user_item_matrix (DataFrame): Matrix whose index holds the user ids
                and whose columns hold the item ids.

        Returns:
            tuple: ``(itemid_to_id, userid_to_id, id_to_itemid, id_to_userid)``,
            four dicts mapping item label -> column position,
            user label -> row position, and the two inverse mappings.
            Positions come from ``np.arange`` and are therefore numpy integers.

        Example:
            als_prep = ALSpreparation()
            itemid_to_id, userid_to_id, id_to_itemid, id_to_userid = \\
                als_prep.some_ids_preparation_to_als(user_item_matrix)
        """
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        # Positional ids: 0..n-1 in the order of the index / columns.
        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return itemid_to_id, userid_to_id, id_to_itemid, id_to_userid

In [96]:
# # Create an instance of the TestData class
# test_data = TestData()

# # Run all tests
# assert test_data.run_tests(df_clean, test_df, train_df)

In [83]:
# Matrix density
def matrix_density(df):
    """Return the sum of all cells of ``df`` divided by its cell count, in percent.

    For a 0/1 matrix this equals the percentage of filled cells.
    """
    grand_total = df.sum().sum()
    cell_count = df.shape[0] * df.shape[1]
    return grand_total / cell_count * 100

# Lookup helpers
def find_row_by_user(dff, user, by="user_id"):
    """Return the rows of ``dff`` whose ``by`` column equals ``user``."""
    is_match = dff[by] == user
    return dff.loc[is_match]

def find_row_by_user_and_condition(dff, user, cond, by="user_id"):
    """Return the rows of ``dff`` where ``by`` equals ``user`` and 'trackable_name' equals ``cond``."""
    user_matches = dff[by] == user
    condition_matches = dff["trackable_name"] == cond
    return dff.loc[user_matches & condition_matches]

def find_rows_by_column_value(df, col, value):
    """Return the rows of ``df`` whose column ``col`` equals ``value``."""
    selected = df[col] == value
    return df.loc[selected]


## Data Cleaning

**raw_data** - сырые данные 

**df** - очищенные данные

**mapping_matrix_id** - таблица соответствия user_id и numeric_user_id

**mapping_matrix_trackable** - таблица соответствия trackable_name и numeric_trackable_name


In [159]:
# Cleaning thresholds, read as module-level globals by clean_df() at call time.
# NOTE(review): these override the values assigned in the configuration cell
# near the top of the notebook (5 / 50 / 2) — confirm which set is intended and
# keep a single definition.
min_num_of_records = 10
min_trackable_name_count = 50
min_num_of_dates = 6

# Project-local helper module; `proc` is the shared processor instance used by
# clean_df() and by the data-loading cell.
import TimeSeriesProcessor
proc = TimeSeriesProcessor.TimeSeriesDataProcessor()

def clean_df(df):
    """
    Filter the raw data and replace user ids / trackable names with numeric codes.

    Relies on the module-level ``proc`` helper and on the module-level
    thresholds ``min_trackable_name_count``, ``min_num_of_records`` and
    ``min_num_of_dates``, all of which are looked up when the function is
    called.

    Args:
        df (DataFrame): Raw data. The columns referenced here are 'age', 'sex',
            'country', 'user_id', 'trackable_name', 'trackable_type',
            'checkin_date' and 'trackable_value'.

    Returns:
        tuple: ``(df_result, mapping_matrix_id, mapping_matrix_trackable)``,
        where ``df_result`` has the columns 'numeric_user_id', 'age', 'sex',
        'country', 'checkin_date', 'numeric_trackable_name',
        'trackable_value' (in that order) and the two mapping matrices are the
        first return values of ``proc.create_mapping_matrix`` for 'user_id'
        and 'trackable_name'.
    """
    # Drop rows where the user has no age/sex/country (enough data remains).
    df_complete = df.dropna(subset=['age', 'sex', 'country'])

    # Count how often each trackable_name occurs.
    df_count_trackable_name = proc.calculate_counttable_by_columnname(
        df_complete, "trackable_name"
    ).sort_values(by='count', ascending=False)

    # Keep records whose trackable_name occurs more than min_trackable_name_count times.
    # NOTE(review): the helper is named "delete_rows_with_higher_count" — confirm
    # that it keeps (rather than removes) the frequent names.
    df_clean = proc.delete_rows_with_higher_count(
        df=df_complete,
        df_count=df_count_trackable_name,
        min_count=min_trackable_name_count,
        col='trackable_name',
    )

    # Drop columns that are no longer needed.
    df_clean = proc.drop_columns_by_columnsnames(df_clean, ['count', 'trackable_type'])

    # Count records per user and keep users with count_of_records >= min_num_of_records.
    filtered_users_with_counts = proc.filter_by_min_records(df_clean, min_num_of_records)
    # Keep only the rows of the users that passed the filter.
    df_clean = df_clean[df_clean['user_id'].isin(filtered_users_with_counts["user_id"])]

    # Keep users whose number of record dates is >= min_num_of_dates.
    df_clean = proc.filter_by_min_number_of_dates(df_clean, min_num_of_dates)

    # Create mapping matrices for 'user_id' and 'trackable_name'.
    mapping_matrix_id, df_cleaned = proc.create_mapping_matrix(df_clean, 'user_id')
    # Chain on the frame returned by the previous call. Passing df_clean again
    # would discard that result, and the reindex below would then silently
    # recreate 'numeric_user_id' as an all-NaN column.
    mapping_matrix_trackable, df_cleaned = proc.create_mapping_matrix(df_cleaned, 'trackable_name')

    df_numeric = proc.drop_columns_by_columnsnames(df_cleaned, ['user_id', 'trackable_name'])

    new_order = [
        'numeric_user_id', 'age', 'sex', 'country',
        'checkin_date', 'numeric_trackable_name', 'trackable_value',
    ]
    df_numeric = df_numeric.reindex(columns=new_order)

    return df_numeric, mapping_matrix_id, mapping_matrix_trackable

In [158]:
# Load the raw CSV and clean a copy of it, so raw_data stays untouched for the
# row-count comparison below.
raw_data = proc.load_csv(FILENAME)
df, mapping_matrix_id, mapping_matrix_trackable = clean_df(raw_data.copy())

# Report how many rows survived the cleaning, as a percentage and as a count.
print(
    "Percent of saved rows = {percent:.2f}% or {num:d} rows".format(
        percent=len(df) * 100 / len(raw_data),
        num=len(df),
    )
)

Percent of saved rows = 74.07% or 823271 rows
