diff --git a/buffalo/data/fileio.hpp b/buffalo/data/fileio.hpp index ad57cff..49b2da7 100644 --- a/buffalo/data/fileio.hpp +++ b/buffalo/data/fileio.hpp @@ -322,7 +322,7 @@ vector _sort_and_compressed_binarization( records.insert(end(records), begin(v), end_it); } - assert(records.size == total_lines); + assert(records.size() == total_lines); omp_set_num_threads(num_workers); diff --git a/buffalo/data/stream.py b/buffalo/data/stream.py index 3cdd283..3dc1fdd 100644 --- a/buffalo/data/stream.py +++ b/buffalo/data/stream.py @@ -105,6 +105,8 @@ def get_max_column_length(fname): with open(main_path) as fin: for line in log.ProgressBar(level=log.DEBUG, iterable=fin): data = line.strip().split() + if not data: + continue if not iid_path: itemids |= set(data) @@ -246,7 +248,7 @@ def _create_working_data(self, db, stream_main_path, itemids, for col in train_data: w.write(f"{user} {col} 1\n") for col in vali_data: - vali_lines.append(f"{user} {col} {val}") + vali_lines.append(f"{user} {col} 1") else: for col, val in Counter(train_data).items(): w.write(f"{user} {col} {val}\n")