In [1]:
import csv
import os

import pandas as pd

from utils_part2 import clean_csv, csv_remove_duplicates

## Configurations

In [2]:
# Alternative inputs -- uncomment one (and comment the active assignment) to switch dataset.
# input_file_path = os.path.join('.','input_data','algorithms part dataset.csv')
# input_file_path = os.path.join('..', 'input_data', 'smpl.csv')
# input_file_path = os.path.join('input_data', 'smpl_read.csv')

# Active input: the CSV that the rest of the notebook reads.
input_file_path = os.path.join(
    '.',
    'input_data',
    'algorithms part dataset smpl 50k.csv',
)

# When True, the following cell runs `clean_csv` on the input and switches
# `input_file_path` to the '_cln' copy.
clean_file = True

In [3]:
# Optionally clean the raw CSV, then point `input_file_path` at the cleaned copy.
# NOTE: this cell rebinds `input_file_path`. Re-run the configuration cell before
# re-running this one, otherwise the '_cln' suffix is appended a second time.
if clean_file:
    clean_csv(input_file_path)
    # NOTE(review): presumably `clean_csv` writes its result next to the input
    # with a '_cln' suffix -- confirm against utils_part2.
    input_file_path = input_file_path.replace('.csv', '_cln.csv')

# Sanity check: make sure pandas can parse the file used by the remaining cells.
try:
    pd.read_csv(input_file_path)
except pd.errors.ParserError:
    # The exception class lives in `pandas.errors`; the bare name `ParserError`
    # is not imported in this notebook, so catching it unqualified would raise
    # NameError instead of reporting the parse failure.
    print('Error parsing csv file')


In [4]:
# Keep only the input's file name (no directory part) and tag it with an
# '_output' suffix, so the result is written relative to the working directory.
output_file_path = os.path.basename(input_file_path).replace('.csv', '_output.csv')

In [5]:
# Preview the CSV at `input_file_path`, i.e. the file the de-duplication below reads.
pd.read_csv(input_file_path)

Unnamed: 0,ID,GAME_NAME,BEHAVIOUR,PLAY_PURCHASE,NONE
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0
...,...,...,...,...,...
49995,221432905,Gear Up,purchase,1.0,0
49996,178675666,Dota 2,purchase,1.0,0
49997,178675666,Dota 2,play,58.0,0
49998,160310816,Dota 2,purchase,1.0,0


In [6]:
# Run unit tests (IPython shell escape; pytest is invoked without arguments,
# so it collects tests starting from the notebook's working directory).
!pytest

platform win32 -- Python 3.9.12, pytest-7.1.1, pluggy-1.0.0
rootdir: C:\Users\jpicao\git_repos\interview_case_studies\part2
plugins: anyio-3.5.0
collected 2 items

test_utils_part2.py ..                                                   [100%]



# Algorithm

The bottleneck of the current implementation is looking up each new value in the list of unique values with a linear search: `x in list` has time complexity O(n), as expected (see this [source](https://wiki.python.org/moin/TimeComplexity)). Repeating that lookup for every row makes the whole pass O(n²) in the worst case.
Two evident improvements would be:
- use a hash table (e.g. a `set`) to record the elements already present in the list;
- keep the unique values list ordered so that it can be searched faster (although this would have an impact on the output file).

In [7]:
# Show the path that is about to be de-duplicated (the '_cln' copy when `clean_file` is set).
input_file_path

'.\\input_data\\algorithms part dataset smpl 50k_cln.csv'

# Examples

In [8]:
# De-duplicate the input CSV, writing the result to `output_file_path`.
# NOTE(review): the three returned values look like the row count, the number of
# duplicates and the positions of the duplicates -- confirm against utils_part2.
n_rows, n_dups, dups_idx = csv_remove_duplicates(
    input_file_path,
    output_file_path,
)

In [9]:
# Actual result: the file written by `csv_remove_duplicates`.
df_actual = pd.read_csv(output_file_path)

# Reference result: pandas' own de-duplication of the same input, re-indexed
# from 0 so that it lines up with a freshly read CSV.
df_expected = (
    pd.read_csv(input_file_path)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [10]:
# Raises AssertionError if the custom de-duplication differs from the pandas reference.
pd.testing.assert_frame_equal(left=df_actual, right=df_expected)

In [11]:
# Display the de-duplicated output read back from `output_file_path`.
df_actual

Unnamed: 0,ID,GAME_NAME,BEHAVIOUR,PLAY_PURCHASE,NONE
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0
...,...,...,...,...,...
49852,221432905,Gear Up,purchase,1.0,0
49853,178675666,Dota 2,purchase,1.0,0
49854,178675666,Dota 2,play,58.0,0
49855,160310816,Dota 2,purchase,1.0,0


In [12]:
# Display the pandas `drop_duplicates` reference for visual comparison.
df_expected

Unnamed: 0,ID,GAME_NAME,BEHAVIOUR,PLAY_PURCHASE,NONE
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0
...,...,...,...,...,...
49852,221432905,Gear Up,purchase,1.0,0
49853,178675666,Dota 2,purchase,1.0,0
49854,178675666,Dota 2,play,58.0,0
49855,160310816,Dota 2,purchase,1.0,0
