In [1]:
import os
import csv

import pandas as pd
import json

from utils_part2 import clean_csv, Node

## Configurations

In [2]:
# Input dataset. Alternative inputs that can be swapped in here:
#   os.path.join('.', 'input_data', 'algorithms part dataset.csv')
#   os.path.join('input_data', 'smpl.csv')
input_file_path = os.path.join(
    '.',
    'input_data',
    'algorithms part dataset smpl 10k.csv',
)

# When True, the input file is passed through clean_csv() before it is loaded.
clean_file = True

In [3]:
# Optionally clean the raw file first; from here on the notebook reads the
# cleaned copy instead of the original.
if clean_file:
    clean_csv(input_file_path)
    # NOTE(review): assumes clean_csv() writes its result next to the input
    # with a '_cln' suffix -- confirm against utils_part2.
    input_file_path = input_file_path.replace('.csv', '_cln.csv')

# Smoke test: make sure pandas can parse the file. The frame is discarded.
try:
    pd.read_csv(input_file_path)
except pd.errors.ParserError:
    # ParserError is defined in pandas.errors and is not imported by name in
    # this notebook, so it has to be referenced through `pd.errors`.
    print('Error parsing csv file')

In [4]:
# The verbose row-by-row walkthrough is only enabled for small inputs
# (file size below 1000 bytes).
run_tree_line_by_line = os.path.getsize(input_file_path) < 1000

In [5]:
# Run unit tests
!pytest

platform win32 -- Python 3.9.12, pytest-7.1.1, pluggy-1.0.0
rootdir: C:\Users\jpicao\git_repos\interview_case_studies\part2
plugins: anyio-3.5.0
collected 2 items

test_utils_part2.py ..                                                   [100%]



# `Node()` class description

## `Node()` class attributes

- `self.value`: The actual data or value in the instance. In this exercise this attribute will contain the values coming from the provided csv file (except the column names, which define the node type).
- `self.node_type`: Describes the type of information that the instance represents. In this exercise, the node_types will be the different columns in the provided csv file. 
- `self.children`: A list containing the instance children. Children are also instances of type `Node()`.
- `self.count_rows`: An integer counting the number of table rows that are represented by the sub-tree starting in the current node. If `self` is a leaf node, then `self.count_rows` is used to count the number of duplicates (i.e., `self.count_rows > 1` means duplicates were found), otherwise it is used mainly for internal validation of the tree structure.
- `self.node_hierarchy`: Defines the children node hierarchy levels of the sub-tree starting in the current node. For example, in this exercise the root node will have `self.node_hierarchy = ['ID', 'GAME_NAME', 'BEHAVIOUR', 'PLAY_PURCHASE', 'NONE']`. It is an unnecessary attribute that overcomplicates the class and quadratically increases the memory needed to store the tree, but I didn't have time to refactor.

## `Node()` class methods

- `self.add_row()`: Inserts a single table row to the tree structure.
- `self.add_table()`: Loops over the table rows and calls `self.add_row()` for each row, thus converting a complete table into a tree structure at once.
- `self.print_tree()`: Prints the tree structure to stdout.
- `self.print_table()`: Converts a tree structure into a table and prints the result. If the parameter `to` is a string, then prints the resulting table to the file path defined by `to`. The parameter `duplicates` has three possible configurations: *ignore* to leave the table as is; *drop* to drop duplicates; and *find* to return duplicates only.
- `self.tree_to_df()`: Convert the tree to a dataframe. Calls `self.print_table()` to a temporary file and reads the resulting file with pandas.read_csv().

# Examples

## Add table line by line

In [6]:
# Load the input CSV; this frame is the reference the tree output is compared against below.
df_input = pd.read_csv(input_file_path)

In [7]:
df_input

Unnamed: 0,ID,GAME_NAME,BEHAVIOUR,PLAY_PURCHASE,NONE
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0
...,...,...,...,...,...
9995,293691139,Moonbase Alpha,purchase,1.0,0
9996,293691139,Defiance,purchase,1.0,0
9997,293691139,Warframe,purchase,1.0,0
9998,293269191,Counter-Strike Global Offensive,purchase,1.0,0


In [8]:
# Discard any tree left over from a previous run of this cell.
if 'tree' in globals():
    del tree

# The table's column names define the node hierarchy of the new tree.
tree = Node(list(df_input.columns))

Tree was initialized with `self.node_type` and `self.node_hierarchy`, but the remaining attributes are empty.

In [9]:
tree.node_type

'root'

In [10]:
tree.node_hierarchy

['ID', 'GAME_NAME', 'BEHAVIOUR', 'PLAY_PURCHASE', 'NONE']

In [11]:
tree.children

[]

In [12]:
tree.value

In [13]:
tree.count_rows

0

In [14]:
tree.print_tree()

 | root | None | 0


In [15]:
# Build the tree one row at a time, for illustration only.
# Skipped unless the input file is small (see run_tree_line_by_line), because
# the whole tree is printed after every inserted row.
if run_tree_line_by_line:
    for index, row in df_input.iterrows():
        print(f"Add row from idx {index}")
        tree.add_row(row)
        tree.print_tree()

### Validate the results

#### Validate tree structure

The data must remain the same when converted to a tree and converted back to a table.

In [16]:
# Convert the tree back to a DataFrame; duplicates='ignore' leaves the table as is.
df_from_tree = tree.tree_to_df(duplicates ='ignore')

In [17]:
if run_tree_line_by_line:
    # Put both frames into the same canonical form (sorted on every column,
    # fresh 0..n-1 index) so the comparison does not depend on row order.
    actual_result = df_from_tree.sort_values(
        by=list(df_from_tree.columns)
    ).reset_index(drop=True)
    expected_result = df_input.sort_values(
        by=list(df_input.columns)
    ).reset_index(drop=True)

    # dtype, index-type and column-type checks are switched off.
    pd.testing.assert_frame_equal(
        actual_result,
        expected_result,
        check_dtype=False,
        check_index_type=False,
        check_column_type=False,
    )

#### Validate duplicates find

Compare duplicates detection with pandas.DataFrame.duplicated().

In [18]:
# duplicates='find' makes the conversion return the duplicates only.
df_dups_from_tree = tree.tree_to_df(duplicates ='find')

In [19]:
if run_tree_line_by_line:
    # Duplicates reported by the tree, sorted on every column and re-indexed.
    actual_result = df_dups_from_tree.sort_values(
        by=list(df_dups_from_tree.columns)
    ).reset_index(drop=True)

    # Reference: the rows pandas flags via DataFrame.duplicated(), ordered on
    # the same columns.
    expected_result = df_input[df_input.duplicated()].sort_values(
        by=list(df_dups_from_tree.columns)
    ).reset_index(drop=True)

    # dtype, index-type and column-type checks are switched off.
    pd.testing.assert_frame_equal(
        actual_result,
        expected_result,
        check_dtype=False,
        check_index_type=False,
        check_column_type=False,
    )

#### Validate duplicates drop

Compare duplicate removal with pandas.DataFrame.drop_duplicates().

In [20]:
# duplicates='drop' makes the conversion drop duplicates.
df_drop_from_tree = tree.tree_to_df(duplicates ='drop')

In [21]:
if run_tree_line_by_line:
    # The sort keys come from df_drop_from_tree itself, so this check does not
    # depend on df_dups_from_tree from the "find" validation.
    actual_result = (df_drop_from_tree
                     .sort_values(by=df_drop_from_tree.columns.to_list())
                     .reset_index(drop=True))

    # Reference: the input with duplicates removed by pandas, ordered on the
    # same columns as the frame under test.
    expected_result = (df_input.drop_duplicates()
                       .sort_values(by=df_drop_from_tree.columns.to_list())
                       .reset_index(drop=True))

    # dtype, index-type and column-type checks are switched off.
    pd.testing.assert_frame_equal(actual_result, expected_result,
                                  check_column_type=False,
                                  check_index_type=False,
                                  check_dtype=False)

## Add table at once

In [22]:
df_input

Unnamed: 0,ID,GAME_NAME,BEHAVIOUR,PLAY_PURCHASE,NONE
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0
...,...,...,...,...,...
9995,293691139,Moonbase Alpha,purchase,1.0,0
9996,293691139,Defiance,purchase,1.0,0
9997,293691139,Warframe,purchase,1.0,0
9998,293269191,Counter-Strike Global Offensive,purchase,1.0,0


Create tree structure from table.

In [23]:
# Discard any tree left over from the previous section or an earlier run.
if 'tree' in globals():
    del tree

# New tree whose hierarchy is given by the table's column names, filled with
# the complete table in a single call.
tree = Node(list(df_input.columns))
tree.add_table(df_input)

### Validate the results

#### Validate tree structure

The data must remain the same when converted to a tree and converted back to a table.

In [24]:
# Convert the tree back to a DataFrame; duplicates='ignore' leaves the table as is.
df_from_tree = tree.tree_to_df(duplicates ='ignore')

In [25]:
# Put both frames into the same canonical form (sorted on every column,
# fresh 0..n-1 index) so the comparison does not depend on row order.
actual_result = df_from_tree.sort_values(
    by=list(df_from_tree.columns)
).reset_index(drop=True)
expected_result = df_input.sort_values(
    by=list(df_input.columns)
).reset_index(drop=True)

# Comparison with assert_frame_equal's default settings (no checks relaxed).
pd.testing.assert_frame_equal(actual_result, expected_result)

#### Validate duplicates

Compare duplicates detection with pandas.DataFrame.duplicated().

In [26]:
# duplicates='find' makes the conversion return the duplicates only.
df_dups_from_tree = tree.tree_to_df(duplicates ='find')

In [27]:
# Duplicates reported by the tree, sorted on every column and re-indexed.
actual_result = df_dups_from_tree.sort_values(
    by=list(df_dups_from_tree.columns)
).reset_index(drop=True)

# Reference: the rows pandas flags via DataFrame.duplicated(), ordered on the
# same columns.
expected_result = df_input[df_input.duplicated()].sort_values(
    by=list(df_dups_from_tree.columns)
).reset_index(drop=True)

# dtype, index-type and column-type checks are switched off.
pd.testing.assert_frame_equal(
    actual_result,
    expected_result,
    check_dtype=False,
    check_index_type=False,
    check_column_type=False,
)

#### Validate duplicates drop

Compare duplicate removal with pandas.DataFrame.drop_duplicates().

In [28]:
# duplicates='drop' makes the conversion drop duplicates.
df_drop_from_tree = tree.tree_to_df(duplicates ='drop')

In [29]:
# The sort keys come from df_drop_from_tree itself, so this check does not
# depend on df_dups_from_tree from the "find" validation.
actual_result = (df_drop_from_tree
                 .sort_values(by=df_drop_from_tree.columns.to_list())
                 .reset_index(drop=True))

# Reference: the input with duplicates removed by pandas, ordered on the same
# columns as the frame under test.
expected_result = (df_input.drop_duplicates()
                   .sort_values(by=df_drop_from_tree.columns.to_list())
                   .reset_index(drop=True))

# dtype, index-type and column-type checks are switched off.
pd.testing.assert_frame_equal(actual_result, expected_result,
                              check_column_type=False,
                              check_index_type=False,
                              check_dtype=False)

In [30]:
# try:
#     del tree
# except NameError:
#     pass