# Check Original Data CV

In [1]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

DIR_DATA_RAW = '../data/raw'
DIR_DATA_INTERIM = '../data/interim'

In [2]:
def get_filepaths(directory):
    """
    This function will generate the file names in a directory 
    tree by walking the tree either top-down or bottom-up. For each 
    directory in the tree rooted at directory top (including top itself), 
    it yields a 3-tuple (dirpath, dirnames, filenames).
    """
    file_paths = [] 

    for root, directories, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)

    return file_paths 

def load_data(DIR_DATA_RAW, start, end):
    """Load data and label from the given directory path
    
    **Args**:
    * DIR_DATA_RAW (string): Path to the raw directory
    
    **Return**:
    * data (dataframe) 
    * label (dataframe)
    """
    file_paths = get_filepaths(DIR_DATA_RAW)
    file_paths = [path for path in file_paths if (path.endswith("Inputs") or path.endswith("Targets"))]
    file_paths.sort()
    
    print("Loading data from: ")
    for x in file_paths[start:end]:
        print(x)
        
    data = pd.DataFrame()
    label = pd.DataFrame()

    for i in range(0, len(file_paths[:4]), 2):
        cur_data = pd.read_csv(file_paths[i])
        cur_label = pd.read_csv(file_paths[i+1], header=None)

        data = pd.concat([data, cur_data])
        label = pd.concat([label, cur_label])
        
    label.columns = ['label']
    data = data.reset_index(drop=True)
    label = label.reset_index(drop=True)
    return data, label

def remove_inconsistent_and_duplicates(data, label):
    """ Remove inconsistent and duplicated data
        Keeping the first occurence
    """
    data = data.copy()
    label = label.copy()
    data['label'] = label['label']

    check_duplicate = data.duplicated(keep='first')
    data = data.loc[~check_duplicate]
    label = label.loc[~check_duplicate]
    data.drop('label', axis=1, inplace=True)

    check_duplicate = data.duplicated(keep=False)
    data = data.loc[~check_duplicate]
    label = label.loc[~check_duplicate]

    return data, label

In [3]:
data1, label1 = load_data(DIR_DATA_RAW, 0, 4)
data1, label1 = remove_inconsistent_and_duplicates(data1, label1)

Loading data from: 
../data/raw/DataminingContest2009.Task1.CV1.Test.Inputs
../data/raw/DataminingContest2009.Task1.CV1.Test.Targets
../data/raw/DataminingContest2009.Task1.CV1.Train.Inputs
../data/raw/DataminingContest2009.Task1.CV1.Train.Targets


In [4]:
data2, label2 = load_data(DIR_DATA_RAW, 4, 8)
data2, label2 = remove_inconsistent_and_duplicates(data2, label2)

Loading data from: 
../data/raw/DataminingContest2009.Task1.CV10.Test.Inputs
../data/raw/DataminingContest2009.Task1.CV10.Test.Targets
../data/raw/DataminingContest2009.Task1.CV10.Train.Inputs
../data/raw/DataminingContest2009.Task1.CV10.Train.Targets


In [5]:
data1.equals(data2)

True

In [6]:
label1.equals(label2)

True

# Double check, Combine Data1 & Data2 then remove duplicate

In [7]:
data3 = pd.concat([data1, data2])
label3 = pd.concat([label1, label2])

In [8]:
print(data1.shape)
print(data2.shape)
print(data3.shape)
print(data3.duplicated().sum())

(87429, 19)
(87429, 19)
(174858, 19)
87429


In [9]:
data3, label3 = remove_inconsistent_and_duplicates(data3, label3)

In [10]:
data3.shape

(87429, 19)

In [11]:
data3.equals(data1)

True