In [3]:
import pandas as pd
import warnings
import os
import csv
import numpy as np
warnings.filterwarnings("ignore", category=FutureWarning)

In [4]:
# Read the complete Walmart sales table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/full_data/walmart_sales.csv")
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Type,Size
0,1,1,2010-02-05,24924.5,42.31,2.572,,,,,,211.096358,8.106,False,A,151315
1,1,2,2010-02-05,50605.27,42.31,2.572,,,,,,211.096358,8.106,False,A,151315
2,1,3,2010-02-05,13740.12,42.31,2.572,,,,,,211.096358,8.106,False,A,151315
3,1,4,2010-02-05,39954.04,42.31,2.572,,,,,,211.096358,8.106,False,A,151315
4,1,5,2010-02-05,32229.38,42.31,2.572,,,,,,211.096358,8.106,False,A,151315


In [5]:
def split_file(input_file: str, output_dir: str, num_parts: int) -> None:
    """Split a CSV file into ``num_parts`` smaller CSV files.

    The first line of ``input_file`` is treated as the header and is repeated
    at the top of every part. All remaining lines are data rows; they are cut
    into ``num_parts`` consecutive slices of ``len(rows) // num_parts`` rows
    each, with the last part also receiving the remainder. The field at
    position 3 (zero-based) is removed from the header and from every row
    before writing.

    Parts are written to ``output_dir`` (created if missing) as
    ``test_part_1.csv`` ... ``test_part_<num_parts>.csv``.

    Args:
        input_file: Path of the CSV file to split.
        output_dir: Directory that receives the part files.
        num_parts: Number of part files to produce; must be at least 1.

    Raises:
        ValueError: If ``num_parts`` is smaller than 1.

    Note:
        Lines are split on plain commas, so quoted fields that themselves
        contain commas are not supported.
    """
    if num_parts < 1:
        raise ValueError(f'num_parts must be a positive integer, got {num_parts}')

    with open(input_file, 'r') as f:
        column_titles = f.readline().strip().split(',')
        # readline() above already consumed the header, so every remaining
        # line is a data row. Slicing with [1:] here would drop the first one.
        content = f.readlines()

    file_size = len(content)
    part_size = file_size // num_parts

    os.makedirs(output_dir, exist_ok=True)

    # The header is identical for every part, so build it once.
    header_line = ','.join(column_titles[:3] + column_titles[4:]) + '\n'

    for i in range(num_parts):
        start_index = i * part_size
        # The last part absorbs the rows left over by the integer division.
        end_index = file_size if i == num_parts - 1 else start_index + part_size

        part_filename = os.path.join(output_dir, f'test_part_{i + 1}.csv')
        with open(part_filename, 'w', newline='') as part_file:
            part_file.write(header_line)

            for line in content[start_index:end_index]:
                values = line.strip().split(',')
                # Drop the same field (position 3) that was removed from the header.
                part_file.write(','.join(values[:3] + values[4:]) + '\n')

    print(f'{num_parts} parts created successfully.')


In [6]:
# Cut the full dataset into 500 partition files inside the Airflow DAG folder.
split_file(
    input_file='../data/full_data/walmart_sales.csv',
    output_dir='../airflow-docker/dags/raw_data',
    num_parts=500,
)

500 parts created successfully.


In [7]:
def _list_csv_files(directory: str) -> list:
    """Return the names of the ``.csv`` entries of ``directory`` in sorted order."""
    # os.listdir() has no guaranteed order; sorting keeps seeded runs repeatable.
    return sorted(name for name in os.listdir(directory) if name.endswith('.csv'))


def _corrupt_frame(df: pd.DataFrame, ratio: float, rng: np.random.Generator) -> pd.DataFrame:
    """Return a copy of ``df`` with missing and invalid values injected (issues 2-6)."""
    n_rows = len(df)
    cells_to_delete = int(df.size * ratio)
    if n_rows == 0 or cells_to_delete == 0:
        # Nothing to corrupt (empty frame or ratio too small for this frame).
        return df

    # Work on object columns: NaN, text and integers can then be stored in any
    # column without relying on implicit dtype upcasting, and untouched values
    # keep their original representation when written back to CSV.
    corrupted = df.astype(object)

    # issue 2: missing data
    # Sample distinct cells and blank exactly those (row, column) positions.
    flat_indices = rng.choice(corrupted.size, cells_to_delete, replace=False)
    row_positions, col_positions = np.unravel_index(flat_indices, corrupted.shape)
    for row_position, col_position in zip(row_positions, col_positions):
        corrupted.iat[row_position, col_position] = np.nan

    gibberish = ["Monday is sunny", "no sales today", "economic crash", " ", "no work today"]
    # One value is drawn per issue and written to every selected row.
    replacements = (
        ('IsHoliday', int(rng.integers(n_rows))),   # issue 3: replace text with integer
        ('CPI', str(rng.choice(gibberish))),        # issue 4: replace int or float with text
        ('Type', str(rng.choice(gibberish))),       # issue 5: unexpected value in a column
        ('Size', int(rng.integers(-100, 0))),       # issue 6: negative number in a column
    )

    # Sampling without replacement cannot ask for more rows than exist.
    rows_per_issue = min(cells_to_delete // 2, n_rows)
    if rows_per_issue == 0:
        return corrupted

    for column_name, bad_value in replacements:
        if column_name not in corrupted.columns:
            # Assigning through .loc would otherwise create a brand-new column.
            continue
        picked = rng.choice(n_rows, rows_per_issue, replace=False)
        corrupted.loc[corrupted.index[picked], column_name] = bad_value

    return corrupted


def delete_data(
    data_directory: str,
    output_dir: str,
    ratio: float,
    seed: "int | None" = None,
    num_shuffled: int = 20,
    num_missing_column: int = 5,
) -> None:
    """Generate deliberately corrupted copies of the CSV files in ``data_directory``.

    Three families of files are written to ``output_dir`` (created if missing):

    * ``shuffled_partition<i>.csv`` - ``num_shuffled`` randomly chosen input
      files with their columns in a random order (issue 1).
    * ``partition_<i>.csv`` - one file per input CSV (inputs taken in sorted
      name order) in which ``int(df.size * ratio)`` distinct cells are blanked
      (issue 2) and, for each of the columns ``IsHoliday``, ``CPI``, ``Type``
      and ``Size`` that is present, half that many rows receive an invalid
      value (issues 3-6).
    * ``partition_short<i>.csv`` - ``num_missing_column`` randomly chosen
      input files with one random column removed (issue 7).

    Only entries whose name ends in ``.csv`` are considered as input.

    Args:
        data_directory: Directory containing the clean CSV partitions.
        output_dir: Directory that receives the corrupted files.
        ratio: Fraction of cells to blank in every partition, between 0 and 1.
        seed: Seed for the random generator. When ``None`` the generator is
            seeded from NumPy's global random state, so a preceding
            ``np.random.seed(...)`` call still makes the run repeatable.
        num_shuffled: Number of column-shuffled files to produce.
        num_missing_column: Number of files with a dropped column to produce.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]`` or ``data_directory``
            contains no ``.csv`` file.
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f'ratio must be between 0 and 1, got {ratio}')

    csv_files = _list_csv_files(data_directory)
    if not csv_files:
        raise ValueError(f'no .csv files found in {data_directory}')

    os.makedirs(output_dir, exist_ok=True)

    if seed is None:
        seed = int(np.random.randint(0, 2**31 - 1))
    rng = np.random.default_rng(seed)

    # issue 1: shuffle columns
    for i in range(num_shuffled):
        random_file = csv_files[rng.integers(len(csv_files))]
        df = pd.read_csv(os.path.join(data_directory, random_file))

        column_order = rng.permutation(len(df.columns))
        shuffled_df = df.iloc[:, column_order]
        shuffled_df.to_csv(os.path.join(output_dir, f'shuffled_partition{i}.csv'), index=False)
        print("Shuffled dataset succesfully and saved as csv")

    # issues 2-6: missing and invalid values in every partition
    for i, file in enumerate(csv_files):
        df = pd.read_csv(os.path.join(data_directory, file))
        corrupted_df = _corrupt_frame(df, ratio, rng)

        # Save
        corrupted_df.to_csv(os.path.join(output_dir, f"partition_{i}.csv"), encoding="utf-8", index=False)
        print(f"CSV file 'partition_{i}.csv' has been successfully generated")

    # issue 7: missing column
    for i in range(num_missing_column):
        random_file = csv_files[rng.integers(len(csv_files))]
        df = pd.read_csv(os.path.join(data_directory, random_file))

        random_column = df.columns[rng.integers(len(df.columns))]
        df_missing_column = df.drop(columns=random_column)
        df_missing_column.to_csv(os.path.join(output_dir, f'partition_short{i}.csv'), index=False)
        print("Dropped column succesfully and saved dataset as csv")

In [8]:
# Produce the corrupted partitions from the clean ones, blanking 1% of the cells.
delete_data(
    data_directory='../airflow-docker/dags/raw_data',
    output_dir='../airflow-docker/dags/corrupted_data',
    ratio=0.01,
)

Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
Shuffled dataset succesfully and saved as csv
CSV file 'partition_0.csv' has been successfully generated
CSV file 'partition_1