In [1]:
import great_expectations as gx
from great_expectations.checkpoint import Checkpoint
import pandas as pd
import os
import csv
import random
import great_expectations as gx
from great_expectations.checkpoint import Checkpoint
import shutil

# Generating test files

First, we will generate the csv files. You can skip the first two cells if you already have the 'walmart_sales.csv' file in your data folder. 

In [3]:
features = pd.read_csv("../../group_project/features.csv")
stores = pd.read_csv("../../group_project/stores.csv")
df = pd.read_csv("../../group_project/train.csv")

In [4]:
data = pd.merge(df.drop(columns=['IsHoliday']), features, on=['Store', 'Date'])
full_data = pd.merge(data, stores, on=['Store'])
output_path = '../data/full_data/walmart_sales.csv'
# full_data.to_csv(output_path, index=False)

This function generates a train and test set from the walmart_sales.csv file:

In [9]:
def split_file_train_test(input_file, train_ratio=0.8):

    with open(input_file, 'r') as f:

        column_titles = f.readline().strip().split(',')
        content = f.read()
        file_size = len(content)
        train_size = int(file_size * train_ratio)

        # Create directory to store the split files
        output_dir = '../data/full_data'
        os.makedirs(output_dir, exist_ok=True)

        # Write train and test parts into separate CSV files
        train_content = content[:train_size]
        test_content = content[train_size:]
        train_filename = os.path.join(output_dir, 'train.csv')
        test_filename = os.path.join(output_dir, 'test.csv')

        # Write train CSV file
        with open(train_filename, 'w', newline='') as train_file:
            writer = csv.writer(train_file)
            writer.writerow(column_titles)
            train_file.write(train_content)

        # Write test CSV file
        with open(test_filename, 'w', newline='') as test_file:
            writer = csv.writer(test_file)
            writer.writerow(column_titles)
            test_file.write(test_content)

        print('Train and test files created successfully.')



In [11]:
split_file_train_test('../data/full_data/walmart_sales.csv', train_ratio=0.8)

Train and test files created successfully.


We will use the train.csv file to train the model. Now, we can fragment the test file into multiple files:

In [16]:
def split_file(input_file, num_parts):
    
    with open(input_file, 'r') as f:

        column_titles = f.readline().strip().split(',')
        content = f.read()
        file_size = len(content)
        part_size = file_size // num_parts

        # Create directory to store files
        output_dir = '../data/raw_data'
        os.makedirs(output_dir, exist_ok=True)

        # Write each part into separate CSV files
        for i in range(num_parts):
            start_index = i * part_size
            end_index = start_index + part_size
            if i == num_parts - 1:  # Last part might be larger if file_size is not divisible by num_parts
                end_index = file_size
            
            part_content = content[start_index:end_index]
            part_filename = os.path.join(output_dir, f'test_part_{i + 1}.csv')
            with open(part_filename, 'wb') as part_file:
                part_file.write(','.join(column_titles).encode('utf-8') + b'\n')
                part_file.write(part_content.encode('utf-8'))

        print(f'{num_parts} parts created successfully.')


In [18]:
# chekcing number of rows in test set
test_data = pd.read_csv('../data/full_data/test.csv')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85330 entries, 0 to 85329
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         85330 non-null  int64  
 1   Dept          85329 non-null  float64
 2   Date          85329 non-null  object 
 3   Weekly_Sales  85329 non-null  float64
 4   Temperature   85329 non-null  float64
 5   Fuel_Price    85329 non-null  float64
 6   MarkDown1     32340 non-null  float64
 7   MarkDown2     18477 non-null  float64
 8   MarkDown3     27956 non-null  object 
 9   MarkDown4     20878 non-null  object 
 10  MarkDown5     32894 non-null  float64
 11  CPI           85329 non-null  float64
 12  Unemployment  85329 non-null  float64
 13  IsHoliday     85329 non-null  object 
 14  Type          85329 non-null  object 
 15  Size          85329 non-null  float64
dtypes: float64(10), int64(1), object(5)
memory usage: 10.4+ MB


In [19]:
# creating a 100 files
split_file('../data/full_data/test.csv', 100)

100 parts created successfully.


# File Data Validation

Randomly selecting a file and delete it:

In [23]:
directory = '../data/raw_data'
print(random.choice(os.listdir(directory)))

test_part_38.csv


In [24]:
print(random.choice(os.listdir(directory)))

test_part_4.csv


In [25]:
random_file = random.choice(os.listdir(directory))
print(random_file)

test_part_23.csv


In [29]:
file_path = directory + "/" + random_file
file_path

'../data/raw_data/test_part_23.csv'

In [31]:
if os.path.isfile(file_path):
    os.remove(file_path)
    print("File deleted successfully")
else:
    print("Error: %s file not found" % file_path)

File deleted successfully


Applying data validation technique inside previous loop:

In [32]:
directory = '../data/raw_data'
random_file = random.choice(os.listdir(directory))
file_path = directory + "/" + random_file

# creating a context
context = gx.get_context()
validator = context.sources.pandas_default.read_csv(file_path)

# creating two expectation and saving them to the validator
validator.expect_table_columns_to_match_ordered_list(["Store","Dept","Date","Weekly_Sales","Temperature","Fuel_Price",
                                                    "MarkDown1","MarkDown2","MarkDown3","MarkDown4","MarkDown5","CPI",
                                                    "Unemployment","IsHoliday","Type","Size"])
validator.expect_column_values_to_not_be_null("Date")
validator.save_expectation_suite()

# creating a checkpoint to 
checkpoint = context.add_or_update_checkpoint(
    name="my_test_checkpoint",
    validator=validator,
)
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

In [33]:
checkpoint_result

{
  "run_id": {
    "run_name": null,
    "run_time": "2024-03-22T16:44:45.227389+01:00"
  },
  "run_results": {
    "ValidationResultIdentifier::default/__none__/20240322T154445.227389Z/default_pandas_datasource-#ephemeral_pandas_asset": {
      "validation_result": {
        "success": true,
        "results": [
          {
            "success": true,
            "expectation_config": {
              "expectation_type": "expect_table_columns_to_match_ordered_list",
              "kwargs": {
                "column_list": [
                  "Store",
                  "Dept",
                  "Date",
                  "Weekly_Sales",
                  "Temperature",
                  "Fuel_Price",
                  "MarkDown1",
                  "MarkDown2",
                  "MarkDown3",
                  "MarkDown4",
                  "MarkDown5",
                  "CPI",
                  "Unemployment",
                  "IsHoliday",
                  "Type",
                  "

In [37]:
validation_result = checkpoint_result['run_results']
print(validation_result.keys())

dict_keys([ValidationResultIdentifier::default/__none__/20240322T154445.227389Z/default_pandas_datasource-#ephemeral_pandas_asset])


In [39]:
# Access the key directly from the dictionary
validation_key = list(validation_result.keys())[0]  # Get the first (and only) key
statistics = validation_result[validation_key]['validation_result']['statistics']

# Now you can proceed with extracting the statistics as before
evaluated_expectations = statistics['evaluated_expectations']
successful_expectations = statistics['successful_expectations']
unsuccessful_expectations = statistics['unsuccessful_expectations']
success_percent = statistics['success_percent']

# Print the summary
print(f"Total expectations evaluated: {evaluated_expectations}")
print(f"Successful expectations: {successful_expectations}")
print(f"Unsuccessful expectations: {unsuccessful_expectations}")
print(f"Success percentage: {success_percent}%")

Total expectations evaluated: 2
Successful expectations: 2
Unsuccessful expectations: 0
Success percentage: 100.0%


In [5]:
# 1. Randomly selecting the file
directory = '../data/raw_data'
random_file = random.choice(os.listdir(directory))
file_path = directory + "/" + random_file
print(file_path)

# 2. Running the data validation job
context = gx.get_context()
validator = context.sources.pandas_default.read_csv(file_path)

# creating two expectation and saving them to the validator
validator.expect_table_columns_to_match_ordered_list(["Store","Dept","Date","Weekly_Sales","Temperature","Fuel_Price",
                                                    "MarkDown1","MarkDown2","MarkDown3","MarkDown4","MarkDown5","CPI",
                                                    "Unemployment","IsHoliday","Type","Size"])
validator.expect_column_values_to_not_be_null("Date")
validator.save_expectation_suite()

checkpoint = context.add_or_update_checkpoint(
    name="my_test_checkpoint",
    validator=validator,
)
checkpoint_result = checkpoint.run()

validation_result = checkpoint_result['run_results']

# Access the key directly from the dictionary
validation_key = list(validation_result.keys())[0]
statistics = validation_result[validation_key]['validation_result']['statistics']

# extracting the statistics
evaluated_expectations = statistics['evaluated_expectations']
successful_expectations = statistics['successful_expectations']
unsuccessful_expectations = statistics['unsuccessful_expectations']
success_percent = statistics['success_percent']

validation_result = checkpoint_result['run_results'][validation_key]['validation_result']
results = validation_result['results']


# 3. Data Ingestion based on quality
good_data_directory = '../data/good_data'
bad_data_directory = '../data/bad_data'
unexpected_rows = set()
unexpected_values = set()

if success_percent == 100.0:
    shutil.move(file_path, os.path.join(good_data_directory, os.path.basename(file_path)))
elif success_percent == 0.0:
    shutil.move(file_path, os.path.join(bad_data_directory, os.path.basename(file_path)))
else:
    for result in results:
    # Check if the expectation failed
        if not result['success']:

            # Get the expectation type
            expectation_type = result['expectation_config']['expectation_type']
            column_name = result['expectation_config']['kwargs']['column']

            # Get the unexpected values and corresponding rows
            #unexpected_values = result['result']['partial_unexpected_list']
            #unexpected_rows = result['result']['partial_unexpected_index_list']
            unexpected_values.update(result['result']['partial_unexpected_list'])
            unexpected_rows.update(result['result']['partial_unexpected_index_list'])


../data/raw_data/test_part_40.csv


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
print(unexpected_rows)

set()


In [3]:
context.view_validation_result(checkpoint_result)