# Beginning Steps

## Detect Encoding Using chardet

In [None]:
import chardet
import pandas as pd

# Detect encoding
def detect_encoding(file_path, sample_size=100000):
    """Guess a file's text encoding from a sample of its leading bytes.

    Parameters
    ----------
    file_path : str or path-like
        File to inspect. It is opened in binary mode, so no decoding happens here.
    sample_size : int, optional
        Number of leading bytes passed to ``chardet.detect``. Defaults to
        100000, the value that used to be hard-coded.

    Returns
    -------
    The ``'encoding'`` entry of the dictionary returned by ``chardet.detect``.
    NOTE(review): chardet is documented to report ``None`` when it cannot
    decide (e.g. empty input) -- confirm callers tolerate that.

    Notes
    -----
    Only the first ``sample_size`` bytes are examined. Bytes beyond the sample
    never influence the result, so a file whose non-ASCII characters appear
    late can still be reported as plain ASCII; raise ``sample_size`` if that
    is a concern.
    """
    # Binary mode: chardet works on raw bytes, not decoded text.
    with open(file_path, 'rb') as f:
        raw_data = f.read(sample_size)
    # The file handle is no longer needed once the sample is in memory.
    result = chardet.detect(raw_data)
    return result['encoding']

# Absolute path of the raw retail CSV; detect_encoding() samples its leading
# bytes and returns chardet's guess for the encoding.
file_path = "/content/retail_data.csv"
detected_encoding = detect_encoding(file_path)
print(f"Detected encoding: {detected_encoding}")

# Load the CSV with the detected encoding; `data` is the full dataset that the
# sampling cells further down read from.
data = pd.read_csv(file_path, encoding=detected_encoding)
print(data.head())  # Plain-text preview of the first five rows


Detected encoding: ascii
   customer_id  age  gender income_bracket loyalty_program  membership_years  \
0            1   56   Other           High              No                 0   
1            2   69  Female         Medium              No                 2   
2            3   46  Female            Low              No                 5   
3            4   32  Female            Low              No                 0   
4            5   60  Female            Low             Yes                 7   

  churned marital_status  number_of_children education_level  ...  \
0      No       Divorced                   3      Bachelor's  ...   
1      No        Married                   2             PhD  ...   
2      No        Married                   3      Bachelor's  ...   
3      No       Divorced                   2        Master's  ...   
4     Yes       Divorced                   2      Bachelor's  ...   

  distance_to_store  holiday_season  season  weekend customer_support_calls  \


## Importing the Original CSV File with the Correct Encoding

In [None]:
from pandas import read_csv
import pandas as pd

# Re-read the same CSV with the encoding hard-coded to 'ascii', the value the
# detection cell printed. This rebinds `data`, replacing the frame loaded there.
data = pd.read_csv('/content/retail_data.csv', encoding='ascii')  # encoding fixed to 'ascii'

# Sampling to 30,000 Records

## Simple Random Sampling

In [None]:
import pandas as pd

# Target 30,000 rows; min() falls back to the full row count when the dataset
# has fewer rows than that, so sample() is never asked for more rows than exist.
sample_size = min(30000, len(data))  # at most 30,000 rows

# Draw the simple random sample; random_state=42 makes the draw repeatable
sampled_data = data.sample(n=sample_size, random_state=42)

# Display the sampled data (pandas prints a truncated text view)
print(sampled_data)

# Persist the sample without the index column; the profiling cells read this file back
sampled_data.to_csv("Simple_Random_Sample.csv", index=False)

        customer_id  age  gender income_bracket loyalty_program  \
987231       987232   60  Female         Medium             Yes   
79954         79955   63   Other            Low              No   
567130       567131   72   Other         Medium             Yes   
500891       500892   34   Other           High             Yes   
55399         55400   77   Other         Medium              No   
...             ...  ...     ...            ...             ...   
493666       493667   25   Other           High              No   
356313       356314   27    Male           High             Yes   
94438         94439   20    Male            Low             Yes   
763504       763505   58  Female            Low             Yes   
940706       940707   21    Male           High              No   

        membership_years churned marital_status  number_of_children  \
987231                 8     Yes       Divorced                   3   
79954                  1     Yes         Single      

### Statistical Verification - KS-Test & Chi-Square Test

#### KS-Test (Numeric Attributes)

In [None]:
import numpy as np
from scipy.stats import ks_2samp

# Numeric attributes of the full dataset are the ones compared.
numeric_columns = data.select_dtypes(include=np.number).columns

# Two-sample KS test per numeric column: full dataset vs. the simple random
# sample, with missing values dropped on both sides.
ks_results = {}
for col in numeric_columns:
    population_values = data[col].dropna()
    sample_values = sampled_data[col].dropna()
    stat, p_value = ks_2samp(population_values, sample_values)
    ks_results[col] = {'KS Statistic': stat, 'p-value': p_value}

# One row per column, one column per reported quantity.
ks_df = pd.DataFrame.from_dict(ks_results, orient='index')
print("Kolmogorov-Smirnov Test Results:", ks_df, sep="\n")

# Write the table (index = column names) next to the notebook.
ks_df.to_csv("SRS-KS_Test_Results.csv")


Kolmogorov-Smirnov Test Results:
                           KS Statistic   p-value
customer_id                    0.004593  0.568914
age                            0.003381  0.891954
membership_years               0.003544  0.856582
number_of_children             0.004757  0.523432
transaction_id                 0.003913  0.762287
product_id                     0.005213  0.405778
quantity                       0.001768  0.999988
unit_price                     0.006433  0.178569
discount_applied               0.004676  0.545683
transaction_hour               0.004747  0.526079
week_of_year                   0.006172  0.216210
month_of_year                  0.003848  0.780102
avg_purchase_value             0.004737  0.528916
avg_discount_used              0.006448  0.176617
online_purchases               0.003714  0.815194
in_store_purchases             0.003256  0.915980
avg_items_per_transaction      0.003233  0.920051
avg_transaction_value          0.007887  0.053110
total_returned_it

#### Chi-Square (Categorical Attributes)

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Select categorical columns
categorical_columns = data.select_dtypes(include='object').columns

chi2_results = {}

for col in categorical_columns:
    # Category frequencies in the full dataset. Their index fixes the category
    # order for BOTH rows of the contingency table.
    full_counts = data[col].value_counts()
    observed = full_counts.values
    # Sample frequencies aligned to that same index; a category absent from
    # the sample counts as 0. (Previously this row was ordered by
    # data[col].unique(), i.e. first appearance, while `observed` was ordered
    # by descending frequency, so the two rows did not describe the same
    # categories position by position.)
    expected = (
        sampled_data[col]
        .value_counts()
        .reindex(full_counts.index, fill_value=0)
        .values
    )

    # 2 x n_categories table: row 0 = full dataset, row 1 = sample.
    # Only the statistic and p-value are kept; slicing avoids binding `_`.
    chi2_stat, p_value = chi2_contingency([observed, expected])[:2]

    # Store results
    chi2_results[col] = {'Chi-Square Statistic': chi2_stat, 'p-value': p_value}

# Convert results to a DataFrame (one row per categorical column)
chi2_df = pd.DataFrame.from_dict(chi2_results, orient='index')

# Print results
print("\nChi-Square Test Results:")
print(chi2_df)

# Save results to CSV
chi2_df.to_csv("SRS-Chi_Square_Test_Results.csv", index=True)


Chi-Square Test Results:
                           Chi-Square Statistic   p-value
gender                                 0.016375  0.991846
income_bracket                         3.197016  0.202198
loyalty_program                        0.750164  0.386424
churned                                1.409705  0.235105
marital_status                         1.688061  0.429974
education_level                        4.821439  0.185349
occupation                             1.613442  0.656346
transaction_date                  498321.124167  1.000000
product_category                       9.950684  0.041267
payment_method                         4.110562  0.249770
store_location                         0.686004  0.876491
day_of_week                           11.747071  0.067855
purchase_frequency                     0.174906  0.981535
last_purchase_date                496856.492778  1.000000
preferred_store                        0.901380  0.825095
product_name                           5.65478

## 1-in-k Sampling

In [None]:
import pandas as pd

# Target number of rows in the systematic sample
sample_size = 30000

# Sampling interval K: every K-th row is taken. Clamp to 1 so a dataset with
# fewer than `sample_size` rows does not give K == 0, which would make the
# slice below raise "slice step cannot be zero".
K = max(1, len(data) // sample_size)

# Perform 1-in-K systematic sampling, starting at the first row
systematic_sampled_data = data.iloc[::K, :]

# Keep at most `sample_size` rows.
# NOTE(review): when len(data) is not an exact multiple of sample_size the
# slice yields more rows than needed and head() drops the trailing ones, so
# rows near the end of the file can never be selected -- confirm acceptable.
systematic_sampled_data = systematic_sampled_data.head(sample_size)

# Display the sampled data
print(systematic_sampled_data)

# Persist the sample without the index column
systematic_sampled_data.to_csv("Systematic_Sample.csv", index=False)

        customer_id  age  gender income_bracket loyalty_program  \
0                 1   56   Other           High              No   
33               34   39  Female           High             Yes   
66               67   67   Other           High             Yes   
99              100   51    Male            Low             Yes   
132             133   42    Male            Low             Yes   
...             ...  ...     ...            ...             ...   
989835       989836   47  Female         Medium              No   
989868       989869   79   Other         Medium             Yes   
989901       989902   60  Female            Low              No   
989934       989935   41    Male         Medium              No   
989967       989968   77    Male         Medium              No   

        membership_years churned marital_status  number_of_children  \
0                      0      No       Divorced                   3   
33                     0     Yes       Divorced      

### Statistical Verification - KS-Test & Chi-Square Test

#### KS-Test (Numeric Attributes)

In [None]:
import numpy as np
from scipy.stats import ks_2samp

# Numeric attributes of the full dataset are the ones compared.
numeric_columns = data.select_dtypes(include=np.number).columns

# Two-sample KS test per numeric column: full dataset vs. the systematic
# sample, with missing values dropped on both sides.
ks_results = {}
for col in numeric_columns:
    population_values = data[col].dropna()
    sample_values = systematic_sampled_data[col].dropna()
    stat, p_value = ks_2samp(population_values, sample_values)
    ks_results[col] = {'KS Statistic': stat, 'p-value': p_value}

# One row per column, one column per reported quantity.
ks_df = pd.DataFrame.from_dict(ks_results, orient='index')
print("Kolmogorov-Smirnov Test Results:", ks_df, sep="\n")

# Write the table (index = column names) next to the notebook.
ks_df.to_csv("Systematic-KS_Test_Results.csv")


Kolmogorov-Smirnov Test Results:
                           KS Statistic   p-value
customer_id                    0.010032  0.005650
age                            0.005207  0.407141
membership_years               0.004396  0.625130
number_of_children             0.001743  0.999992
transaction_id                 0.004697  0.539857
product_id                     0.004543  0.583141
quantity                       0.001833  0.999971
unit_price                     0.003539  0.857660
discount_applied               0.006002  0.243806
transaction_hour               0.002634  0.987236
week_of_year                   0.002502  0.993055
month_of_year                  0.005368  0.369562
avg_purchase_value             0.006464  0.174463
avg_discount_used              0.002549  0.991257
online_purchases               0.006073  0.231977
in_store_purchases             0.003881  0.771013
avg_items_per_transaction      0.005384  0.366019
avg_transaction_value          0.003836  0.783228
total_returned_it

#### Chi-Square (Categorical Attributes)

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Select categorical columns
categorical_columns = data.select_dtypes(include='object').columns

chi2_results = {}

for col in categorical_columns:
    # Category frequencies in the full dataset. Their index fixes the category
    # order for BOTH rows of the contingency table.
    full_counts = data[col].value_counts()
    observed = full_counts.values
    # Sample frequencies aligned to that same index; a category absent from
    # the sample counts as 0. (Previously this row was ordered by
    # data[col].unique(), i.e. first appearance, while `observed` was ordered
    # by descending frequency, so the two rows did not describe the same
    # categories position by position.)
    expected = (
        systematic_sampled_data[col]
        .value_counts()
        .reindex(full_counts.index, fill_value=0)
        .values
    )

    # 2 x n_categories table: row 0 = full dataset, row 1 = sample.
    # Only the statistic and p-value are kept; slicing avoids binding `_`.
    chi2_stat, p_value = chi2_contingency([observed, expected])[:2]

    # Store results
    chi2_results[col] = {'Chi-Square Statistic': chi2_stat, 'p-value': p_value}

# Convert results to a DataFrame (one row per categorical column)
chi2_df = pd.DataFrame.from_dict(chi2_results, orient='index')

# Print results
print("\nChi-Square Test Results:")
print(chi2_df)

# Save results to CSV
chi2_df.to_csv("Systematic-Chi_Square_Test_Results.csv", index=True)



Chi-Square Test Results:
                           Chi-Square Statistic   p-value
gender                                 1.721521  0.422840
income_bracket                         0.507889  0.775735
loyalty_program                        0.284072  0.594045
churned                                1.277854  0.258298
marital_status                         1.174345  0.555897
education_level                        3.384102  0.336108
occupation                             4.688463  0.196083
transaction_date                  498247.450556  1.000000
product_category                       0.254819  0.992541
payment_method                         1.507888  0.680451
store_location                         2.147803  0.542303
day_of_week                            3.720410  0.714451
purchase_frequency                     2.173643  0.537159
last_purchase_date                496901.875722  1.000000
preferred_store                        3.012489  0.389704
product_name                           5.59371

## Stratified Sampling

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Target number of rows in the stratified sample
stratified_sample_size = 30000

# Stratified sampling via train_test_split: the "train" part is the sample and
# stratify= keys the split on income_bracket. Only element [0] is kept: binding
# the large remainder frame to `_` would keep it alive in the kernel and
# override IPython's last-output variable of the same name.
stratified_sample = train_test_split(
    data,
    train_size=stratified_sample_size,
    stratify=data['income_bracket'],
    random_state=42
)[0]

# Display sampled data
print(stratified_sample.head())

# Persist the sample without the index column
stratified_sample.to_csv("Stratified_Sample.csv", index=False)

        customer_id  age  gender income_bracket loyalty_program  \
862287       862288   42   Other           High              No   
980510       980511   53  Female            Low              No   
768782       768783   66   Other            Low             Yes   
874826       874827   64    Male         Medium              No   
476042       476043   59  Female           High              No   

        membership_years churned marital_status  number_of_children  \
862287                 3     Yes       Divorced                   4   
980510                 5     Yes       Divorced                   0   
768782                 0      No       Divorced                   2   
874826                 0      No        Married                   0   
476042                 8     Yes        Married                   3   

       education_level  ... distance_to_store  holiday_season  season  \
862287      Bachelor's  ...             45.18             Yes  Spring   
980510     High School  

### Statistical Verification - KS-Test & Chi-Square Test

#### KS-Test (Numeric Attributes)

In [None]:
import numpy as np
from scipy.stats import ks_2samp

# Numeric attributes of the full dataset are the ones compared.
numeric_features = data.select_dtypes(include=np.number).columns

# Two-sample KS test per numeric attribute: full dataset vs. the stratified
# sample, with missing values dropped on both sides.
ks_test_output = {}
for feature in numeric_features:
    population_values = data[feature].dropna()
    sample_values = stratified_sample[feature].dropna()
    ks_statistic, ks_pvalue = ks_2samp(population_values, sample_values)
    ks_test_output[feature] = {'KS Statistic': ks_statistic, 'p-value': ks_pvalue}

# One row per attribute, one column per reported quantity.
ks_results_df = pd.DataFrame.from_dict(ks_test_output, orient='index')
print("Kolmogorov-Smirnov Test Results (Stratified Sampling):", ks_results_df, sep="\n")

# Write the table (index = attribute names) next to the notebook.
ks_results_df.to_csv("Stratified-KS_Test_Results.csv")


Kolmogorov-Smirnov Test Results (Stratified Sampling):
                           KS Statistic   p-value
customer_id                    0.004267  0.662244
age                            0.005384  0.366019
membership_years               0.005289  0.387682
number_of_children             0.003490  0.868694
transaction_id                 0.008864  0.020445
product_id                     0.004269  0.661669
quantity                       0.002765  0.978714
unit_price                     0.005284  0.389008
discount_applied               0.004813  0.508290
transaction_hour               0.003319  0.904284
week_of_year                   0.003264  0.914478
month_of_year                  0.002165  0.999154
avg_purchase_value             0.003671  0.826029
avg_discount_used              0.004282  0.657927
online_purchases               0.003996  0.739315
in_store_purchases             0.005470  0.346884
avg_items_per_transaction      0.006663  0.149851
avg_transaction_value          0.004972  0.46

#### Chi-Square (Categorical Attributes)

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Select categorical attributes
categorical_features = data.select_dtypes(include='object').columns

chi_square_output = {}

for feature in categorical_features:
    # Category frequencies in the full dataset. Their index fixes the category
    # order for BOTH rows of the contingency table.
    full_counts = data[feature].value_counts()
    observed = full_counts.values
    # Sample frequencies aligned to that same index; a category absent from
    # the sample counts as 0. (Previously this row was ordered by
    # data[feature].unique(), i.e. first appearance, while `observed` was
    # ordered by descending frequency, so the two rows did not describe the
    # same categories position by position.)
    expected = (
        stratified_sample[feature]
        .value_counts()
        .reindex(full_counts.index, fill_value=0)
        .values
    )

    # 2 x n_categories table: row 0 = full dataset, row 1 = sample.
    # Only the statistic and p-value are kept; slicing avoids binding `_`.
    chi_statistic, chi_pvalue = chi2_contingency([observed, expected])[:2]

    # Store results
    chi_square_output[feature] = {'Chi-Square Statistic': chi_statistic, 'p-value': chi_pvalue}

# Convert results to DataFrame (one row per categorical attribute)
chi_results_df = pd.DataFrame.from_dict(chi_square_output, orient='index')

# Print results
print("\nChi-Square Test Results (Stratified Sampling):")
print(chi_results_df)

# Save results to CSV
chi_results_df.to_csv("Stratified-Chi_Square_Test_Results.csv", index=True)


Chi-Square Test Results (Stratified Sampling):
                           Chi-Square Statistic   p-value
gender                                 1.875268  0.391553
income_bracket                         0.000026  0.999987
loyalty_program                        0.000092  0.992329
churned                                1.382817  0.239622
marital_status                         0.719212  0.697951
education_level                        3.124317  0.372852
occupation                             2.018005  0.568678
transaction_date                  498185.564722  1.000000
product_category                       0.833767  0.933865
payment_method                         7.267311  0.063849
store_location                         2.368688  0.499490
day_of_week                            3.820662  0.700931
purchase_frequency                     4.202355  0.240426
last_purchase_date                496829.970278  1.000000
preferred_store                        1.673182  0.642912
product_name            

# EDA

## Generating Profiling Reports for Each Dataset

### Full Dataset

In [None]:
# Disabled cell: the code below sits inside a triple-quoted string, so none of
# it executes; the cell only evaluates the string and echoes it as output.
# NOTE(review): if this is ever re-enabled, `del srs_report` names a variable
# this cell never creates (the report object is bound to `full_data`), and the
# inner comment says "Simple Random Sample" although the title is "Full Dataset".
'''
import pandas as pd
from ydata_profiling import ProfileReport
import gc  # Garbage collection to free memory

full_data = pd.read_csv("retail_data.csv")

# Generate Simple Random Sample Report
full_data = ProfileReport(full_data, title="Full Dataset")
full_data.to_file("full_data.html")
del srs_report
gc.collect()
'''

'\nimport pandas as pd\nfrom ydata_profiling import ProfileReport\nimport gc  # Garbage collection to free memory\n\nfull_data = pd.read_csv("retail_data.csv")\n\n# Generate Simple Random Sample Report\nfull_data = ProfileReport(full_data, title="Full Dataset")\nfull_data.to_file("full_data.html")\ndel srs_report\ngc.collect()\n'

### Simple Random Sample (S.R.S)

In [None]:
# Disabled cell: the code below sits inside a triple-quoted string, so none of
# it executes; the cell only evaluates the string and echoes it as output.
# The quoted code would profile Simple_Random_Sample.csv into SRS_Report.html.
'''
import pandas as pd
from ydata_profiling import ProfileReport
import gc  # Garbage collection to free memory

srs_sample = pd.read_csv("Simple_Random_Sample.csv")

# Generate Simple Random Sample Report
srs_report = ProfileReport(srs_sample, title="Simple Random Sample")
srs_report.to_file("SRS_Report.html")
del srs_report
gc.collect()
'''

'\nimport pandas as pd\nfrom ydata_profiling import ProfileReport\nimport gc  # Garbage collection to free memory\n\nsrs_sample = pd.read_csv("Simple_Random_Sample.csv")\n\n# Generate Simple Random Sample Report\nsrs_report = ProfileReport(srs_sample, title="Simple Random Sample")\nsrs_report.to_file("SRS_Report.html")\ndel srs_report\ngc.collect()\n'

### Systematic

In [None]:
# Disabled cell: the code below sits inside a triple-quoted string, so none of
# it executes; the cell only evaluates the string and echoes it as output.
# The quoted code would profile Systematic_Sample.csv into Systematic_Report.html.
# NOTE(review): the quoted `!pip install` line has no version pin and would run
# on every execution if the cell were re-enabled.
'''
!pip install ydata-profiling

import pandas as pd
from ydata_profiling import ProfileReport
import gc  # Garbage collection to free memory

systematic_sample = pd.read_csv("Systematic_Sample.csv")

# Generate Systematic Sample Report
systematic_report = ProfileReport(systematic_sample, title="Systematic Sample")
systematic_report.to_file("Systematic_Report.html")
del systematic_report
gc.collect()
'''

'\n!pip install ydata-profiling\n\nimport pandas as pd\nfrom ydata_profiling import ProfileReport\nimport gc  # Garbage collection to free memory\n\nsystematic_sample = pd.read_csv("Systematic_Sample.csv")\n\n# Generate Systematic Sample Report\nsystematic_report = ProfileReport(systematic_sample, title="Systematic Sample")\nsystematic_report.to_file("Systematic_Report.html")\ndel systematic_report\ngc.collect()\n'

### Stratified

In [None]:
# Disabled cell: the code below sits inside a triple-quoted string, so none of
# it executes; the cell only evaluates the string and echoes it as output.
# The quoted code would profile Stratified_Sample.csv into Stratified_Report.html.
'''
import pandas as pd
from ydata_profiling import ProfileReport
import gc  # Garbage collection to free memory

stratified_sample = pd.read_csv("Stratified_Sample.csv")

# Generate Stratified Sample Report
stratified_report = ProfileReport(stratified_sample, title="Stratified Sample")
stratified_report.to_file("Stratified_Report.html")
del stratified_report
gc.collect()
'''

'\nimport pandas as pd\nfrom ydata_profiling import ProfileReport\nimport gc  # Garbage collection to free memory\n\nstratified_sample = pd.read_csv("Stratified_Sample.csv")\n\n# Generate Stratified Sample Report\nstratified_report = ProfileReport(stratified_sample, title="Stratified Sample")\nstratified_report.to_file("Stratified_Report.html")\ndel stratified_report\ngc.collect()\n'

## Comparing Full Dataset vs. Each Sampled Dataset

### Full Dataset vs. Simple Random Sample (S.R.S)

In [None]:
import pandas as pd
from ydata_profiling import ProfileReport
import gc

# Read the original CSV and the saved simple random sample back from disk.
full_data = pd.read_csv("retail_data.csv")
srs_sample = pd.read_csv("Simple_Random_Sample.csv")

# Build both ProfileReport objects (minimal=True) in a single statement.
full_report, srs_report = (
    ProfileReport(full_data, title="Full Dataset", minimal=True),
    ProfileReport(srs_sample, title="Simple Random Sample", minimal=True),
)

# Side-by-side comparison of the two reports, written out as HTML.
comparison_srs = full_report.compare(srs_report)
comparison_srs.to_file("Full_vs_SRS_Comparison.html")

# Drop every large object created here, then force a collection.
# gc.collect() is the cell's last expression, so its return value is echoed.
del comparison_srs
del full_report, srs_report
del full_data, srs_sample
gc.collect()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/78 [00:00<?, ?it/s][A
  1%|▏         | 1/78 [00:02<03:24,  2.65s/it][A
 12%|█▏        | 9/78 [00:02<00:16,  4.23it/s][A
 15%|█▌        | 12/78 [00:12<01:21,  1.24s/it][A
 18%|█▊        | 14/78 [00:22<02:12,  2.07s/it][A
 29%|██▉       | 23/78 [00:22<00:45,  1.20it/s][A
 32%|███▏      | 25/78 [00:24<00:45,  1.16it/s][A
 35%|███▍      | 27/78 [00:40<01:51,  2.18s/it][A
 46%|████▌     | 36/78 [00:41<00:44,  1.05s/it][A
 50%|█████     | 39/78 [00:41<00:33,  1.15it/s][A
 51%|█████▏    | 40/78 [00:42<00:31,  1.22it/s][A
 53%|█████▎    | 41/78 [00:42<00:28,  1.28it/s][A
 54%|█████▍    | 42/78 [00:42<00:24,  1.45it/s][A
 55%|█████▌    | 43/78 [00:42<00:21,  1.60it/s][A
 56%|█████▋    | 44/78 [00:43<00:20,  1.67it/s][A
 58%|█████▊    | 45/78 [00:43<00:19,  1.69it/s][A
 59%|█████▉    | 46/78 [00:44<00:16,  1.97it/s][A
 60%|██████    | 47/78 [00:44<00:16,  1.87it/s][A
 62%|██████▏   | 48/78 [00:45<00:13,  2.20it/s][A
 63%|██████▎   | 49/78 [00:45<00:15,  1.8

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/78 [00:00<?, ?it/s][A
  8%|▊         | 6/78 [00:00<00:01, 49.98it/s][A
 14%|█▍        | 11/78 [00:00<00:01, 49.69it/s][A
 21%|██        | 16/78 [00:00<00:02, 25.51it/s][A
 26%|██▌       | 20/78 [00:00<00:02, 27.35it/s][A
 32%|███▏      | 25/78 [00:00<00:01, 32.47it/s][A
 37%|███▋      | 29/78 [00:01<00:02, 18.88it/s][A
 45%|████▍     | 35/78 [00:01<00:01, 25.31it/s][A
 50%|█████     | 39/78 [00:01<00:01, 27.16it/s][A
 55%|█████▌    | 43/78 [00:01<00:01, 29.12it/s][A
 63%|██████▎   | 49/78 [00:01<00:00, 35.71it/s][A
 69%|██████▉   | 54/78 [00:02<00:01, 17.97it/s][A
 74%|███████▍  | 58/78 [00:03<00:01, 10.37it/s][A
 78%|███████▊  | 61/78 [00:03<00:01, 12.03it/s][A
 85%|████████▍ | 66/78 [00:03<00:00, 15.47it/s][A
100%|██████████| 78/78 [00:03<00:00, 22.16it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

76

### Full Dataset vs. Systematic

In [None]:
import pandas as pd
from ydata_profiling import ProfileReport
import gc

# Read the original CSV and the saved systematic sample back from disk.
full_data = pd.read_csv("retail_data.csv")
systematic_sample = pd.read_csv("Systematic_Sample.csv")

# Build both ProfileReport objects (minimal=True) in a single statement.
full_report, systematic_report = (
    ProfileReport(full_data, title="Full Dataset", minimal=True),
    ProfileReport(systematic_sample, title="Systematic Sample", minimal=True),
)

# Side-by-side comparison of the two reports, written out as HTML.
comparison_systematic = full_report.compare(systematic_report)
comparison_systematic.to_file("Full_vs_Systematic_Comparison.html")

# Drop every large object created here, then force a collection.
# gc.collect() is the cell's last expression, so its return value is echoed.
del comparison_systematic
del full_report, systematic_report
del full_data, systematic_sample
gc.collect()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/78 [00:00<?, ?it/s][A
  1%|▏         | 1/78 [00:02<03:17,  2.57s/it][A
 12%|█▏        | 9/78 [00:03<00:20,  3.38it/s][A
 15%|█▌        | 12/78 [00:14<01:30,  1.37s/it][A
 17%|█▋        | 13/78 [00:23<02:38,  2.44s/it][A
 35%|███▍      | 27/78 [00:42<01:25,  1.67s/it][A
 54%|█████▍    | 42/78 [00:43<00:29,  1.23it/s][A
 58%|█████▊    | 45/78 [00:43<00:23,  1.40it/s][A
 60%|██████    | 47/78 [00:43<00:20,  1.55it/s][A
 63%|██████▎   | 49/78 [00:44<00:16,  1.72it/s][A
 65%|██████▌   | 51/78 [00:46<00:17,  1.53it/s][A
 67%|██████▋   | 52/78 [00:46<00:15,  1.68it/s][A
 68%|██████▊   | 53/78 [01:16<01:51,  4.48s/it][A
 69%|██████▉   | 54/78 [01:17<01:34,  3.94s/it][A
 71%|███████   | 55/78 [01:19<01:21,  3.54s/it][A
 72%|███████▏  | 56/78 [01:19<01:02,  2.83s/it][A
 74%|███████▍  | 58/78 [01:47<02:19,  6.99s/it][A
 76%|███████▌  | 59/78 [01:48<01:48,  5.69s/it][A
 79%|███████▉  | 62/78 [01:48<00:48,  3.06s/it][A
 81%|████████  | 63/78 [01:49<00:39,  2.6

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/78 [00:00<?, ?it/s][A
  6%|▋         | 5/78 [00:00<00:01, 43.23it/s][A
 13%|█▎        | 10/78 [00:00<00:01, 42.91it/s][A
 19%|█▉        | 15/78 [00:00<00:04, 13.59it/s][A
 24%|██▍       | 19/78 [00:01<00:03, 17.22it/s][A
 31%|███       | 24/78 [00:01<00:02, 22.62it/s][A
 36%|███▌      | 28/78 [00:01<00:03, 14.97it/s][A
 45%|████▍     | 35/78 [00:01<00:02, 21.33it/s][A
 50%|█████     | 39/78 [00:01<00:01, 21.73it/s][A
 55%|█████▌    | 43/78 [00:02<00:01, 23.17it/s][A
 59%|█████▉    | 46/78 [00:02<00:01, 23.98it/s][A
 68%|██████▊   | 53/78 [00:02<00:01, 13.64it/s][A
 74%|███████▍  | 58/78 [00:03<00:01, 10.01it/s][A
 81%|████████  | 63/78 [00:03<00:01, 12.29it/s][A
 88%|████████▊ | 69/78 [00:04<00:00, 15.99it/s][A
100%|██████████| 78/78 [00:04<00:00, 18.29it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

79

### Full Dataset vs. Stratified

In [None]:
import pandas as pd
from ydata_profiling import ProfileReport
import gc

# Read the original CSV and the saved stratified sample back from disk.
full_data = pd.read_csv("retail_data.csv")
stratified_sample = pd.read_csv("Stratified_Sample.csv")

# Build both ProfileReport objects (minimal=True) in a single statement.
full_report, stratified_report = (
    ProfileReport(full_data, title="Full Dataset", minimal=True),
    ProfileReport(stratified_sample, title="Stratified Sample", minimal=True),
)

# Side-by-side comparison of the two reports, written out as HTML.
comparison_stratified = full_report.compare(stratified_report)
comparison_stratified.to_file("Full_vs_Stratified_Comparison.html")

# Drop every large object created here, then force a collection.
# gc.collect() is the cell's last expression, so its return value is echoed.
del comparison_stratified
del full_report, stratified_report
del full_data, stratified_sample
gc.collect()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/78 [00:00<?, ?it/s][A
  1%|▏         | 1/78 [00:03<05:02,  3.93s/it][A
  3%|▎         | 2/78 [00:04<02:10,  1.72s/it][A
  9%|▉         | 7/78 [00:04<00:26,  2.69it/s][A
 12%|█▏        | 9/78 [00:05<00:25,  2.72it/s][A
 13%|█▎        | 10/78 [00:05<00:23,  2.96it/s][A
 14%|█▍        | 11/78 [00:05<00:21,  3.10it/s][A
 15%|█▌        | 12/78 [00:16<03:02,  2.77s/it][A
 17%|█▋        | 13/78 [00:25<04:42,  4.34s/it][A
 35%|███▍      | 27/78 [00:45<01:36,  1.89s/it][A
 53%|█████▎    | 41/78 [00:45<00:32,  1.13it/s][A
 55%|█████▌    | 43/78 [00:46<00:29,  1.20it/s][A
 58%|█████▊    | 45/78 [00:46<00:24,  1.34it/s][A
 60%|██████    | 47/78 [00:46<00:20,  1.54it/s][A
 63%|██████▎   | 49/78 [00:47<00:16,  1.78it/s][A
 64%|██████▍   | 50/78 [00:48<00:16,  1.68it/s][A
 68%|██████▊   | 53/78 [01:22<01:44,  4.18s/it][A
 69%|██████▉   | 54/78 [01:23<01:27,  3.64s/it][A
 73%|███████▎  | 57/78 [01:24<00:52,  2.49s/it][A
 74%|███████▍  | 58/78 [01:53<02:08,  6.42s

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/78 [00:00<?, ?it/s][A
  1%|▏         | 1/78 [00:00<00:07,  9.96it/s][A
  8%|▊         | 6/78 [00:00<00:02, 28.18it/s][A
 13%|█▎        | 10/78 [00:00<00:02, 26.77it/s][A
 17%|█▋        | 13/78 [00:01<00:07,  9.19it/s][A
 28%|██▊       | 22/78 [00:01<00:02, 19.48it/s][A
 33%|███▎      | 26/78 [00:01<00:02, 21.59it/s][A
 38%|███▊      | 30/78 [00:02<00:04, 11.68it/s][A
 46%|████▌     | 36/78 [00:02<00:02, 16.58it/s][A
 51%|█████▏    | 40/78 [00:02<00:01, 19.44it/s][A
 56%|█████▋    | 44/78 [00:02<00:01, 20.38it/s][A
 62%|██████▏   | 48/78 [00:02<00:01, 23.05it/s][A
 68%|██████▊   | 53/78 [00:03<00:01, 15.08it/s][A
 74%|███████▍  | 58/78 [00:04<00:02,  8.35it/s][A
 81%|████████  | 63/78 [00:04<00:01, 10.97it/s][A
100%|██████████| 78/78 [00:04<00:00, 16.37it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

117