In [13]:
import nbformat as nbf
import os

# Generate one exploratory-analysis notebook per dataset file. Every generated
# notebook contains the same cells (title, imports, load, head, scatter plot,
# marginal histograms, dip test); only the title and the path handed to
# pd.read_csv differ between notebooks.

# Dataset file paths: datasets/pair0001.txt ... datasets/pair0100.txt
datasets = [f'datasets/pair{i:04d}.txt' for i in range(1, 101)]

# Directory to save the experiment notebooks
notebook_dir = 'experiment'
os.makedirs(notebook_dir, exist_ok=True)

# ---------------------------------------------------------------------------
# Cell sources shared by every generated notebook.
# They are built once, outside the loop, because they do not depend on the
# dataset. They are plain (non-f) strings, so any braces inside them are
# written to the notebook verbatim and only evaluated when that notebook runs.
# ---------------------------------------------------------------------------

# Code to import necessary libraries for generated notebook files
imports = """
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from diptest import diptest
    """

# Code to display few rows of the dataset
head_data = """
# Display few rows of the dataset
print(df.head())
    """

# Code to generate scatter plot for the datasets
scatter_plot = """
# Scatter Plot
plt.scatter(df['xa'], df['ya'])
plt.xlabel('xa')
plt.ylabel('ya')
plt.title('Scatter plot of xa vs ya')
plt.show()
    """

# Code for the side-by-side histograms of the two columns.
# Each panel is a histogram of a single column, so its x-axis is that column
# and its y-axis is the bin count (the previous template labelled the y-axes
# 'ya' and the second panel's x-axis 'xa', which did not match the plots).
bidirectional_histplot = """
# Plot the distribution of the data in both directions (x and y)
fig, axis = plt.subplots(1, 2, figsize=(16,6))

# Distribution of xa
sns.histplot(df['xa'], kde=True, ax=axis[0])
axis[0].set_xlabel('xa')
axis[0].set_ylabel('Count')
axis[0].set_title('xa distribution with KDE')

# Distribution of ya
sns.histplot(df['ya'], kde=True, ax=axis[1])
axis[1].set_xlabel('ya')
axis[1].set_ylabel('Count')
axis[1].set_title('ya distribution with KDE')

# Adjust layout to set the title and labels
plt.tight_layout()
plt.show()
    """

# Code for the dip test
dip_test = """
# Dip-Test for x and y
dip_test, p_val = diptest(np.array(df['xa']))
print(f"Dip-Test (X -> Y): {dip_test}, p-value: {p_val}")   
dip_test, p_val = diptest(np.array(df['ya']))
print(f"Dip-Test (Y -> X): {dip_test}, p-value: {p_val}")
    """

# Loop over each dataset and write its notebook
for i, dataset in enumerate(datasets):
    # Extract the name of the dataset from input file
    # (e.g., 'datasets/pair0001.txt' -> 'pair0001')
    base_filename = os.path.splitext(os.path.basename(dataset))[0]

    # Markdown heading shown at the top of the notebook
    title = f"## {base_filename}: \n"

    # Code to load the dataset; the only cell whose source depends on `dataset`.
    # NOTE(review): the path is embedded exactly as listed in `datasets`
    # (relative), while the notebook itself is saved under `notebook_dir`.
    # If the generated notebooks are executed with `notebook_dir` as working
    # directory, this relative path may not resolve - confirm the layout.
    # NOTE(review): delimiter=' ' splits on every single space; presumably the
    # files are strictly single-space separated with two columns - TODO confirm.
    load_data = f"""
# Load dataset
df = pd.read_csv('{dataset}', delimiter=' ', header=None, names=['xa', 'ya'])
    """

    # Create a new notebook and add all the cells in display order
    nb = nbf.v4.new_notebook()
    cells = [
        nbf.v4.new_markdown_cell(title),
        nbf.v4.new_code_cell(imports),
        nbf.v4.new_code_cell(load_data),
        nbf.v4.new_code_cell(head_data),
        nbf.v4.new_code_cell(scatter_plot),
        nbf.v4.new_code_cell(bidirectional_histplot),
        nbf.v4.new_code_cell(dip_test),
    ]
    nb['cells'] = cells

    # Path and format to save the name of the generated notebook
    gen_notebook = os.path.join(notebook_dir, f'{base_filename}_test.ipynb')

    # Write the notebook to file. Notebook JSON is UTF-8, so set the encoding
    # explicitly instead of relying on the platform's locale default.
    with open(gen_notebook, 'w', encoding='utf-8') as f:
        nbf.write(nb, f)

    print(f"Pair_test {i+1}/{len(datasets)}: {gen_notebook}")


Pair_test 1/100: experiment\pair0001_test.ipynb
Pair_test 2/100: experiment\pair0002_test.ipynb
Pair_test 3/100: experiment\pair0003_test.ipynb
Pair_test 4/100: experiment\pair0004_test.ipynb
Pair_test 5/100: experiment\pair0005_test.ipynb
Pair_test 6/100: experiment\pair0006_test.ipynb
Pair_test 7/100: experiment\pair0007_test.ipynb
Pair_test 8/100: experiment\pair0008_test.ipynb
Pair_test 9/100: experiment\pair0009_test.ipynb
Pair_test 10/100: experiment\pair0010_test.ipynb
Pair_test 11/100: experiment\pair0011_test.ipynb
Pair_test 12/100: experiment\pair0012_test.ipynb
Pair_test 13/100: experiment\pair0013_test.ipynb
Pair_test 14/100: experiment\pair0014_test.ipynb
Pair_test 15/100: experiment\pair0015_test.ipynb
Pair_test 16/100: experiment\pair0016_test.ipynb
Pair_test 17/100: experiment\pair0017_test.ipynb
Pair_test 18/100: experiment\pair0018_test.ipynb
Pair_test 19/100: experiment\pair0019_test.ipynb
Pair_test 20/100: experiment\pair0020_test.ipynb
Pair_test 21/100: experiment\