# Research oriented coding - Modularization

## We want to create "modules" = functions that we can reuse.
### Let's look at the following simple example

In [None]:
import pandas as pd

# Load the two raw tables.
data1 = pd.read_csv("data1.csv")
data2 = pd.read_csv("data2.csv")

# Clean the first table: drop rows with missing values, drop duplicated rows,
# rebuild the index from scratch, and rename one column.
data1 = data1.dropna()
data1 = data1.drop_duplicates()
data1 = data1.reset_index(drop=True)
data1 = data1.rename(columns={"old_name": "new_name"})

# Exactly the same four steps, written out again for the second table.
data2 = data2.dropna()
data2 = data2.drop_duplicates()
data2 = data2.reset_index(drop=True)
data2 = data2.rename(columns={"old_name": "new_name"})


The processing applied to both datasets is the same, so we can create a function that does this "processing".

In [None]:
import pandas as pd

# Load the two raw tables.
data1 = pd.read_csv("data1.csv")
data2 = pd.read_csv("data2.csv")

# Column names kept in variables so they are written only once.
my_old_name = "old_name"
my_new_name = "new_name"


def process_data(data, new_name, old_name):
    """Clean a DataFrame and rename one of its columns.

    Rows with missing values and duplicated rows are removed, the index is
    rebuilt, and the column `old_name` is renamed to `new_name`.
    A new DataFrame is returned; the input one is left as it was.
    """
    column_map = {old_name: new_name}
    cleaned = data.dropna().drop_duplicates()
    return cleaned.reset_index(drop=True).rename(columns=column_map)


# Both tables now go through the same function instead of repeated code.
data1 = process_data(data1, my_old_name, my_new_name)
data2 = process_data(data2, my_old_name, my_new_name)


OK, so now we have our function; let's use it.

In [None]:
import pandas as pd

# Load the two raw tables.
data1 = pd.read_csv("data1.csv")
data2 = pd.read_csv("data2.csv")

# Column names kept in variables so they are written only once.
my_old_name = "old_name"
my_new_name = "new_name"

# One call per table; the processing steps live inside process_data.
data1 = process_data(data1, my_old_name, my_new_name)
data2 = process_data(data2, my_old_name, my_new_name)


Clearly, the code now looks much cleaner. 
Additionally, if we want to change the processing, the change will apply to all cases at once.

# Positional and keyword arguments


Our function receives 3 arguments: the data, the old name and the new name.
Until now, we have passed the arguments by position, i.e., relying on the order in which they are defined in the function. These are called positional arguments.

In [None]:
my_old_name = "old_name"
my_new_name = "new_name"

# Positional call: each value is matched to a parameter only by its position.
data1 = process_data(data1, my_new_name, my_old_name)
data2 = process_data(data2, my_new_name, my_old_name)

However, we can also give the name of each argument to the function as follows:

In [None]:
my_old_name = "old_name"
my_new_name = "new_name"

# Keyword call: each value is bound to the parameter named left of the "=".
data1 = process_data(data=data1, old_name=my_old_name, new_name=my_new_name)
data2 = process_data(data=data2, old_name=my_old_name, new_name=my_new_name)

In [None]:
my_old_name = "old_name"
my_new_name = "new_name"

# With keyword arguments, the order in which they are written is irrelevant:
# new_name and old_name are swapped compared with the previous cell.
data1 = process_data(data=data1, new_name=my_new_name, old_name=my_old_name)
data2 = process_data(data=data2, new_name=my_new_name, old_name=my_old_name)

As you can see, the syntax requires more typing and can become too long. But there is a key advantage: we can be sure that each parameter of the function receives the value we intended.
For this reason, keyword arguments are preferred to avoid confusion. For example, did you notice that in the first example we passed the "old" and "new" names in the wrong order? Maybe you did, but I can assure you that this happens quite often.

# Typing
## Type hints document in advance what kind of data each argument and the return value are expected to be. Python does not enforce them at runtime.

In [None]:
def process_data(data: pd.DataFrame,
                 old_name: str,
                 new_name: str) -> pd.DataFrame:
    """Clean a DataFrame and rename one of its columns.

    Rows with missing values and duplicated rows are removed, the index is
    rebuilt, and the column `old_name` is renamed to `new_name`.
    A new DataFrame is returned; the input one is left as it was.
    """
    column_map = {old_name: new_name}
    cleaned = data.dropna().drop_duplicates()
    return cleaned.reset_index(drop=True).rename(columns=column_map)


# Defaults

In [None]:
def process_data(data: pd.DataFrame,
                 old_name: str,
                 new_name: str = "cool_name") -> pd.DataFrame:
    """Clean a DataFrame and rename one of its columns.

    Rows with missing values and duplicated rows are removed, the index is
    rebuilt, and the column `old_name` is renamed to `new_name`.
    When `new_name` is not given, the column is renamed to "cool_name".
    """
    column_map = {old_name: new_name}
    cleaned = data.dropna().drop_duplicates()
    return cleaned.reset_index(drop=True).rename(columns=column_map)


# Typing vs argument validation

In [None]:
def process_data(data: pd.DataFrame,
                 old_name: str,
                 new_name: str = "cool_name") -> pd.DataFrame:
    """Clean a DataFrame and rename one column, validating the arguments.

    Raises TypeError when `old_name` or `new_name` is not a string, and
    AssertionError when `old_name` is "Patient_ID" or when the cleaned
    DataFrame does not have exactly 10 rows.
    """
    # Type hints are not checked at runtime, so the types are verified here.
    if not isinstance(old_name, str):
        message = f"Invalid value for 'old_name': {type(old_name)} (expected str)"
        raise TypeError(message)

    if not isinstance(new_name, str):
        message = f"Invalid value for 'new_name': {type(new_name)} (expected str)"
        raise TypeError(message)

    # Assert arguments: this column must keep its name.
    assert old_name != "Patient_ID", "Not allowed to rename 'Patient_ID' column"

    cleaned = data.dropna().drop_duplicates().reset_index(drop=True)
    renamed = cleaned.rename(columns={old_name: new_name})

    # Assert functionality: the cleaned table is expected to have 10 rows.
    assert len(renamed) == 10

    return renamed


# Docstring

In [None]:
def process_data(data: pd.DataFrame,
                 old_name: str,
                 new_name: str = "cool_name") -> pd.DataFrame:
    """
    Clean a pandas DataFrame and rename one of its columns.

    Parameters
    ----------
    data : pd.DataFrame
        Table to clean. A new DataFrame is returned; `data` is not changed.
    old_name : str
        Current name of the column to rename.
    new_name : str, optional
        Name the column gets after renaming. Default is "cool_name".

    Returns
    -------
    pd.DataFrame
        The cleaned table with the column renamed.

    Raises
    ------
    TypeError
        If `old_name` or `new_name` is not a string.
    AssertionError
        If `old_name` is "Patient_ID".
        If the cleaned table does not have exactly 10 rows.

    Notes
    -----
    - Rows with missing values and duplicated rows are dropped.
    - The index is rebuilt after the rows are dropped.
    """
    # Type hints are not checked at runtime, so the types are verified here.
    if not isinstance(old_name, str):
        message = f"Invalid value for 'old_name': {type(old_name)} (expected str)"
        raise TypeError(message)

    if not isinstance(new_name, str):
        message = f"Invalid value for 'new_name': {type(new_name)} (expected str)"
        raise TypeError(message)

    # Assert arguments: this column must keep its name.
    assert old_name != "Patient_ID", "Not allowed to rename 'Patient_ID' column"

    cleaned = data.dropna().drop_duplicates().reset_index(drop=True)
    renamed = cleaned.rename(columns={old_name: new_name})

    # Assert functionality: the cleaned table is expected to have 10 rows.
    assert len(renamed) == 10

    return renamed



# Referencing

In [None]:
raw_data = [1, 2, 3]  # create a "raw data"
# Plain assignment does NOT copy the list: both names refer to the same object.
# (Named processed_data so it does not overwrite the process_data function.)
processed_data = raw_data
processed_data.append(4)  # do some processing (raw_data changes too!)

print(raw_data)
print(processed_data)

[1, 2, 3, 4]
[1, 2, 3, 4]


In [None]:
raw_data = [1, 2, 3]  # create a "raw data"
# .copy() builds a new, independent list, so raw_data is left untouched.
# (Named processed_data so it does not overwrite the process_data function.)
processed_data = raw_data.copy()
processed_data.append(4)  # do some processing

print(raw_data)
print(processed_data)

[1, 2, 3]
[1, 2, 3, 4]


### In a slightly more complex scenario, where we apply several processing steps to the data, the conflicts become clear

In [None]:
raw_data = [1, 2, 3]  # create a "raw data"
# Plain assignment does NOT copy: all three names refer to the same list.
# (Named processed_data* so they do not overwrite the process_data function.)
processed_data = raw_data
processed_data.append(4)  # do some processing

processed_data2 = raw_data  # meant as a different processing, same object
processed_data2.append(5)  # do some processing

print(raw_data)
print(processed_data)
print(processed_data2)


[1, 2, 3, 4, 5]
[1, 2, 3, 4, 5]
[1, 2, 3, 4, 5]


In [None]:
raw_data = [1, 2, 3]  # create a "raw data"
# Each .copy() gives an independent list, so the two processings do not
# interfere with each other or with raw_data.
# (Named processed_data* so they do not overwrite the process_data function.)
processed_data = raw_data.copy()
processed_data.append(4)  # do some processing

processed_data2 = raw_data.copy()  # create for a different processing
processed_data2.append(5)  # do some processing

print(raw_data)
print(processed_data)
print(processed_data2)


[1, 2, 3]
[1, 2, 3, 4]
[1, 2, 3, 5]


### But what happens with functions?
#### Let's create a simple function and see how the data is affected

In [1]:
def process_data_adding_values(data: list,
                               value: int) -> list:
    """
    Append a value to the data, modifying the input list in place.

    Parameters
    ----------
    data : list
        The input data to process. It is NOT copied: the list passed by
        the caller is the one that gets modified.
    value : int
        The value to add to the data.

    Returns
    -------
    list
        The very same list object that was passed in as `data`, now with
        `value` appended at the end.
    """
    data.append(value)
    return data

raw_data = [1, 2, 3]  # create a "raw data"

# The function receives a reference to raw_data, not a copy of it.
# (Named processed_data so it does not overwrite the process_data function.)
processed_data = process_data_adding_values(raw_data, 4)

print(raw_data)
print(processed_data)


[1, 2, 3, 4]
[1, 2, 3, 4]


**So functions also work by reference!** 
In a way, passing references is quite useful for functions: it allows the program *not* to copy the original data on each call. This matters when we are working with GBs of data, because otherwise we would be duplicating the RAM needed.
However, this is not always intuitive, and sometimes we do want to create a new instance of the data.

## Let's check if there is a difference between positional and keyword arguments on this

In [2]:
raw_data = [1, 2, 3]  # create a "raw data"

# Same call as before, now with keyword arguments.
# (Named processed_data so it does not overwrite the process_data function.)
processed_data = process_data_adding_values(data=raw_data, value=4)

print(raw_data)
print(processed_data)


[1, 2, 3, 4]
[1, 2, 3, 4]


As expected, the function works the same, no matter how the arguments are passed to it.

# Random seed and random state

In [None]:
import random

import matplotlib
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Set random seeds for reproducibility
SEED = 23

# 1. Python built-in random
random.seed(SEED)

# 2. NumPy
np.random.seed(SEED)

# The example data is drawn AFTER seeding NumPy; drawing it before the seed
# is set would give different X and y on every run.
X = np.random.rand(100, 10)  # Example feature set
y = np.random.randint(0, 2, size=100)  # Example target variable

# 3. Cross-validation objects
# For train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# 4. Models
# For models with randomness
model = RandomForestClassifier(random_state=SEED)

# Ensure dir

One of the worst things that can happen is that after running our experiment (which can take months!), the results are
not saved because the directory where we are trying to save the data does not exist.


We can create a function that ensures the directory exists :)

For this, it is usually better to use relative paths! Absolute paths can present some problems (like permissions or differences
between operating systems).

It is good practice to "ensure" the output directory **at the beginning of the experiment**, so that if the code fails, it fails before running the heavy computational part. 

In [None]:
import os

def ensure_dir(file_path: str) -> None:
    """
    Ensures the directory for the given file path exists.

    Only the directory part of `file_path` is created (including any missing
    parent directories); the file itself is not touched. A path without a
    directory part, such as "results.csv", refers to the current directory,
    so there is nothing to create.

    Args:
        file_path (str): The file path whose directory should be created.

    Raises:
        OSError: If the directory cannot be created, e.g. because of missing
            permissions or because a regular file already has that name.
    """
    dir_path = os.path.dirname(file_path)
    # os.makedirs("") raises, so skip paths that have no directory part.
    if dir_path:
        # exist_ok=True makes an already existing directory a no-op, so no
        # separate existence check is needed beforehand.
        os.makedirs(dir_path, exist_ok=True)

# Emergency dir

If creating the directory fails, we can handle the exception and fall back to an "emergency" path to save our results, and then figure out what went wrong (but
with our results safely stored!)


In [None]:
import sys
from pathlib import Path  # Better path handling

def ensure_dir(target_path: str, emergency_base: str = "emergency_dir") -> str:
    """
    Ensure a directory exists at `target_path`. If creation fails,
    fall back to an emergency directory in the current working directory.

    Args:
        target_path (str): Desired absolute/relative directory path.
        emergency_base (str): Name (or relative path) of the emergency
            directory, created under the current working directory
            (default: "emergency_dir").

    Returns:
        str: Absolute path to the ensured directory (either target or emergency).

    Raises:
        OSError: If the target directory cannot be created and the emergency
            directory cannot be created either.
    """
    target = Path(target_path).absolute()  # Convert to absolute path

    try:
        # Attempt to create the target directory (and any missing parents)
        target.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # PermissionError, FileExistsError, NotADirectoryError, ... are all
        # subclasses of OSError, so this single clause covers them.
        emergency_dir = Path.cwd() / emergency_base
        print(
            f"Failed to create directory at '{target}': {e}\n"
            f"Using emergency directory: '{emergency_dir}'",
            file=sys.stderr
        )
        # parents=True so that a nested emergency_base such as "a/b" works.
        # This call can still fail (e.g. read-only working directory); in
        # that case the error is raised to the caller.
        emergency_dir.mkdir(parents=True, exist_ok=True)
        return str(emergency_dir)

    return str(target)

# Example usage
# If the target cannot be created, the emergency directory is returned instead,
# so the message reports the directory in use rather than claiming it was
# created.
results_save_dir = ensure_dir("./output/experiment_1")  # Might fail (permissions)
print(f"Using directory: {results_save_dir}")

Using directory: /home/nnieto/Nico/Cursos/FZJ - Seminars/Fundamentals_of_Unix_Terminal_and_Programming/4 - Research oriented coding/modularization/output/experiment_1
