In [1]:
import pandas as pd
import numpy as np
import cluster_maker as cm


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Explanation of the function define_dataframe_structure

In [None]:

def define_dataframe_structure(column_specs):
    """
    Build a seed DataFrame from a list of column specifications.

    Args:
    - column_specs: a list of dictionaries, one per column, with the keys:
        - name: the name of the column (required).
        - reps: the representative points for the column. Any finite iterable is
          accepted (list, tuple, range, NumPy array, ...); a missing key or a
          value of None is treated as an empty list.
    Returns:
    - A DataFrame with the following structure.
        - Each column is named after the 'name' value of its specification.
        - The number of rows equals the length of the longest 'reps' sequence.
        - Each column holds its representative points; columns with fewer points
          than the longest one are padded at the end with NaN.
    Raises:
    - KeyError: if a specification has no 'name' key.
    """
    # Copy every 'reps' into its own list: this makes the padding below work for
    # tuples, ranges and arrays, and guarantees the caller's object is not touched.
    columns = []
    for spec in column_specs:
        reps = spec.get('reps')
        columns.append((spec['name'], [] if reps is None else list(reps)))

    # The longest column decides the number of rows (0 when there are no columns).
    max_length = max((len(reps) for _, reps in columns), default=0)

    # Pad shorter columns with NaN so all columns have the same length.
    data = {
        name: reps + [np.nan] * (max_length - len(reps))
        for name, reps in columns
    }

    return pd.DataFrame(data)


The `define_dataframe_structure` function is used to define the structure of a DataFrame based on the column specifications provided. It takes a list of dictionaries as input, where each dictionary represents a column in the DataFrame. Each dictionary contains the keys 'name' and 'reps'. 

The function creates a DataFrame with the following structure:
- The column names in the DataFrame correspond to the values associated with the 'name' key in each dictionary.
- The number of rows in the DataFrame is equal to the length of the list of representative points in the dictionary with the largest number of representative points.
- The values in each column are the representative points in the list associated with the 'reps' key in each dictionary. If the list of representative points is shorter than the list with the largest number of representative points, the remaining values in the column are filled with NaN.

The function iterates over the column specifications, finds the maximum length of representative points, and extends the numerical columns with NaN values to match the maximum length. Finally, it returns the DataFrame with the defined structure.

In [None]:

def simulate_data(seed_df, n_points=100, col_specs=None, random_state=None):
    """
    Generate noisy copies of every representative point in a seed DataFrame.

    args:
        - seed_df: a DataFrame whose rows are representative points and whose columns
          are features (e.g. the output of define_dataframe_structure). A NaN in a
          representative propagates to the values simulated from it.
        - n_points: the number of simulated points to generate for each row of seed_df.
        - col_specs: a dictionary keyed by column name. Each value is a dictionary with
          the optional keys:
            - 'distribution': 'normal' (default) or 'uniform'.
            - 'variance': the noise parameter (default 1.0). For 'normal' the noise has
              mean 0 and standard deviation sqrt(variance). For 'uniform' the noise is
              drawn between -variance and variance, i.e. the value is used as the
              half-width of the interval, not as a statistical variance.
          Every column of seed_df must have an entry.
        - random_state: if not None, it is passed to np.random.seed before sampling.
          Note that this reseeds NumPy's global generator.
    return:
        - A DataFrame with len(seed_df) * n_points rows and the same columns as seed_df,
          where each row is a simulated point. The columns are kept even when no
          point is generated (empty seed_df or n_points == 0).
    raises:
        - ValueError: if a column of seed_df has no entry in col_specs, or if an
          entry names an unsupported distribution. The check runs once, before
          any sampling.
    """
    if random_state is not None:
        np.random.seed(random_state)

    columns = list(seed_df.columns)

    # Resolve and validate the noise settings once per column instead of once
    # per simulated value.
    noise = []
    for col in columns:
        if not col_specs or col not in col_specs:
            raise ValueError(f"Column {col} has no specifications in col_specs.")
        dist = col_specs[col].get('distribution', 'normal')
        variance = col_specs[col].get('variance', 1.0)
        if dist not in ('normal', 'uniform'):
            raise ValueError(f"Unsupported distribution: {dist}")
        noise.append((col, dist, variance))

    simulated_data = []
    for _, representative in seed_df.iterrows():
        for _ in range(n_points):
            simulated_point = {}
            # Values are drawn column by column for each point, so the order of
            # the random draws is fixed for a given seed.
            for col, dist, variance in noise:
                if dist == 'normal':
                    offset = np.random.normal(0, np.sqrt(variance))
                else:
                    offset = np.random.uniform(-variance, variance)
                simulated_point[col] = representative[col] + offset
            simulated_data.append(simulated_point)

    # Passing the columns explicitly keeps the schema when no rows were generated.
    return pd.DataFrame(simulated_data, columns=columns)

The `simulate_data` function is used to generate simulated data based on a seed DataFrame. 

Here's a brief explanation of the function:

- `seed_df`: The seed DataFrame contains the structure of the data to be simulated. Each row represents a representative point, and each column represents a feature.
- `n_points`: The number of simulated points to generate for each representative point in the seed DataFrame.
- `col_specs`: A dictionary that specifies the distribution of the data for each column. If not provided, an error will be raised.
- `random_state`: An integer that specifies the random seed for reproducibility. If not provided, the results will not be reproducible.

The function iterates over each representative point in the seed DataFrame and generates `n_points` simulated points for each representative. For each simulated point, it perturbs every feature of the representative according to that column's entry in `col_specs`: 'normal' adds Gaussian noise with mean 0 and the given variance (default 1.0), while 'uniform' adds noise drawn uniformly between `-variance` and `variance`. If a column has no entry in `col_specs`, or its entry names an unsupported distribution, a `ValueError` is raised.

The function returns a DataFrame with the simulated data, where each row represents a simulated point and each column represents a feature.

In [None]:

def export_formatted(data, filename):
    """
    Export a DataFrame, preceded by a per-column summary, to a text file.

    The report is written to "data/<filename>", relative to the current working
    directory. It contains the number of rows and columns, a summary table
    (unique values, percentage of missing values, percentage of zeros per
    column) and finally the full DataFrame.

    Parameters:
        data (pd.DataFrame): The DataFrame to export.
        filename (str): Name of the output text file inside the "data" directory.

    Returns:
        None. Any error raised while writing the file (for example a missing
        "data" directory) is caught and reported with print instead of being
        propagated.
    """
    def missing_values(data):
        """
        Calculate the percentage of missing values and zeros in each column of a given DataFrame.

        Parameters:
        data (DataFrame): The input DataFrame.

        Returns:
        DataFrame: One row per input column with the columns 'variable',
        'Unique values', 'Nan %' and 'zeros %', sorted by 'Nan %' in
        descending order. The frame is empty (but keeps these four columns)
        when the input has no columns.
        """
        records = []
        for col in list(data):
            series = data[col]
            records.append({
                'variable': col,
                # unique() keeps NaN as one distinct value.
                'Unique values': len(series.unique()),
                # mean() of a boolean mask is the fraction of True values.
                'Nan %': round(series.isna().mean() * 100, 1),
                'zeros %': round((series == 0).mean() * 100, 1),
            })
        # Build the table in one go; the explicit column list keeps the schema
        # (and makes the sort below valid) even when there are no records.
        summary = pd.DataFrame(
            records, columns=['variable', 'Unique values', 'Nan %', 'zeros %']
        )
        # A stable sort keeps tied columns in their original order.
        return summary.sort_values('Nan %', ascending=False, kind='mergesort')

    df_info = missing_values(data)
    try:
        with open("data/" + filename, 'w') as file:
            file.write("Data information:\n\n")
            file.write(f"Number of rows: {data.shape[0]}\n")
            file.write(f"Number of columns: {data.shape[1]}\n\n ")

            df_info.to_string(file)
            file.write("\n\nData:\n\n")
            data.to_string(file)
        print(f"Data successfully exported to {filename}")
    except Exception as e:
        # Best-effort export: report the failure instead of stopping the caller.
        print(f"Error exporting data to formatted text file: {e}")



The `export_formatted` function is used to export a DataFrame to a formatted text file. 

Here's a brief explanation of the function:

- `data`: The input DataFrame that needs to be exported.
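- `filename`: The name of the output text file. The file is written inside the `data/` directory (relative to the current working directory), so that directory must already exist.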

The function also includes a nested function called `missing_values`, which calculates the percentage of missing values and zeros in each column of the input DataFrame.

The `export_formatted` function first calls the `missing_values` function to generate a DataFrame containing information about the columns, such as the number of unique values, the percentage of missing values, and the percentage of zeros.

Then, it opens the output text file and writes the data information, including the number of rows and columns, followed by the information from the `missing_values` DataFrame. Finally, it writes the actual data from the input DataFrame.

If the export is successful, it prints a message indicating the successful export. If there is an error, it prints an error message with the specific exception encountered.

In [None]:

def non_globular_cluster(seed_df, n_points=100, col_specs=None, random_state=None, cluster_params=None):
    """
    Simulates non-globular clusters based on the seed DataFrame.

    Points are first generated around each representative with column-specific
    noise, then the columns listed in cluster_params are passed through a
    transformation that distorts the shape of the cluster.

    Args:
    - seed_df: a DataFrame whose rows are representative points and whose columns are
      features (e.g. the output of define_dataframe_structure). A NaN in a representative
      propagates to the values simulated from it.
    - n_points: the number of simulated points to generate for each row of seed_df.
    - col_specs: a dictionary keyed by column name. Each value is a dictionary with the
      optional keys 'distribution' ('normal', the default, or 'uniform') and 'variance'
      (default 1.0). For 'normal' the noise has standard deviation sqrt(variance); for
      'uniform' the noise is drawn between -variance and variance. Every column of
      seed_df must have an entry.
    - random_state: if not None, it is passed to np.random.seed before sampling. Note
      that this reseeds NumPy's global generator.
    - cluster_params: a dictionary keyed by column name (required). Each value is a
      dictionary with the optional keys:
        - 'fun': 'exp', 'log', 'sqrt', 'linear', or a callable taking the simulated value.
          When the key is missing the column is left unchanged.
        - 'slope' (default 1) and 'intercept' (default 0): only used by 'linear'.
      Columns that are not in seed_df are ignored.
    Returns:
    - A DataFrame with len(seed_df) * n_points rows and the same columns as seed_df, where
      each row is a simulated point. The columns are kept even when no point is generated.
    Raises:
    - ValueError: if cluster_params is None, if a column of seed_df has no entry in
      col_specs, or if a distribution or transformation name is not supported.
    """
    def apply_fun(x, fun, slope=1, intercept=0):
        """
        Apply a specified mathematical function to the input x.

        args:
            - x: The input value.
            - fun: The transformation to apply: None (identity), a callable, or one of
                the names 'exp', 'log', 'sqrt', 'linear'.
            - slope, intercept: coefficients used by 'linear' (slope*x + intercept).

        return:
            - The result of applying the specified function to x.
        """
        if fun is None:
            return x
        if callable(fun):
            return fun(x)
        if fun == 'exp':
            return np.exp(x)
        elif fun == 'log':
            return np.log(x)
        elif fun == 'sqrt':
            return np.sqrt(x)
        elif fun == 'linear':
            return slope*x + intercept
        else:
            raise ValueError(f"Unsupported function: {fun}")

    if random_state is not None:
        np.random.seed(random_state)

    if cluster_params is None:
        raise ValueError("Cluster parameters are required for simulating non-globular clusters.")

    columns = list(seed_df.columns)

    # Resolve and validate the noise settings once per column instead of once
    # per simulated value.
    noise = []
    for col in columns:
        if not col_specs or col not in col_specs:
            raise ValueError(f"Column {col} has no specifications in col_specs.")
        dist = col_specs[col].get('distribution', 'normal')
        variance = col_specs[col].get('variance', 1.0)
        if dist not in ('normal', 'uniform'):
            raise ValueError(f"Unsupported distribution: {dist}")
        noise.append((col, dist, variance))

    simulated_data = []

    for _, representative in seed_df.iterrows():
        for _ in range(n_points):
            simulated_point = {}
            for col, dist, variance in noise:
                if dist == 'normal':
                    offset = np.random.normal(0, np.sqrt(variance))
                else:
                    offset = np.random.uniform(-variance, variance)
                simulated_point[col] = representative[col] + offset
            # Apply non-globular transformation
            for col, params in cluster_params.items():
                if col in simulated_point:
                    simulated_point[col] = apply_fun(
                        simulated_point[col],
                        params.get('fun'),
                        params.get('slope', 1),
                        params.get('intercept', 0),
                    )
            simulated_data.append(simulated_point)

    # Passing the columns explicitly keeps the schema when no rows were generated.
    return pd.DataFrame(simulated_data, columns=columns)

The `non_globular_cluster` function is used to simulate non-globular clusters based on a seed DataFrame. Here's a brief explanation of the function:

- `seed_df`: The seed DataFrame contains the structure of the data to be simulated. Each row represents a representative point, and each column represents a feature.
- `n_points`: The number of simulated points to generate for each representative point in the seed DataFrame.
- `col_specs`: A dictionary that specifies the distribution of the data for each column. If not provided, an error will be raised.
- `random_state`: An integer that specifies the random seed for reproducibility. If not provided, the results will not be reproducible.
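- `cluster_params`: A dictionary that maps a column name to a dictionary of transformation settings for that column; its `'fun'` entry names the transformation to apply (for example `{'A': {'fun': 'exp'}}`). It is required: passing `None` raises a `ValueError`.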

The function first checks if the `cluster_params` dictionary is provided. If not, it raises a `ValueError` indicating that cluster parameters are required.

Then, it initializes an empty list called `simulated_data` to store the simulated points.

The function iterates over each representative point in the seed DataFrame and generates `n_points` simulated points for each representative. For each simulated point, it applies column-specific specifications (if provided) to generate the values for each feature. The supported distributions are 'normal' and 'uniform'. If a column has no specifications in `col_specs`, an error will be raised.

After generating the simulated point, the function applies a non-globular transformation to every column listed in `cluster_params` that is also present in the simulated point. Each key of `cluster_params` is a column name, and its value is a dictionary whose `'fun'` entry names the transformation. The supported transformation names are 'exp', 'log', 'sqrt', and 'linear'.

Finally, the function appends the simulated point to the `simulated_data` list.

The function returns a DataFrame with the simulated non-globular clusters, where each row represents a simulated point and each column represents a feature.

In [18]:
# Collect the name of every callable attribute exposed by the cluster_maker module.
functions_in_cm = list(filter(lambda attr: callable(getattr(cm, attr)), dir(cm)))
print(functions_in_cm)


['define_dataframe_structure', 'export_formatted', 'export_to_csv', 'non_globular_cluster', 'simulate_data']


In [19]:
# Seed layout: each entry names a column and lists its representative points.
# The lists have different lengths on purpose (3, 2 and 4 points).
column_specs = [
    dict(name='A', reps=[1, 2, 3]),
    dict(name='B', reps=[4, 5]),
    dict(name='C', reps=[6, 7, 8, 9]),
]

In [20]:
# Noise settings for each seed column: the distribution to sample from and its
# 'variance' parameter.
col_specs = {
    'A': {'distribution': 'normal', 'variance': 0.1},
    'B': {'distribution': 'uniform', 'variance': 0.2},
    'C': {'distribution': 'normal', 'variance': 0.3},
}

In [21]:
# Name of the transformation applied to each column after the noise is added.
cluster_params = dict(
    A=dict(fun='exp'),
    B=dict(fun='sqrt'),
    C=dict(fun='sqrt'),
)

In [22]:
# Build the seed frame, then draw 100 points around each representative.
# A fixed random_state makes both simulations reproducible on re-run.
df = cm.define_dataframe_structure(column_specs)
simulated_data = cm.simulate_data(df, n_points=100, col_specs=col_specs, random_state=42)
cm.non_globular_cluster(df, n_points=100, col_specs=col_specs, random_state=42, cluster_params=cluster_params)

Unnamed: 0,A,B,C
0,2.309679,2.011211,2.624858
1,2.592190,1.979347,2.357392
2,2.996528,2.000744,2.421907
3,2.530441,1.989438,2.481485
4,1.405374,1.965387,2.221612
...,...,...,...
395,,,3.004432
396,,,3.037800
397,,,3.007002
398,,,2.873946


In [6]:
# Export the simulated points to a formatted text report.
cm.export_formatted(simulated_data, 'simulated_data.txt')

Data successfully exported to simulated_data.txt


In [13]:
# Noise settings for each column of the seed DataFrame
column_specs = dict(
    column1=dict(distribution='normal', variance=1.0),
    column2=dict(distribution='uniform', variance=0.5),
    column3=dict(distribution='normal', variance=0.2),
)

# Seed DataFrame: one row per representative point, one column per feature
seed_df = pd.DataFrame(
    [[1, 4, 7], [2, 5, 8], [3, 6, 9]],
    columns=['column1', 'column2', 'column3'],
)

# Seed value intended for reproducible sampling
random_state = 42

# Transformation applied to each column when simulating non-globular clusters
cluster_params = {
    col: {'fun': fun}
    for col, fun in [('column1', 'exp'), ('column2', 'sqrt'), ('column3', 'sqrt')]
}


In [15]:
# Pass the seed defined earlier so the simulated clusters are reproducible.
cm.non_globular_cluster(seed_df, n_points=100, col_specs=column_specs, random_state=random_state, cluster_params=cluster_params)

Unnamed: 0,column1,column2,column3
0,3.635551,1.889136,2.522889
1,14.829927,1.964552,2.651836
2,1.978799,2.025542,2.568173
3,7.245922,1.904517,2.593755
4,4.260711,1.892024,2.754926
...,...,...,...
295,24.700105,2.422413,2.977004
296,20.788696,2.496718,3.070961
297,18.806823,2.524573,2.937278
298,19.762542,2.471421,2.986785


In [17]:
# Display the seed representatives that were passed to non_globular_cluster above.
seed_df


Unnamed: 0,column1,column2,column3
0,1,4,7
1,2,5,8
2,3,6,9
