In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [4]:
# Create the dataframe
# data = {
#    'Feature1': [1.5, 2.0, 1.8, np.nan, 2.2],
#    'Feature2': [3.2, 2.1, 2.5, 2.9, 3.6],
#    'Feature3': [5.1, 4.9, 5.3, 4.7, 5.0],
#    'Feature4': [6.5, 6.7, np.nan, 6.4, 6.1],
#    'Category': [1, 2, 3, 2, 1]
# }

# data = pd.DataFrame(data)

import pandas as pd
import numpy as np

# Create the dataframe
data = {
    'Feature1': [1.5, 2.0, 1.8, np.nan, 2.2, 3.7, 4.1, 4.3, 3.3, 3.9],
    'Feature2': [3.2, 2.5, np.nan, 2.9, 3.6, 4.2, 4.8, 4.4, 4.7, 4.6],
    'Feature3': [5.1, 4.9, 5.3, 4.7, 5.0, 5.2, 6.2, 6.9, 5.8, 6.0],
    'Feature4': [6.5, 6.7, np.nan, 6.4, 6.1, 7.0, 6.1, 7.2, 7.5, 6.9],
    'Category': [1, 2, 3, 2, 1, 2, 3, 1, 2, 3]
}

data = pd.DataFrame(data)

# Display the dataframe
data

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category
0,1.5,3.2,5.1,6.5,1
1,2.0,2.5,4.9,6.7,2
2,1.8,,5.3,,3
3,,2.9,4.7,6.4,2
4,2.2,3.6,5.0,6.1,1
5,3.7,4.2,5.2,7.0,2
6,4.1,4.8,6.2,6.1,3
7,4.3,4.4,6.9,7.2,1
8,3.3,4.7,5.8,7.5,2
9,3.9,4.6,6.0,6.9,3


In [3]:
def filter_missing(df, axis='rows'):
    if axis == 'rows':
        df = df.dropna(axis=0)
        return df
    elif axis == 'columns' or axis == "cols":
        df = df.dropna(axis=1)    
        return df
    else:
        raise ValueError(f"Invalid axis option '{axis}'. Please choose 'rows' or 'columns'.")

def collect_missing(df):
    df = df[df.isna().any(axis=1)]
    return df

def determine_missing_proportions(df):
    f = []; p = []
    for col in df.columns:
        f.append(col)
        p.append(df[col].isna().sum()/len(df))
    d = dict(zip(f, p))
    return(d)

In [11]:
nm = filter_missing(data)
nm

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category
0,1.5,3.2,5.1,6.5,1
1,2.0,2.5,4.9,6.7,2
4,2.2,3.6,5.0,6.1,1
5,3.7,4.2,5.2,7.0,2
6,4.1,4.8,6.2,6.1,3
7,4.3,4.4,6.9,7.2,1
8,3.3,4.7,5.8,7.5,2
9,3.9,4.6,6.0,6.9,3


In [12]:
cm = collect_missing(data)
cm

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category
2,1.8,,5.3,,3
3,,2.9,4.7,6.4,2


In [13]:
res = determine_missing_proportions(data)
res

{'Feature1': 0.1,
 'Feature2': 0.1,
 'Feature3': 0.0,
 'Feature4': 0.1,
 'Category': 0.0}

In [None]:
for ind in cm.index:
    print(np.array(cm.loc[[ind]].dropna(axis=1)))

In [None]:
"""
!pip install wget 

import wget
wget.download('https://raw.githubusercontent.com/BorisMuzellec/MissingDataOT/master/utils.py')

import numpy as np
import pandas as pd
from utils import *
import torch
import seaborn as sns
"""

In [None]:
"""
# Function produce_NA for generating missing values ------------------------------------------------------

def produce_NA(X, p_miss, mecha="MCAR", opt=None, p_obs=None, q=None):
    #\"""
    from: https://rmisstastic.netlify.app/how-to/python/generate_html/how%20to%20generate%20missing%20values
    Generate missing values for specifics missing-data mechanism and proportion of missing values. 
    
    Parameters
    ----------
    X : torch.DoubleTensor or np.ndarray, shape (n, d)
        Data for which missing values will be simulated.
        If a numpy array is provided, it will be converted to a pytorch tensor.
    p_miss : float
        Proportion of missing values to generate for variables which will have missing values.
    mecha : str, 
            Indicates the missing-data mechanism to be used. "MCAR" by default, "MAR", "MNAR" or "MNARsmask"
    opt: str, 
         For mecha = "MNAR", it indicates how the missing-data mechanism is generated: using a logistic regression ("logistic"), quantile censorship ("quantile") or logistic regression for generating a self-masked MNAR mechanism ("selfmasked").
    p_obs : float
            If mecha = "MAR", or mecha = "MNAR" with opt = "logistic" or "quanti", proportion of variables with *no* missing values that will be used for the logistic masking model.
    q : float
        If mecha = "MNAR" and opt = "quanti", quantile level at which the cuts should occur.
    
    Returns
    ----------
    A dictionnary containing:
    'X_init': the initial data matrix.
    'X_incomp': the data with the generated missing values.
    'mask': a matrix indexing the generated missing values.s
    #\"""
    
    to_torch = torch.is_tensor(X) ## output a pytorch tensor, or a numpy array
    if not to_torch:
        X = X.astype(np.float32)
        X = torch.from_numpy(X)
    
    if mecha == "MAR":
        mask = MAR_mask(X, p_miss, p_obs).double()
    elif mecha == "MNAR" and opt == "logistic":
        mask = MNAR_mask_logistic(X, p_miss, p_obs).double()
    elif mecha == "MNAR" and opt == "quantile":
        mask = MNAR_mask_quantiles(X, p_miss, q, 1-p_obs).double()
    elif mecha == "MNAR" and opt == "selfmasked":
        mask = MNAR_self_mask_logistic(X, p_miss).double()
    else:
        mask = (torch.rand(X.shape) < p_miss).double()
    
    X_nas = X.clone()
    X_nas[mask.bool()] = np.nan
    
    return {'X_init': X.double(), 'X_incomp': X_nas.double(), 'mask': mask}
    """

In [None]:
"""
# Create the dataframe
data = {
    'Feature1': [1.5, 2.0, 1.8, 2.0, 2.2],
    'Feature2': [3.2, 2.1, 2.5, 2.9, 3.6],
    'Feature3': [5.1, 4.9, 5.3, 4.7, 5.0],
    'Feature4': [6.5, 6.7, 7.3, 6.4, 6.1],
    'Category': [1, 2, 3, 2, 1]
}

data = pd.DataFrame(data)
"""

In [228]:
def simulate_missing(df, method="nearest-neighbors"):
    def filter_missing(df, axis='rows'):
        if axis == 'rows':
            df = df.dropna(axis=0)
            return df
        elif axis == 'columns' or axis == "cols":
            df = df.dropna(axis=1)    
            return df
        else:
            raise ValueError(f"Invalid axis option '{axis}'. Please choose 'rows' or 'columns'.")
    
    def collect_missing(df):
        df = df[df.isna().any(axis=1)]
        return df
    
    def determine_missing_proportions(df):
        f = []; p = []
        for col in df.columns:
            f.append(col)
            p.append(df[col].isna().sum()/len(df))
        d = dict(zip(f, p))
        return(d)
        
    nm = filter_missing(df)
    cm = collect_missing(df)
    mp = determine_missing_proportions(df)
    
    if method == "nearest-neighbors":
        scaler = StandardScaler()
        
        nm = pd.DataFrame(scaler.fit_transform(nm), index=nm.index)
        cm = pd.DataFrame(scaler.transform(cm), index=cm.index)
        
        counter_dict = dict(zip(cm.index, np.repeat(1, len(cm.index))))
        n_feat = []; distance_list = []; neighbor_list = []
        
        for ind in pd.DataFrame(cm).index:
            n_feat.append(cm.loc[ind].dropna().shape[0])
            
            nbrs = NearestNeighbors(n_neighbors=1).fit(nm[list(pd.DataFrame(cm).loc[ind].dropna().index)])
            distances, indices = nbrs.kneighbors(pd.DataFrame(filter_missing(cm.loc[ind], axis='rows')).T)

            distance_list.append(distances.ravel()[0])
            neighbor_list.append(indices.ravel()[0])

    results = pd.DataFrame([cm.index, n_feat, distance_list, neighbor_list]).T
    results.columns = ["ind", "n_features", "distances", "nn"]
    print(results)
    
    if len(results['nn'].tolist()) == len(set(results['nn'])):
        print("generate_simulated")
    else:
        for item in set(results['nn'][test.duplicated('nn', keep=False)].tolist()): #for each item duplicated 
            print(item)
            for ind in results['ind'][results['n_features'] == min(results['n_features'][results['nn'] == item])]: #get the indices with the minimum number of features
                print(ind)
                print(counter_dict)
                counter_dict[ind] = counter_dict[ind]+1 #add one to the counter
                print(counter_dict)
    
                nbrs = NearestNeighbors(n_neighbors=counter_dict[ind]).fit(nm[list(pd.DataFrame(cm).loc[ind].dropna().index)])
                distances, indices = nbrs.kneighbors(pd.DataFrame(filter_missing(cm.loc[ind], axis='rows')).T)
                print(distances, indices)
    
                dist = distances.ravel()[counter_dict[ind]-1]
                print(dist)
                neighbor = indices.ravel()[counter_dict[ind]-1]
                print(neighbor)
                
                results.loc[(results['nn'] == item) & (results['n_features'] == ind), 'distances'] = dist
                results.loc[(results['nn'] == item) & (results['n_features'] == ind), 'nn'] = neighbor

                print(results)
    return results

In [229]:
results = simulate_missing(data)

   ind  n_features  distances   nn
0  2.0         3.0   1.430593  1.0
1  3.0         4.0   0.877835  1.0
1.0
2.0
{2: 1, 3: 1}
{2: 2, 3: 1}
[[1.4305934  2.10886112]] [[1 6]]
2.1088611235706307
6
   ind  n_features  distances   nn
0  2.0         3.0   1.430593  1.0
1  3.0         4.0   0.877835  1.0


In [230]:
results

Unnamed: 0,ind,n_features,distances,nn
0,2.0,3.0,1.430593,1.0
1,3.0,4.0,0.877835,1.0


In [205]:
if len(results['nn'].tolist()) == len(set(results['nn'])):
    print("generate_simulated")
else:
    for item in set(results['nn'][test.duplicated('nn', keep=False)].tolist()): #for each item duplicated 
        for ind in results['ind'][results['n_features'] == min(results['n_features'][results['nn'] == item])]: #get the indices with the minimum number of features
            counter_dict[ind] = counter_dict[ind]+1 #add one to the counter

            nbrs = NearestNeighbors(n_neighbors=counter_dict[ind]).fit(nm[list(pd.DataFrame(cm).loc[ind].dropna().index)])
            distances, indices = nbrs.kneighbors(pd.DataFrame(filter_missing(cm.loc[ind], axis='rows')).T)

            dist = distances.ravel()[counter_dict[ind]-1]
            neighbor = indices.ravel()[counter_dict[ind]-1]

            results['distances'].iloc[(results[(results['nn'] == item) & (results['n_features'] == ind)]).index[0]] = dist
            results['nn'].iloc[(results[(results['nn'] == item) & (results['n_features'] == ind)]).index[0]] = neighbor

NameError: name 'counter_dict' is not defined

In [169]:
for item in set(results['nn'][test.duplicated('nn', keep=False)].tolist()):
    for ind in results['ind'][results['n_features'] == min(results['n_features'][results['nn'] == item])]:
        print(ind)

2.0


In [202]:
(results[(results['nn'] == 1) & (results['n_features'] == 3.0)]).index[0]

0

In [204]:
results['distances'].iloc[0]

1.4305933955406143

In [177]:
results['nn'] == 1

0    True
1    True
Name: nn, dtype: bool

In [172]:
results

Unnamed: 0,ind,n_features,distances,nn
0,2.0,3.0,1.430593,1.0
1,3.0,4.0,0.877835,1.0


In [141]:
for item in set(test['nn'][test.duplicated('nn', keep=False)].tolist()):
    

1.0


In [132]:
test[test.duplicated('nn', keep=False)].index.tolist()

[0, 1]

In [100]:
pos

[1, 1]

In [51]:
list(cm.index)

[2, 3]

In [102]:
[i for i, x in enumerate(pos) if pos.count(x) > 1]

[0, 1]

In [114]:
[i for i, j in enumerate(pos) if j == set([pos[0], pos[1]]).pop()]

[0, 1]

In [111]:
set([pos[0], pos[1]]).pop()

1

In [38]:
nbrs = NearestNeighbors(n_neighbors=1).fit(nm[list(pd.DataFrame(cm).loc[2].dropna().index)])

In [39]:
distances, indices = nbrs.kneighbors(pd.DataFrame(filter_missing(cm.loc[2], axis='rows')).T)

In [40]:
distances

array([[1.4305934]])

In [41]:
indices

array([[1]])

In [17]:
for ind in cm.index:
    print(ind)

2
3


In [52]:
np.repeat(1, 3)

array([1, 1, 1])

In [56]:
dict(zip(cm.index, np.repeat(1, len(cm.index))))

{2: 1, 3: 1}

In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

def simulate_missing(df, method="nearest-neighbors"):
    def filter_missing(df, axis='rows'):
        if axis == 'rows':
            df = df.dropna(axis=0)
            return df
        elif axis in ('columns', 'cols'):
            df = df.dropna(axis=1)    
            return df
        else:
            raise ValueError(f"Invalid axis option '{axis}'. Please choose 'rows' or 'columns'.")

    def collect_missing(df):
        df = df[df.isna().any(axis=1)]
        return df
    
    def determine_missing_proportions(df):
        f = []; p = []
        for col in df.columns:
            f.append(col)
            p.append(df[col].isna().sum()/len(df))
        d = dict(zip(f, p))
        return d
        
    nm = filter_missing(df)
    cm = collect_missing(df)
    mp = determine_missing_proportions(df)
    
    if method == "nearest-neighbors":
        scaler = StandardScaler()
        
        nm = pd.DataFrame(scaler.fit_transform(nm), index=nm.index)
        cm = pd.DataFrame(scaler.transform(cm), index=cm.index)
        
        counter_dict = dict(zip(cm.index, np.repeat(1, len(cm.index))))
        n_feat = []; distance_list = []; neighbor_list = []
        
        for ind in pd.DataFrame(cm).index:
            n_feat.append(cm.loc[ind].dropna().shape[0])
            
            nbrs = NearestNeighbors(n_neighbors=1).fit(nm[list(pd.DataFrame(cm).loc[ind].dropna().index)])
            distances, indices = nbrs.kneighbors(pd.DataFrame(filter_missing(cm.loc[ind], axis='rows')).T)

            distance_list.append(distances.ravel()[0])
            neighbor_list.append(indices.ravel()[0])

    results = pd.DataFrame([cm.index, n_feat, distance_list, neighbor_list]).T
    results.columns = ["ind", "n_features", "distances", "nn"]

    if len(results[results.duplicated('nn', keep=False)]['nn'].tolist()) == len(set(results[results.duplicated('nn', keep=False)]['nn'].tolist())):
        print("generate_simulated")
    else:
        for item in set(results[results.duplicated('nn', keep=False)]['nn'].tolist()):
            for ind in results[results['n_features'] == min(results[results['nn'] == item]['n_features'])]['ind']:
                counter_dict[ind] = counter_dict[ind] + 1

                nbrs = NearestNeighbors(n_neighbors=counter_dict[ind]).fit(nm[list(pd.DataFrame(cm).loc[ind].dropna().index)])
                distances, indices = nbrs.kneighbors(pd.DataFrame(filter_missing(cm.loc[ind], axis='rows')).T)

                dist = distances.ravel()[counter_dict[ind]-1]
                neighbor = indices.ravel()[counter_dict[ind]-1]

                # Use .iloc to update the values in the 'results' DataFrame
                results.loc[results['ind'] == ind, 'distances'] = dist
                results.loc[results['ind'] == ind, 'nn'] = neighbor
    
    return results

In [16]:
%%time
simulate_missing(data)

CPU times: user 32.1 ms, sys: 71 µs, total: 32.1 ms
Wall time: 31.4 ms


Unnamed: 0,ind,n_features,distances,nn
0,2.0,3.0,2.108861,6.0
1,3.0,4.0,0.877835,1.0


In [235]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

def simulate_missing(df, method="nearest-neighbors"):
    def filter_missing(matrix):
        return [row for row in matrix if not any(np.isnan(row))]
    
    def collect_missing(matrix):
        return [row for row in matrix if any(np.isnan(row))]
    
    def determine_missing_proportions(matrix):
        proportions = []
        for col in range(len(matrix[0])):
            count_missing = sum(1 for row in matrix if np.isnan(row[col]))
            proportions.append(count_missing / len(matrix))
        return proportions
    
    def scale_data(matrix):
        scaler = StandardScaler()
        return scaler.fit_transform(matrix)

    def find_nearest_neighbors(nm, cm):
        counter_dict = {ind: 1 for ind in range(len(cm))}

        n_feat = []
        distance_list = []
        neighbor_list = []

        for ind, cm_row in enumerate(cm):
            n_feat.append(len(cm_row) - np.isnan(cm_row).sum())

            # Flatten and remove NaNs for NearestNeighbors fit
            nm_filtered = nm[:, ~np.isnan(cm_row)]
            cm_row_filtered = cm_row[~np.isnan(cm_row)]

            nbrs = NearestNeighbors(n_neighbors=1).fit(nm_filtered)
            distances, indices = nbrs.kneighbors([cm_row_filtered])

            distance_list.append(distances.ravel()[0])
            neighbor_list.append(indices.ravel()[0])

        return n_feat, distance_list, neighbor_list, counter_dict

    # Convert DataFrame to list of lists
    df_matrix = df.values.tolist()

    # Filter missing and complete rows
    nm = scale_data(filter_missing(df_matrix))
    cm = scale_data(collect_missing(df_matrix))

    n_feat, distance_list, neighbor_list, counter_dict = find_nearest_neighbors(nm, cm)

    results = []
    for i, ind in enumerate(collect_missing(df_matrix)):
        results.append([ind, n_feat[i], distance_list[i], neighbor_list[i]])

    if len(set(nn for _, _, _, nn in results)) == len(results):
        print("generate_simulated")
    else:
        for item in set(nn for _, _, _, nn in results if (nn, _) in results):
            for i, (ind, n_features, _, nn) in enumerate(results):
                if nn == item:
                    counter_dict[ind] += 1

                    # Flatten and remove NaNs for NearestNeighbors fit
                    nm_filtered = nm[:, ~np.isnan(ind)]
                    cm_row_filtered = ind[~np.isnan(ind)]

                    nbrs = NearestNeighbors(n_neighbors=counter_dict[ind]).fit(nm_filtered)
                    distances, indices = nbrs.kneighbors([cm_row_filtered])

                    dist = distances.ravel()[counter_dict[ind]-1]
                    neighbor = indices.ravel()[counter_dict[ind]-1]

                    results[i][2] = dist
                    results[i][3] = neighbor

    # Convert results back to a DataFrame
    results_df = pd.DataFrame(results, columns=["ind", "n_features", "distances", "nn"])

    return results_df

In [236]:
simulate_missing(data)

generate_simulated


Unnamed: 0,ind,n_features,distances,nn
0,"[1.8, nan, 5.3, nan, 3.0]",3,0.997653,7
1,"[nan, 2.9, 4.7, 6.4, 2.0]",4,1.189088,0


In [250]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

def simulate_missing(df, method="nearest-neighbors"):
    def filter_missing(matrix):
        return [row for row in matrix if not any(np.isnan(row))]
    
    def collect_missing(matrix):
        return [row for row in matrix if any(np.isnan(row))]
    
    def determine_missing_proportions(matrix):
        proportions = []
        for col in range(len(matrix[0])):
            count_missing = sum(1 for row in matrix if np.isnan(row[col]))
            proportions.append(count_missing / len(matrix))
        return proportions
    
    # def scale_data(matrix):
    #     scaler = StandardScaler()
    #     return scaler.fit_transform(matrix)

    def find_nearest_neighbors(nm, cm):
        counter_dict = {ind: 1 for ind in range(len(cm))}

        n_feat = []
        distance_list = []
        neighbor_list = []

        for ind, cm_row in enumerate(cm):
            n_feat.append(len(cm_row) - np.isnan(cm_row).sum())

            # Flatten and remove NaNs for NearestNeighbors fit
            nm_filtered = nm[:, ~np.isnan(cm_row)]
            cm_row_filtered = cm_row[~np.isnan(cm_row)]

            nbrs = NearestNeighbors(n_neighbors=1).fit(nm_filtered)
            distances, indices = nbrs.kneighbors([cm_row_filtered])

            distance_list.append(distances.ravel()[0])
            neighbor_list.append(indices.ravel()[0])

        return n_feat, distance_list, neighbor_list, counter_dict

    # Convert DataFrame to list of lists
    df_matrix = df.values.tolist()
    print(df_matrix)
    
    # Filter missing and complete rows
    scaler = StandardScaler()
    
    nm = scaler.fit_transform(filter_missing(df_matrix))
    print(nm)
    cm = scaler.transform(collect_missing(df_matrix))
    print(cm)
    
    n_feat, distance_list, neighbor_list, counter_dict = find_nearest_neighbors(nm, cm)

    results = []
    for i, ind in enumerate(collect_missing(df_matrix)):
        results.append([ind, n_feat[i], distance_list[i], neighbor_list[i]])

    print(results)
    
    if len(set(nn for _, _, _, nn in results)) == len(results):
        print("generate_simulated")
    else:
        for item in set(nn for _, _, _, nn in results if (nn, _) in results):
            for i, (ind, n_features, _, nn) in enumerate(results):
                if nn == item:
                    counter_dict[ind] += 1

                    # Flatten and remove NaNs for NearestNeighbors fit
                    nm_filtered = nm[:, ~np.isnan(ind)]
                    cm_row_filtered = ind[~np.isnan(ind)]

                    nbrs = NearestNeighbors(n_neighbors=counter_dict[ind]).fit(nm_filtered)
                    distances, indices = nbrs.kneighbors([cm_row_filtered])

                    dist = distances.ravel()[counter_dict[ind]-1]
                    neighbor = indices.ravel()[counter_dict[ind]-1]

                    results[i][2] = dist
                    results[i][3] = neighbor

    # Convert results back to a DataFrame
    results_df = pd.DataFrame(results, columns=["ind", "n_features", "distances", "nn"])
    results_df = results_df.astype({"n_features": int, "nn": int})

    return results_df


In [248]:
data

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category
0,1.5,3.2,5.1,6.5,1
1,2.0,2.5,4.9,6.7,2
2,1.8,,5.3,,3
3,,2.9,4.7,6.4,2
4,2.2,3.6,5.0,6.1,1
5,3.7,4.2,5.2,7.0,2
6,4.1,4.8,6.2,6.1,3
7,4.3,4.4,6.9,7.2,1
8,3.3,4.7,5.8,7.5,2
9,3.9,4.6,6.0,6.9,3


In [251]:
simulate_missing(data)

[[1.5, 3.2, 5.1, 6.5, 1.0], [2.0, 2.5, 4.9, 6.7, 2.0], [1.8, nan, 5.3, nan, 3.0], [nan, 2.9, 4.7, 6.4, 2.0], [2.2, 3.6, 5.0, 6.1, 1.0], [3.7, 4.2, 5.2, 7.0, 2.0], [4.1, 4.8, 6.2, 6.1, 3.0], [4.3, 4.4, 6.9, 7.2, 1.0], [3.3, 4.7, 5.8, 7.5, 2.0], [3.9, 4.6, 6.0, 6.9, 3.0]]
[[-1.6194427  -1.03931168 -0.81276877 -0.53300179 -1.12089708]
 [-1.12115264 -1.94870941 -1.11519436 -0.10660036  0.16012815]
 [-0.92183661 -0.51965584 -0.96398156 -1.38580466 -1.12089708]
 [ 0.57303357  0.25982792 -0.66155597  0.53300179  0.16012815]
 [ 0.97166562  1.03931168  0.85057197 -1.38580466  1.44115338]
 [ 1.17098165  0.51965584  1.90906152  0.95940322 -1.12089708]
 [ 0.17440152  0.90939772  0.24572079  1.59900537  0.16012815]
 [ 0.7723496   0.77948376  0.54814638  0.31980107  1.44115338]]
[[-1.32046866         nan -0.51034318         nan  1.44115338]
 [        nan -1.42905357 -1.41761994 -0.74620251  0.16012815]]
[[[1.8, nan, 5.3, nan, 3.0], 3, 1.4305933955406143, 1], [[nan, 2.9, 4.7, 6.4, 2.0], 4, 0.87783502

Unnamed: 0,ind,n_features,distances,nn
0,"[1.8, nan, 5.3, nan, 3.0]",3,1.430593,1
1,"[nan, 2.9, 4.7, 6.4, 2.0]",4,0.877835,1


In [19]:
import pandas as pd
import numpy as np
import torch
#import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

def simulate_missing(df, method="nearest-neighbors"):
    def filter_missing(df, axis='rows'):
        if axis == 'rows':
            df = df.dropna(axis=0)
            return df
        elif axis in ('columns', 'cols'):
            df = df.dropna(axis=1)    
            return df
        else:
            raise ValueError(f"Invalid axis option '{axis}'. Please choose 'rows' or 'columns'.")

    def collect_missing(df):
        df = df[df.isna().any(axis=1)]
        return df
    
    def determine_missing_proportions(df):
        f = []; p = []
        for col in df.columns:
            f.append(col)
            p.append(df[col].isna().sum()/len(df))
        d = dict(zip(f, p))
        return d
        
    nm = filter_missing(df)
    cm = collect_missing(df)
    mp = determine_missing_proportions(df)
    
    if method == "nearest-neighbors":
        scaler = StandardScaler()
        
        nm = pd.DataFrame(scaler.fit_transform(nm), index=nm.index)
        cm = pd.DataFrame(scaler.transform(cm), index=cm.index)
        
        counter_dict = dict(zip(cm.index, np.repeat(1, len(cm.index))))
        n_feat = []; distance_list = []; neighbor_list = []

        nm_tensor = torch.tensor(nm[list(pd.DataFrame(cm).columns)].values)
        cm_tensor = torch.tensor(cm.values)

        for i in range(cm.shape[0]):
            ind = cm.index[i]
            mask = ~torch.isnan(cm_tensor[i])
            n_feat.append(torch.count_nonzero(mask))

            nm_filtered = nm_tensor[:, mask]
            cm_filtered = cm_tensor[i, mask].unsqueeze(1)

            nbrs = NearestNeighbors(n_neighbors=1).fit(nm_filtered)
            distances, indices = nbrs.kneighbors(cm_filtered.T)

            distance_list.append(distances.ravel()[0])
            neighbor_list.append(indices.ravel()[0])

    results = pd.DataFrame([cm.index, n_feat, distance_list, neighbor_list]).T
    results.columns = ["ind", "n_features", "distances", "nn"]

    print(results)
    if len(results[results.duplicated('nn', keep=False)]['nn'].tolist()) == len(set(results[results.duplicated('nn', keep=False)]['nn'].tolist())):
        print("generate_simulated")
    else:
        for item in set(results[results.duplicated('nn', keep=False)]['nn'].tolist()):
            min_n_features = torch.min(torch.tensor(results[results['nn'] == item]['n_features'].tolist()))
            for ind in results[results['n_features'] == min_n_features]['ind']:
                counter_dict[ind] = counter_dict[ind] + 1

                mask = ~torch.isnan(cm_tensor[i])
                nm_filtered = nm_tensor[:, mask]
                cm_filtered = cm_tensor[i, mask].unsqueeze(1)

                nbrs = NearestNeighbors(n_neighbors=counter_dict[ind]).fit(nm_filtered)
                distances, indices = nbrs.kneighbors(cm_filtered.T)

                dist = distances.ravel()[counter_dict[ind]-1]
                neighbor = indices.ravel()[counter_dict[ind]-1]

                # Use .iloc to update the values in the 'results' DataFrame
                results.loc[results['ind'] == ind, 'distances'] = dist
                results.loc[results['ind'] == ind, 'nn'] = neighbor
    
    return results


In [20]:
%%time
simulate_missing(data)

  ind n_features distances nn
0   2  tensor(3)  1.430593  1
1   3  tensor(4)  0.877835  1
CPU times: user 25.6 ms, sys: 435 µs, total: 26 ms
Wall time: 25 ms


Unnamed: 0,ind,n_features,distances,nn
0,2,tensor(3),1.484663,0
1,3,tensor(4),0.877835,1
