In [0]:
from scipy.spatial import distance

def tomek_links(data, target):
    """
    Return the indices of all samples that take part in a Tomek link.

    Two samples a and b form a Tomek link when:
    (i) the nearest neighbor of a is b,
    (ii) the nearest neighbor of b is a,
    (iii) a and b belong to different classes.

    Distances are Euclidean. A sample is never considered its own neighbor, and when several samples are
    equally close the one with the lowest index is taken as the nearest neighbor.

    E.g. if the first and third samples, along with the fourth and sixth samples, are the only two Tomek
    links in the data, the result is the set {0, 2, 3, 5}.

    :param data: a numpy array (or nested sequence) of shape n_samples x n_features with the features
    :param target: a numpy array (or sequence) of shape n_samples or n_samples x 1 with the targets
    :return: a set of int indices of the samples that belong to a Tomek link; empty when there are fewer
             than two samples
    """
    # Local import: numpy is not imported at file level in this notebook cell.
    import numpy as np

    data = np.asarray(data, dtype=float)
    n_samples = data.shape[0]
    if n_samples < 2:
        # A link needs two distinct samples.
        return set()
    if data.ndim == 1:
        # Treat a flat vector as n_samples one-dimensional points.
        data = data.reshape(-1, 1)
    labels = np.asarray(target)

    # nearest[i] is the index of the sample closest to sample i (excluding i itself).
    # One row of distances is computed at a time, so memory stays O(n_samples).
    nearest = np.empty(n_samples, dtype=int)
    for i in range(n_samples):
        row = distance.cdist(data[i:i + 1], data, metric="euclidean")[0]
        # Mask the self-distance so that a sample can never be picked as its own neighbor.
        row[i] = np.inf
        # argmin returns the first minimum, i.e. the lowest index among ties.
        nearest[i] = row.argmin()

    links = set()
    for i, j in enumerate(nearest):
        # `i < j` visits every mutual pair exactly once.
        if i < j and nearest[j] == i and np.any(labels[i] != labels[j]):
            links.update((i, int(j)))
    return links
    

In [49]:
from sklearn.datasets import load_digits

# Load the scikit-learn digits dataset; `digits.data` holds the feature rows and `digits.target` the labels.
digits = load_digits()
# Bare expression: in a notebook cell its value (the set of sample indices in Tomek links) is shown as the output.
tomek_links(digits.data, digits.target)

{1575, 1582}