In [None]:
# AnnData._core.utils::make_index_unique()
# 函数作用：
# 为索引中重复的值追加 "连接符 + 数字" 后缀（首次出现的值保持不变），使索引中的值唯一。

def make_index_unique(index: pd.Index, join: str = "-"):
    """
    Makes the index unique by appending a number string to each duplicate index element:
    '1', '2', etc.

    If a tentative name created by the algorithm already exists in the index, it tries
    the next integer in the sequence.

    The first occurrence of a non-unique value is ignored.

    Parameters
    ----------
    index
         The index whose duplicated entries should be renamed. It is not modified;
         the renaming is done on a copy of its values.
    join
         The connecting string between name and integer.

    Returns
    -------
    The input `index` object itself when it is already unique, otherwise a new
    `pd.Index` with the same `name` in which every repeated entry carries a
    `join` + integer suffix.

    Warns
    -----
    UserWarning
        If at least one generated name was already present in the index and had to
        be skipped. Up to five such colliding names are listed in the message.

    Examples
    --------
    >>> from anndata import AnnData
    >>> adata = AnnData(np.ones((2, 3)), var=pd.DataFrame(index=["a", "a", "b"]))
    >>> adata.var_names
    Index(['a', 'a', 'b'], dtype='object')
    >>> adata.var_names_make_unique()
    >>> adata.var_names
    Index(['a', 'a-1', 'b'], dtype='object')
    """
    if index.is_unique:
        return index
    # Local imports keep the function self-contained: nothing else in this file
    # brings `warnings` into scope, so the collision branch below would otherwise
    # fail with a NameError instead of emitting the warning.
    import warnings
    from collections import Counter

    values = index.values.copy()
    # True for every repeat of a value except its first occurrence.
    indices_dup = index.duplicated(keep="first")
    values_dup = values[indices_dup]
    # Every name currently taken, including the ones generated below.
    values_set = set(values)
    # Last suffix tried per original value; shared by all repeats of that value.
    counter = Counter()
    issue_interpretation_warning = False
    example_colliding_values = []
    for i, v in enumerate(values_dup):
        while True:
            counter[v] += 1
            tentative_new_name = v + join + str(counter[v])
            if tentative_new_name not in values_set:
                values_set.add(tentative_new_name)
                values_dup[i] = tentative_new_name
                break
            # The candidate is already taken: remember it for the warning and retry
            # with the next integer.
            issue_interpretation_warning = True
            if len(example_colliding_values) < 5:
                example_colliding_values.append(tentative_new_name)

    if issue_interpretation_warning:
        msg = (
            f"Suffix used ({join}[0-9]+) to deduplicate index values may make index values difficult to interpret. "
            "There values with a similar suffixes in the index. "
            "Consider using a different delimiter by passing `join={delimiter}`. "
            "Example key collisions generated by the make_index_unique algorithm: "
            f"{example_colliding_values}"
        )
        # 3: caller -> 2: `{obs,var}_names_make_unique` -> 1: here
        warnings.warn(msg, UserWarning, stacklevel=3)
    # Write the renamed entries back into their original positions.
    values[indices_dup] = values_dup
    index = pd.Index(values, name=index.name)
    return index

In [22]:
# Source walkthrough: the make_index_unique algorithm, one step at a time
import pandas as pd
from collections import Counter, defaultdict

join = '-'
# Sample index with repeated labels and an already-suffixed 'a-1' entry
index = pd.Index(['a', 'b', 'c', 'a-1', 'a', 'b', 'd', 'b', 'c', 'e'])
print(index)
# Work on a copy of the index values
values = index.values.copy()
print("values", values)
# Flag repeats: the first occurrence stays False, every later one is True
# (keep='first' is the default of Index.duplicated)
indices_dup = index.duplicated()
print(indices_dup)
# Pull out the flagged (repeated) values
values_dup = values[indices_dup]
print("values_dup", values_dup)
# Set of every name currently in use
values_set = set(values)
# Per-value counter. Counter is a dict subclass; a defaultdict with an int
# default would count the same way:
#     counter = defaultdict(int)
counter = Counter()
# Rename each repeated value in turn
for i, v in enumerate(values_dup):
    # Keep bumping the suffix until the candidate name is not taken yet
    tentative_new_name = None
    while tentative_new_name is None or tentative_new_name in values_set:
        counter[v] += 1
        tentative_new_name = f"{v}{join}{counter[v]}"
        print(i, v, counter[v], tentative_new_name, counter)
    values_set.add(tentative_new_name)
    values_dup[i] = tentative_new_name

print("values_dup", values_dup)
# Write the new names back into the flagged positions
values[indices_dup] = values_dup
print(index)
index = pd.Index(values, name=index.name)
print(index)


Index(['a', 'b', 'c', 'a-1', 'a', 'b', 'd', 'b', 'c', 'e'], dtype='object')
values ['a' 'b' 'c' 'a-1' 'a' 'b' 'd' 'b' 'c' 'e']
[False False False False  True  True False  True  True False]
values_dup ['a' 'b' 'b' 'c']
0 a 1 a-1 Counter({'a': 1})
0 a 2 a-2 Counter({'a': 2})
1 b 1 b-1 Counter({'a': 2, 'b': 1})
2 b 2 b-2 Counter({'a': 2, 'b': 2})
3 c 1 c-1 Counter({'a': 2, 'b': 2, 'c': 1})
values_dup ['a-2' 'b-1' 'b-2' 'c-1']
Index(['a', 'b', 'c', 'a-1', 'a', 'b', 'd', 'b', 'c', 'e'], dtype='object')
Index(['a', 'b', 'c', 'a-1', 'a-2', 'b-1', 'd', 'b-2', 'c-1', 'e'], dtype='object')


In [None]:
# Display the `values` array left behind by the walkthrough cell above, where the
# flagged duplicate positions have already been overwritten with the new names.
values

## 总结

标记重复元素，取出重复元素，再通过一个计数器向重复元素后面添加数字，最后替换重复元素，创建新的索引。

## 关键知识点

- 通过一维布尔数组对一维数组进行取值和赋值（布尔索引）：

```
values_dup = values[indices_dup]
修改 values_dup
values[indices_dup] = values_dup, 标记为True的元素与右边的值一一对应
```

values:

array(['a', 'b', 'c', 'a-1', 'a', 'b', 'd', 'b', 'c', 'e'], dtype=object)

indices_dup:

array([False, False, False, False,  True,  True, False,  True,  True,
       False])


- 为每一个元素使用一个计数器

```
from collections import Counter
from collections import defaultdict

counter = Counter()
counter = defaultdict(int)
```

计数器默认值为 0
```
首次遇到v时：
counter[v] = counter[v] + 1 = 0 + 1 = 1
再次遇到v时：
counter[v] = counter[v] + 1 = 1 + 1 = 2
以此类推
```

最终效果：每个取值的计数器从 1 开始，若生成的新名称已存在则 +1 继续尝试下一个数字，直到成功。除首次出现的元素保持不变外，其余重复元素都会加上一个数字后缀。