In [8]:
from Bio import SeqIO
import itertools
alphabet = 'ACDEFGHIKLMNPQRSTVWY'
alphabet_list = list('ACDEFGHIKLMNPQRSTVWY')

In [198]:
names_file = 'yob_all.txt'
human_proteome_file = 'uniprot-reviewed%3Ayes+AND+proteome%3Aup000005640.fasta'
chosen_gender = 'F'
max_Kmer = 5
min_Kmer = 3
year_cutoff = 1970
min_count = 1000

In [192]:
# Make all combinations of "amino acid names":
combi_names = set()
for i in range(min_Kmer, max_Kmer+1):
    for name in itertools.product(alphabet_list, repeat=i):
        combi_names.add(''.join(name))
print('Number of names: {}'.format(len(combi_names)))

Number of names: 3368000


In [193]:
# Read all US names according to social security registration:
US_names = dict()
for l in open(names_file):
    l = l.strip()
    year, name, gender, count = l.split(',')
    key = (name.upper(), gender)
    if key in US_names:
        US_names[key].append({int(year): int(count)})
    else:
        US_names[key] = [{int(year): int(count)}]

In [199]:
# Subset the "amino acid names" according to the minimum popularity:
US_names_sub = dict()
for k in US_names:
    # Filter on gender and being composed of amino acids:
    if k[1] == chosen_gender and k[0] in combi_names:
        sum_count = 0
        for years in US_names[k]:
            if list(years.keys())[0] >= year_cutoff:
                sum_count += list(years.values())[0]
        if sum_count >= min_count:
            US_names_sub[k[0]] = sum_count
US_names_sub = sorted(US_names_sub.items(), key=lambda x: x[1], reverse=True)
print('The top 10 "amino acid names" out of {} total names.'.format(len(US_names_sub)))
print('{:6}{}'.format(*['NAME', 'COUNT']))
print('\n'.join(['{:6}{}'.format(*n) for n in US_names_sub[:10]]))

The top 10 "amino acid names" out of 1145 total names.
NAME  COUNT
SARAH 775331
EMILY 730071
AMY   514554
MEGAN 427797
EMMA  393085
LISA  368363
MARY  362288
KAYLA 331985
KELLY 326185
ANNA  302210


In [200]:
# Read the human proteome
proteome_dict = dict()
for record in SeqIO.parse(human_proteome_file, 'fasta'):
    seq = str(record.seq)
    ID = str(record.id)
    proteome_dict[ID] = seq.upper()

# Subset by presence in the human proteome:
sub = dict()
for t in US_names_sub:
    name, count = t
    found = False
    for seq, ID in proteome_dict.items():
        if name in seq:
            found = True
            break
    if found:
        sub[name] = count
US_names_sub = sorted(sub.items(), key=lambda x: x[1], reverse=True)
print('The top 10 "amino acid names" out of {} total names after subsetting by presence in the human proteome.'.format(len(US_names_sub)))
print('{:6}{}'.format(*['NAME', 'COUNT']))
print('\n'.join(['{:6}{}'.format(*n) for n in US_names_sub[:10]]))

The top 10 "amino acid names" out of 79 total names after subsetting by presence in the human proteome.
NAME  COUNT
AMY   514554
SARA  293058
AVA   221360
MIA   198225
TARA  155459
TINA  122131
ANA   83581
EVA   70469
MYA   51430
IVY   33735


In [201]:
from Bio.Align.Applications import MafftCommandline
import os
fname = 'tmp.fasta'
out_fasta = 'names.fasta'
with open(fname, 'w') as fho:
    for i, t in enumerate(US_names_sub):
        name, count = t
        fho.write('>{}_{}\n{}\n'.format(name, count, name))
mafft_cline = MafftCommandline(input=fname)
mafft_results = mafft_cline()[0]
with open(out_fasta, 'w') as fho:
    fho.write(mafft_results[:-1])
os.remove(fname)

In [202]:
seqs = list()
for line in mafft_results.split('\n'):
    if not line.startswith('>') and line != '':
        seqs.append(line.strip())

In [144]:
%%bash
ete3 build -w standard_fasttree -a names.fasta -o names_tree/ --clearall

Traceback (most recent call last):
  File "/usr/local/bin/ete3", line 7, in <module>
    from ete3.tools.ete import main
  File "/usr/local/lib/python3.6/site-packages/ete3/tools/ete.py", line 54, in <module>
    from . import (ete_split, ete_expand, ete_annotate, ete_ncbiquery, ete_view,
  File "/usr/local/lib/python3.6/site-packages/ete3/tools/ete_view.py", line 48, in <module>
    from .. import (Tree, PhyloTree, TextFace, RectFace, faces, TreeStyle, CircleFace, AttrFace,
ImportError: cannot import name 'TextFace'


In [203]:
from jellyfish import levenshtein_distance
import numpy as np
from sklearn.cluster import dbscan
names = [el[0] for el in US_names_sub]
def lev_metric(x, y):
    i, j = int(x[0]), int(y[0])     # extract indices
    return(levenshtein_distance(names[i], names[j]))
X = np.arange(len(names)).reshape(-1, 1)
db = dbscan(X, metric=lev_metric, eps=1, min_samples=1)

In [215]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=10)
km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [217]:
km.cluster_centers_

array([[ 36. ],
       [ 67.5],
       [ 10.5],
       [ 52. ],
       [ 27. ],
       [  3. ],
       [ 75. ],
       [ 44.5],
       [ 59.5],
       [ 18.5]])

In [204]:
db

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78]),
 array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
         2,  0,  0,  0,  0,  0,  0,  0,  3,  4,  0,  0,  0,  0,  3,  5,  0,
         0,  0,  0,  0,  0,  1,  6,  0,  0,  0,  0,  7,  0,  0,  0,  0,  8,
         0,  9,  0,  0, 10,  0,  0,  0, 11, 12,  0,  0,  0,  0, 13,  0, 14,
         0, 15,  0,  0,  0, 16,  0,  0,  0,  0,  0]))

In [205]:
name_clusters = dict()
for i, el in enumerate(US_names_sub):
    name, count = el
    cl_idx = db[1][i]
    if cl_idx in name_clusters:
        name_clusters[cl_idx].append((name, count))
    else:
        name_clusters[cl_idx] = [(name, count)]

name_clusters = {k: sorted(v, key=lambda x: x[1], reverse=True) for k, v in name_clusters.items()}

In [206]:
levenshtein_distance('AMY', 'SARA')

3

In [207]:
name_clusters

{0: [('AMY', 514554),
  ('SARA', 293058),
  ('AVA', 221360),
  ('MIA', 198225),
  ('TARA', 155459),
  ('TINA', 122131),
  ('ANA', 83581),
  ('EVA', 70469),
  ('MYA', 51430),
  ('IVY', 33735),
  ('TIA', 29766),
  ('KIM', 27431),
  ('RITA', 23339),
  ('LEA', 19660),
  ('LIA', 15987),
  ('CALI', 14997),
  ('ADA', 13620),
  ('ALI', 9633),
  ('TERA', 8802),
  ('IDA', 7336),
  ('KIA', 7140),
  ('MAE', 6639),
  ('AIDA', 6448),
  ('KAI', 5534),
  ('RAE', 5284),
  ('NYA', 5055),
  ('MAI', 4614),
  ('KERA', 3273),
  ('RIA', 3054),
  ('MEG', 2956),
  ('EMA', 2751),
  ('ILA', 2672),
  ('ARI', 2664),
  ('EMI', 2456),
  ('MICA', 2337),
  ('TEA', 2318),
  ('IVA', 2250),
  ('INA', 2134),
  ('TALA', 2092),
  ('PIA', 2088),
  ('NADA', 2082),
  ('ADAM', 1922),
  ('TAI', 1675),
  ('ERMA', 1587),
  ('DIA', 1527),
  ('TENA', 1459),
  ('SERA', 1449),
  ('MEA', 1421),
  ('ANH', 1399),
  ('ELA', 1384),
  ('CERA', 1337),
  ('LITA', 1267),
  ('SIA', 1206),
  ('ENA', 1187),
  ('TEAL', 1179),
  ('MEI', 1126),
  ('