In [1]:
#source data: https://github.com/orgtre/google-books-ngram-frequency/tree/main
#potential source https://github.com/krishnakt031990/Crawl-Wiki-For-Acronyms
#potential source https://blog.gdeltproject.org/announcing-the-new-web-news-ngrams-3-0-dataset/

In [2]:
import csv
from itertools import combinations
import math

#Keys grouped by finger with the Colemak-DH layout
keygroup9 = {'q':['KC_Q', 0, 0],'a':['KC_A', 1, 0],'z':['KC_Z', 2, 0],'`':['KC_GRV', 0, 1]}
keygroup7 = {'w':['KC_W', 0, 0],'r':['KC_R', 1, 0],'x':['KC_X', 2, 0]}
keygroup5 = {'f':['KC_F', 0, 0],'s':['KC_S', 1, 0],'c':['KC_C', 2, 0]}
keygroup3 = {'p':['KC_P', 0, 0],'t':['KC_T', 1, 0],'d':['KC_D', 2, 0],'b':['KC_B', 0, 1],'g':['KC_G', 1, 1],'v':['KC_V', 2, 1]}
keygroup2 = {'l':['KC_L', 0, 0],'n':['KC_N', 1, 0],'h':['KC_H', 2, 0],'j':['KC_J', 0, 1],'m':['KC_M', 1, 1],'k':['KC_K', 2, 1]}
keygroup4 = {'u':['KC_U', 0, 0],'e':['KC_E', 1, 0],',':['KC_COMM', 2, 0]}
keygroup6 = {'y':['KC_Y', 0, 0],'i':['KC_I', 1, 0],'.':['KC_DOT', 2, 0]}
keygroup8 = {';':['KC_SCLN', 0, 0],'o':['KC_O', 1, 0],'/':['KC_SLSH', 2, 0],'\\':['KC_BSLS', 2, 1],"'":['KC_QUOT', 1, 1],
             '-':['KC_MINS', 0, 1]}

output_f = 'combos_colemakDH_standard.def'
#how many do you want to return
combo_list_limit = 30000
shingle_w = 1#0.15
multi_w = 0.25

#A thumb key that can be used in combos without affecting typing (tab, enter or a function key could also work)
#Change to None if this will not work for your keyboard, you just will have less combos
#Tried using a function key could allow 2 key combos for heavy typing but it is a large investment for such a valuable position
special_key = 'KC_BSPC'
special_key = None

#space will use the other thumb
space_key = 'KC_SPC'

#force abbreviations for known issues/preferances
item_abrevs = {
    ("KC_BSPC", " "): "the",
    ("k", "b"): "keyboard",
    ("k", "b", " "): "keyboard ",
    #("k", "y", " "): "key ", -possibly add space/comma/apostrophe s for 3 letter words with banned 2 char combos
    # also need to add "n't" words to the 1 grams
    # SQL
    ("s", "l", " "): "select ",
    ("w", "e", " "): "where ",
    ("u", "p", " "): "update ",
    ("a", "p", " "): "append ",
    ("c", "t", " "): "create ",
    ("i", "s", " "): "insert ",
    ("d", "a", "s"): "database",
    ("d", "e", " "): "delete ",
    ("g", "y", " "): "group by ",
    ("h", "g", " "): "having ",
    ("l", "e", " "): "like ",
    ("o", "b", " "): "order by ",
    ("d", "s", " "): "distinct ",
    ("v", "l", " "): "values ",
    ("u", "n", " "): "union ",
    ("t", "l", " "): "table ",
    ("j", "o", " "): "join ",
    # Python
    ("t", "u", " "): "true ",
    ("f", "l", " "): "false ",
    ("v", "l", "u"): "value",
    ("n", "e", " "): "none ",
    ("w", "l", " "): "while ",
    ("e", "f", " "): "elif ",
    ("e", "l", "s"): "else",
    ("e", "x", "c"): "except",
    ("l", "d", " "): "lambda ",
}
#item_abrevs = {("KC_BSPC", " "): "the"} #this combo can accidentally trigger in typing
item_abrevs = {}

item_add_weight = 0.1
item_add = [
    "the",
    "keyboard",
    "select",
    "where",
    "update",
    "append",
    "create",
    "insert",
    "database",
    "delete",
    "group by",
    "having",
    "like ",
    "order by",
    "distinct",
    "values",
    "union",
    "table",
    "join",
    "true",
    "false",
    "value",
    "none",
    "while",
    "elif",
    "else",
    "except",
    "lambda",
]
item_add = []

left_groups = [1, 3, 5, 7, 9]
right_groups = [0, 2, 4, 6, 8]

usable_keys = {}
usable_keys[' '] = {'kc': space_key, 'group': 1, 'row': 0, 'column': 0}
usable_keys[special_key] = {'kc': special_key, 'group': 0, 'row': 0, 'column': 0}

for key, val in keygroup2.items():
    usable_keys[key] = {'kc': val[0], 'group': 2, 'row': val[1], 'column': val[2]}
for key, val in keygroup3.items():
    usable_keys[key] = {'kc': val[0], 'group': 3, 'row': val[1], 'column': val[2]}
for key, val in keygroup4.items():
    usable_keys[key] = {'kc': val[0], 'group': 4, 'row': val[1], 'column': val[2]}
for key, val in keygroup5.items():
    usable_keys[key] = {'kc': val[0], 'group': 5, 'row': val[1], 'column': val[2]}
for key, val in keygroup6.items():
    usable_keys[key] = {'kc': val[0], 'group': 6, 'row': val[1], 'column': val[2]}
for key, val in keygroup7.items():
    usable_keys[key] = {'kc': val[0], 'group': 7, 'row': val[1], 'column': val[2]}
for key, val in keygroup8.items():
    usable_keys[key] = {'kc': val[0], 'group': 8, 'row': val[1], 'column': val[2]}
for key, val in keygroup9.items():
    usable_keys[key] = {'kc': val[0], 'group': 9, 'row': val[1], 'column': val[2]}

In [3]:
import csv
from itertools import combinations
import math

#Keys grouped by finger with the QWERTY layout
keygroup9 = {'q':['KC_Q', 0, 0],'a':['KC_A', 1, 0],'z':['KC_Z', 2, 0],'`':['KC_GRV', 0, 1]}
keygroup7 = {'w':['KC_W', 0, 0],'s':['KC_S', 1, 0],'x':['KC_X', 2, 0]}
keygroup5 = {'e':['KC_E', 0, 0],'d':['KC_D', 1, 0],'c':['KC_C', 2, 0]}
keygroup3 = {'r':['KC_R', 0, 0],'f':['KC_F', 1, 0],'v':['KC_V', 2, 0],'t':['KC_T', 0, 1],'g':['KC_G', 1, 1],'b':['KC_B', 2, 1]}
keygroup2 = {'u':['KC_U', 0, 0],'j':['KC_J', 1, 0],'m':['KC_M', 2, 0],'y':['KC_Y', 0, 1],'h':['KC_H', 1, 1],'n':['KC_N', 2, 1]}
keygroup4 = {'i':['KC_I', 0, 0],'k':['KC_K', 1, 0],',':['KC_COMM', 2, 0]}
keygroup6 = {'o':['KC_O', 0, 0],'l':['KC_L', 1, 0],'.':['KC_DOT', 2, 0]}
keygroup8 = {'p':['KC_P', 0, 0],';':['KC_SCLN', 1, 0],'/':['KC_SLSH', 2, 0],'\\':['KC_BSLS', 2, 1],"'":['KC_QUOT', 1, 1],
             '-':['KC_MINS', 0, 1]}

output_f = 'combos_qwerty_standard.def'
#how many do you want to return
combo_list_limit = 30000

#A thumb key that can be used in combos without affecting typing (tab, enter or a function key could also work)
#Change to None if this will not work for your keyboard, you just will have less combos
#Tried using a function key could allow 2 key combos for heavy typing but it is a large investment for such a valuable position
special_key = 'KC_BSPC'
special_key = None

#space will use the other thumb
space_key = 'KC_SPC'

#force abbreviations for known issues/preferances
item_abrevs = {
    ("KC_BSPC", " "): "the",
    ("k", "b"): "keyboard",
    ("k", "b", " "): "keyboard ",
    #("k", "y", " "): "key ", -possibly add space/comma/apostrophe s for 3 letter words with banned 2 char combos
    # also need to add "n't" words to the 1 grams
    # SQL
    ("s", "l", " "): "select ",
    ("w", "e", " "): "where ",
    ("u", "p", " "): "update ",
    ("a", "p", " "): "append ",
    ("c", "t", " "): "create ",
    ("i", "s", " "): "insert ",
    ("d", "a", "s"): "database",
    ("d", "e", " "): "delete ",
    ("g", "y", " "): "group by ",
    ("h", "g", " "): "having ",
    ("l", "e", " "): "like ",
    ("o", "b", " "): "order by ",
    ("d", "s", " "): "distinct ",
    ("v", "l", " "): "values ",
    ("u", "n", " "): "union ",
    ("t", "l", " "): "table ",
    ("j", "o", " "): "join ",
    # Python
    ("t", "u", " "): "true ",
    ("f", "l", " "): "false ",
    ("v", "l", "u"): "value",
    ("n", "e", " "): "none ",
    ("w", "l", " "): "while ",
    ("e", "f", " "): "elif ",
    ("e", "l", "s"): "else",
    ("e", "x", "c"): "except",
    ("l", "d", " "): "lambda ",
}
#item_abrevs = {("KC_BSPC", " "): "the"} #this combo can accidentally trigger in typing
item_abrevs = {}

item_add_weight = 0.1
item_add = [
    "the",
    "keyboard",
    "select",
    "where",
    "update",
    "append",
    "create",
    "insert",
    "database",
    "delete",
    "group by",
    "having",
    "like ",
    "order by",
    "distinct",
    "values",
    "union",
    "table",
    "join",
    "true",
    "false",
    "value",
    "none",
    "while",
    "elif",
    "else",
    "except",
    "lambda",
]
item_add = []

left_groups = [1, 3, 5, 7, 9]
right_groups = [0, 2, 4, 6, 8]

usable_keys = {}
usable_keys[' '] = {'kc': space_key, 'group': 1, 'row': 0, 'column': 0}
usable_keys[special_key] = {'kc': special_key, 'group': 0, 'row': 0, 'column': 0}

for key, val in keygroup2.items():
    usable_keys[key] = {'kc': val[0], 'group': 2, 'row': val[1], 'column': val[2]}
for key, val in keygroup3.items():
    usable_keys[key] = {'kc': val[0], 'group': 3, 'row': val[1], 'column': val[2]}
for key, val in keygroup4.items():
    usable_keys[key] = {'kc': val[0], 'group': 4, 'row': val[1], 'column': val[2]}
for key, val in keygroup5.items():
    usable_keys[key] = {'kc': val[0], 'group': 5, 'row': val[1], 'column': val[2]}
for key, val in keygroup6.items():
    usable_keys[key] = {'kc': val[0], 'group': 6, 'row': val[1], 'column': val[2]}
for key, val in keygroup7.items():
    usable_keys[key] = {'kc': val[0], 'group': 7, 'row': val[1], 'column': val[2]}
for key, val in keygroup8.items():
    usable_keys[key] = {'kc': val[0], 'group': 8, 'row': val[1], 'column': val[2]}
for key, val in keygroup9.items():
    usable_keys[key] = {'kc': val[0], 'group': 9, 'row': val[1], 'column': val[2]}

In [4]:
def get_gram_dict(file, dictname, multi, file_w):
    with open(file, mode ='r')as file:
        csvFile = csv.reader(file)
        i = 0
        for lines in csvFile:
            i += 1
            #if i > 10000:
                #break
            bad_chars = False
            for let in lines[0]:
                if let not in usable_keys.keys():
                    bad_chars = True
            if bad_chars is True:
                continue
            if i > 1:
                if allow_space is True:
                    word = lines[0] + " "
                else:
                    word = lines[0]
                if multi is True:
                    dictname[word] = int(lines[1]) * multi_w * file_w
                else:
                    dictname[word] = int(lines[1]) * file_w
                if i < 10000 and multi is False:
                    shingle_words.append(word)
            
def get_similar_item(test_item):
    sim_item_found = bool
    sim_item_found = False
    sim_item = ""
    sim_item_full = ""
    final_combo = ""
    for key, item in item_abrevs.items():
        if item in test_item:
            sim_item_full = item
            sim_item = item
            added_let = test_item.replace(item,'').lower()
            sim_key = key
            sim_item_found = True
            break
        else:
            if sim_item == "":
                min_let = round(len(item) * .75)
            elif len(item) <= len(sim_item):
                continue
            else:
                min_let = len(sim_item) + 1
            if len(item) < 4:
                min_let = len(item)
            for end_let in range(min_let, (len(item))):
                start = end_let - min_let
                check = item[start:end_let]
                if check in test_item:
                    sim_item_full = item
                    sim_item = check
                    added_let = test_item.replace(check,'').lower()
                    sim_key = key
                    sim_item_found = True
                    next_let = start + len(check)
                    next_check = check + item[next_let]
                    if next_check in test_item:
                        sim_item_full = item
                        sim_item = check
                        added_let = test_item.replace(check,'').lower()
                        sim_key = key
                    else:
                        break
    temp_chars = set()
    temp_dict = {}
    if sim_item_found is True:
        #if test_item == 'your':
            #print(test_item, sim_item)
        for char in sim_key:
            if char in test_item or char == special_key:
                temp_chars.add(char)
        for check_letter in added_let:
            if check_letter not in key:
                temp_dict[check_letter] = letter_ranks[check_letter]
        first_let = None
        while len(temp_dict.keys()) > 0:
            combo = min(temp_dict, key=temp_dict.get)
            if first_let == None:
                first_let = tuple(combo)
            test = tuple(combo)
            del temp_dict[combo]
            combo = test + tuple(temp_chars)
            combo = tuple(sorted(combo))
            test_combo = check_combo(combo, test_item)
            #swaps letters when it uses the same finger like w and h for 'where' and 'here'
            if test_combo is False and len(temp_dict.keys()) == 0 and len(temp_chars) > 2:
                adjust_len = len(temp_chars) - 1
                test_chars = tuple(temp_chars)
                adjust_chars = combinations(tuple(test_chars), adjust_len)
                for char_group in adjust_chars:
                    temp_combo = first_let + tuple(char_group)
                    temp_combo = tuple(sorted(temp_combo))
                    if check_combo(temp_combo, test_item) is True:
                        test_combo = check_combo(temp_combo, test_item)
                        combo = temp_combo
            if test_combo is True:
                item_abrevs[combo] = test_item
                abrevs_list.append(combo)
                combo_found = True
                if ' ' not in test_item:
                    combo = combo + (' ',)
                    combo = tuple(sorted(combo))
                    test_combo_spc = check_combo(combo, test_item)
                    if test_combo_spc is True:
                        item_abrevs[combo] = test_item + ' '
                        abrevs_list.append(combo)
                        final_combo = combo
                sim_item_found = True
                break
            if len(temp_dict.keys()) == 0:
                sim_item_found = False
    return sim_item_found

In [5]:
def check_combo(combo, item):
    check = True
    difficult_combo = False
    letter_groups = []
    letter_positions = {}
    for let in combo:
        if usable_keys[let]['group'] in letter_groups:
            check = False
        group_num = usable_keys[let]['group']
        letter_groups.append(group_num)
        letter_positions[group_num] = (usable_keys[let]['row'], usable_keys[let]['column'])
    if len(combo) == 2:
        let1 = combo[0]
        let2 = combo[1]
        check_set1 = let1+let2
        check_set2 = let2+let1
        if check_set1 in shinglesk2 or check_set2 in shinglesk2:
            check = False
        elif combo in abrevs_list:
            check = False
        elif " " in combo:
            check = False
        elif 8 in letter_groups and 4 in letter_groups:
            if letter_positions[4][0] != letter_positions[8][0]:
                check = False
                difficult_combo = True
        elif 9 in letter_groups and 5 in letter_groups:
            if letter_positions[5][0] != letter_positions[9][0]:
                check = False
                difficult_combo = True
    elif len(combo) < 2:
        check = False
    elif len(combo) >= 3:
        if check is True:
            #ban combinations that stretch fingers too far
            if 8 in letter_groups:
                if 4 in letter_groups:
                    if 6 not in letter_groups:
                        #if (letter_positions[4][0] - letter_positions[8][0]) > 1:
                        if letter_positions[4][0] != letter_positions[8][0]:
                            check = False
                            difficult_combo = True
                if 6 in letter_groups:
                    if ((letter_positions[6][0] - letter_positions[8][0]) > 0 
                        or (letter_positions[6][0] - letter_positions[8][0]) < -1):
                        check = False
                        difficult_combo = True
            if 6 in letter_groups:
                if 4 in letter_groups:
                    if ((letter_positions[4][0] - letter_positions[6][0]) > 0 
                        or (letter_positions[4][0] - letter_positions[6][0]) < -1):
                        check = False
                        difficult_combo = True
            if 2 in letter_groups:
                if 4 in letter_groups:
                    p = letter_positions[2][0]
                    q = letter_positions[4][0]
                    #dist = math.dist(p, q)
                    #if dist > 1.5:
                    if p == 0 and q == 2:
                        check = False
                        difficult_combo = True
                if 6 in letter_groups:
                    p = letter_positions[2][0]
                    q = letter_positions[6][0]
                    #dist = math.dist(p, q)
                    #if dist > 2.1:
                    if p == 0 and q == 2:
                        check = False
                        difficult_combo = True
                
            if 9 in letter_groups:
                if 5 in letter_groups:
                    if 7 not in letter_groups:
                        #if (letter_positions[5][0] - letter_positions[9][0]) > 1:
                        if letter_positions[5][0] != letter_positions[5][0]:
                            check = False
                            difficult_combo = True
                if 7 in letter_groups:
                    if ((letter_positions[7][0] - letter_positions[9][0]) > 0 
                        or (letter_positions[7][0] - letter_positions[9][0]) < -1):
                        check = False
                        difficult_combo = True
            if 7 in letter_groups:
                if 5 in letter_groups:
                    if ((letter_positions[5][0] - letter_positions[7][0]) > 1 
                        or (letter_positions[5][0] - letter_positions[7][0]) < -1):
                        check = False
                        difficult_combo = True
            if 3 in letter_groups:
                if 5 in letter_groups:
                    p = letter_positions[3][0]
                    q = letter_positions[5][0]
                    #dist = math.dist(p, q)
                    #if dist > 1.5:
                    if p == 0 and q == 2:
                        check = False
                        difficult_combo = True
                if 7 in letter_groups:
                    p = letter_positions[3][0]
                    q = letter_positions[7][0]
                    #dist = math.dist(p, q)
                    #if dist > 2.1:
                    if p == 0 and q == 2:
                        check = False
                        difficult_combo = True
                        
        """if check is True:  
            #reserve 1 letter use for 2 letter words
            if len(combo) == 3 and ('KC_BSPC' in combo and ' ' in combo):
                if len(item) != 3:
                    check = False
                if item[-1] != ' ':
                    check = False
            if combo in abrevs_list:
                check = False"""
    if difficult_combo is True:
        item_abrevs[combo] = item
        abrevs_list.append(combo)
        if ' ' not in item:
            combo = combo + (' ',)
            combo = tuple(sorted(combo))
            if combo not in abrevs_list:
                item_abrevs[combo] = item + ' '
                abrevs_list.append(combo)
    return check

In [6]:
ngrams1 = {}
ngrams2 = {}
ngrams3 = {}
ngrams4 = {}
ngrams5 = {}
shingles = {}
shinglesk2 = set()
letter_ranks = {}
shingle_words = []

prefix_list = ['anti','de','dis','en','em','fore','in','im','il','ir','inter','mid','mis','mal','non','over','pre',
              're','semi','sub','super','trans','un','under']
suffix_list = ['able','ible','al','ial','ed','en','er','est','ful','ic','ing','ion','tion','ation','ition','ity',
              'ty','ive','ative','itive','less','ly','ment','ness','ous','eous','ious','es']

allow_space = False
#get_gram_dict('1grams_english.csv', ngrams1, False, 1)
get_gram_dict('data/enwiki-20150602-words-frequency-edit.csv', ngrams1, False, 1)
allow_space = True
get_gram_dict('data/2grams_english.csv', ngrams2, True, .009)
get_gram_dict('data/3grams_english.csv', ngrams3, True, .009)
get_gram_dict('data/4grams_english.csv', ngrams4, True, .009)
get_gram_dict('data/5grams_english.csv', ngrams5, True, .009)

allow_space = False
#for word, freq in ngrams1.items():
for word in shingle_words:
    freq = ngrams1[word]
    for let in word:
        let = let.lower()
        if let in letter_ranks:
            letter_ranks[let] += freq
        else:
            letter_ranks[let] = freq
    if len(word) > 3:
        word_len = len(word)
        for start in range(word_len-1):
            end = start + 2
            shingle = word[start:end].lower()
            shinglesk2.add(shingle)
            #allow approved 2 char suffix with space added
            if shingle in suffix_list:
                shingle = shingle + ' '
                if shingle in shingles:
                    if shingle in ngrams1.keys():
                        shingles[shingle] += freq * shingle_w
                    else:
                        shingles[shingle] += freq * shingle_w
                else:
                    if shingle in ngrams1.keys():
                        shingles[shingle] = freq * shingle_w
                    else:
                        shingles[shingle] = freq * shingle_w
        for k in range(3, word_len):
            shingle_count = word_len - k
            for start in range(shingle_count+1):
                end = start + k
                shingle = word[start:end].lower()
                if shingle[0] != ' ':
                    #limit to approved prefix/suffix lists
                    if shingle in prefix_list or shingle in suffix_list:
                        if shingle in shingles:
                            if shingle in ngrams1.keys():
                                shingles[shingle] += freq * shingle_w
                            else:
                                shingles[shingle] += freq * shingle_w
                        else:
                            if shingle in ngrams1.keys():
                                shingles[shingle] = freq * shingle_w
                            else:
                                shingles[shingle] = freq * shingle_w

candidate_dict = {**ngrams1, **ngrams2, **ngrams3, **ngrams4, **ngrams5}
for shingle, freq in shingles.items():
    if shingle in candidate_dict:
        candidate_dict[shingle] += freq
    else:
        candidate_dict[shingle] = freq

#candidate_scores = {}
candidate_scores = candidate_dict
#found that giving more points to long words isn't as good
"""for key, val in candidate_dict.items():
    if len(key) > 2:
        score = val * (len(key)-1)
        #score = val
        candidate_scores[key] = score
    #increase val for len 2
    elif len(key) == 2:
        score = val * (len(key) * 1)
        candidate_scores[key] = score"""

max_score = max(candidate_scores.values())
for key, val in candidate_scores.items():
    candidate_scores[key] = val / max_score

In [7]:
"""ngrams1f = {}
ngrams2f = {}
ngrams3f = {}
ngrams4f = {}
ngrams5f = {}
shingles = {}
#we can just use the first dataset to find letters that are never adjcent
#shinglesk2 = set()
letter_ranks = {}

allow_space = False
get_gram_dict('data/1grams_english-fiction.csv', ngrams1f, False, 1)
allow_space = True
get_gram_dict('data/2grams_english-fiction.csv', ngrams2f, True, 1)
get_gram_dict('data/3grams_english-fiction.csv', ngrams3f, True, 1)
get_gram_dict('data/4grams_english-fiction.csv', ngrams4f, True, 1)
get_gram_dict('data/5grams_english-fiction.csv', ngrams5f, True, 1)

allow_space = False
for word, freq in ngrams1f.items():
    for let in word:
        let = let.lower()
        if let in letter_ranks:
            letter_ranks[let] += freq
        else:
            letter_ranks[let] = freq
    if len(word) > 3:
        word_len = len(word)
        for k in range(3, word_len):
            shingle_count = word_len - k
            for start in range(shingle_count+1):
                end = start + k
                shingle = word[start:end].lower()
                if shingle[0] != ' ':
                    if shingle in shingles:
                        if shingle in ngrams1.keys():
                            #shingles[shingle] += freq
                            shingles[shingle] += freq * shingle_w
                        else:
                            shingles[shingle] += freq * shingle_w
                    else:
                        if shingle in ngrams1.keys():
                            #shingles[shingle] = freq
                            shingles[shingle] = freq * shingle_w
                        else:
                            shingles[shingle] = freq * shingle_w
      
                            
candidate_dict2 = {**ngrams1f, **ngrams2f, **ngrams3f, **ngrams4f, **ngrams5f}
for shingle, freq in shingles.items():
    if shingle in candidate_dict2:
        candidate_dict2[shingle] += freq
    else:
        candidate_dict2[shingle] = freq

candidate_scores2 = {}
for key, val in candidate_dict2.items():
    if len(key) > 2:
        #score = (len(key)-1) * val
        score = val
        candidate_scores2[key] = score
    #increase val for len 2
    elif len(key) == 2:
        score = (len(key) - 1) * val
        #score = (len(key)*.8) * val
        candidate_scores2[key] = score

max_score = max(candidate_scores2.values())
for key, val in candidate_scores2.items():
    if key not in candidate_scores.keys():
        candidate_scores[key] = val / max_score * 0.5
    else:
        candidate_scores[key] = (candidate_scores[key] * 0.5) + ((val / max_score) * 0.5)

for key in candidate_scores.keys():
    if key not in candidate_scores2.keys():
        candidate_scores[key] = candidate_scores[key] * 0.5"""

"ngrams1f = {}\nngrams2f = {}\nngrams3f = {}\nngrams4f = {}\nngrams5f = {}\nshingles = {}\n#we can just use the first dataset to find letters that are never adjcent\n#shinglesk2 = set()\nletter_ranks = {}\n\nallow_space = False\nget_gram_dict('1grams_english-fiction.csv', ngrams1f, False, 1)\nallow_space = True\nget_gram_dict('2grams_english-fiction.csv', ngrams2f, True, 1)\nget_gram_dict('3grams_english-fiction.csv', ngrams3f, True, 1)\nget_gram_dict('4grams_english-fiction.csv', ngrams4f, True, 1)\nget_gram_dict('5grams_english-fiction.csv', ngrams5f, True, 1)\n\nallow_space = False\nfor word, freq in ngrams1f.items():\n    for let in word:\n        let = let.lower()\n        if let in letter_ranks:\n            letter_ranks[let] += freq\n        else:\n            letter_ranks[let] = freq\n    if len(word) > 3:\n        word_len = len(word)\n        for k in range(3, word_len):\n            shingle_count = word_len - k\n            for start in range(shingle_count+1):\n             

In [8]:
print(len(candidate_scores))

106871


In [9]:
combo_found = False
import time
time_start = time.time()

#get items from forced user preferances
abrevs_list = []
best_key_dict = {}
for key in item_abrevs.keys():
    abrevs_list.append(key)

for item in item_add:
    candidate_scores[item] = item_add_weight
sorted_candidate_scores = sorted(candidate_scores.items(), key=lambda x:x[1], reverse=True)

#remove shingle that are lower ranks than the top word/longer shingle
temp_items = list()
temp_sorted_candidate_scores = sorted_candidate_scores.copy()
for full_item in temp_sorted_candidate_scores:
    item = full_item[0]
    if item in ngrams1.keys():
        continue
    item_found = False
    if len(temp_items) == 0:
        temp_items.append(item)
    elif len(item) > 2:
        for temp_item in temp_items:
            if item in temp_item:
                del sorted_candidate_scores[sorted_candidate_scores.index(full_item)]
                item_found = True
                break
        if item_found is False:
            temp_items.append(item)       
temp_sorted_candidate_scores = None
temp_items = None

abrevs_list = []
for key in item_abrevs.keys():
    abrevs_list.append(key)

letter_ranks[' '] = -99999999
#letter_ranks[special_key] = -99999999
letter_ranks[','] = -99999999
letter_ranks["'"] = -99999999
last_combo = 0
for items in sorted_candidate_scores:
    item = items[0]
    if len(item) == 2:
        item = item + ' '
    
    #similar_find = False
    #if len(abrevs_list) <= combo_list_limit:
        #check_hist = get_similar_item(item)

    #check if new item is just part of a higher scored existing item, ex. if we already have 'between' then ignore 'etween'
    used = False
    #remove items contained within a more frequent item
    for old_item in item_abrevs.values():
        if old_item == item:
            used = True             
    if len(item) > 2 and used is False:
        let_rank_dict = {}
        temp_dict = {}
        temp_groups = []
        f_letter_use = True
        combo_len = 7
        special_key_use = False
        char2_attempt = False
        f_letter = item[0].lower()
        if f_letter in usable_keys.keys():
            f_letter_group = usable_keys[f_letter]['group']
            temp_groups.append(f_letter_group)
            word_start = False
            for letter in item[1:]:
                letter = letter.lower()
                if letter in usable_keys.keys() and letter != f_letter:
                    letter_group = usable_keys[letter]['group']
                    if letter_group != f_letter_group:
                        if word_start is True:
                            let_rank_dict[letter] = -9999
                            if special_key is not None and special_key_use is False:
                                #let_rank_dict[special_key] = letter_ranks[special_key]
                                #combo_len += 1
                                special_key_use = True
                        if letter == ' ':
                            let_rank_dict[' '] = letter_ranks[letter]
                            word_start = True
                        else:
                            word_start = False
                            let_rank_dict[letter] = letter_ranks[letter]
            if len(let_rank_dict) < 8:
                combo_len = len(let_rank_dict)
        combo_found = False
        first_attept = True
            
        if len(let_rank_dict.keys()) == 0:
            combo_found = True
        while combo_found is False:
            if 50000 <= len(abrevs_list) <= 50010:
                print('abrevs_list 50000')
            if 500000 <= len(abrevs_list) <=  500010:
                print('abrevs_list 500000')
            if len(abrevs_list) >= combo_list_limit:
                combo_found = True
                if last_combo < 5:
                    last_combo += 1
                    print('last item:', item)
            else:
                if len(temp_dict) == 0:
                    if combo_len == 1 and f_letter_use is True:
                        f_letter_use = False
                        combo_len = 7
                        if len(let_rank_dict) < 8:
                            combo_len = len(let_rank_dict)
                        if char2_attempt is True:
                            combo_len = 2
                    elif combo_len <= 1 and special_key_use is False and special_key is not None:
                        f_letter_use = True
                        special_key_use = True
                        combo_len = 7
                        if len(let_rank_dict) < 8:
                            combo_len = len(let_rank_dict)
                        if char2_attempt is True:
                            combo_len = 2
                    elif ' ' not in item and combo_len == 1 and char2_attempt is False:
                        char2_attempt = True
                        f_letter_use = True
                        special_key_use = False
                        combo_len = 2
                        item = item + ' '
                        let_rank_dict[' '] = letter_ranks[' ']
                    elif combo_len == 1 and f_letter_use is False:
                        combo_found = True
                        #print('no combo for: ', item)
                        break
                    if combo_len > 1 or first_attept is True or (special_key_use is True and combo_len == 1):
                        temp_combos = combinations(list(let_rank_dict.keys()), combo_len)
                        combo_len = combo_len - 1
                        first_attept = False
                        for combo in temp_combos:
                            let_scores = 0
                            for let_score in combo:
                                let_scores += let_rank_dict[let_score]
                            temp_dict[combo] = let_scores
                    else:
                        combo_found = True
                        break
                try:
                    combo = min(temp_dict, key=temp_dict.get)
                except:
                    print('failed:', f_letter, item, temp_dict, combo_len)
                    #print(len(temp_dict))
                    #print(item_abrevs)
                del temp_dict[combo]
                
                if f_letter_use is True:
                    combo = (f_letter, *combo)
                if special_key_use is True:
                    combo = (special_key, *combo)
                combo = tuple(sorted(combo))
                check_res = False
                if combo not in abrevs_list:
                    check_res = check_combo(combo, item)
                #if item == 'here':
                    #print('here attempted:', combo, check_res, special_key_use)
                if check_res is True:
                    item_abrevs[combo] = item
                    abrevs_list.append(combo)
                    combo_found = True
                    best_key_dict[item] = [temp_dict, combo_len, f_letter_use, special_key_use, char2_attempt, 
                                           f_letter, let_rank_dict]
                    if ' ' not in item:
                        combo = combo + (' ',)
                        combo = tuple(sorted(combo))
                        if combo not in abrevs_list:
                            item_abrevs[combo] = item + ' '
                            abrevs_list.append(combo)

#print(item_abrevs)
time_end = time.time()
time_run = time_end - time_start
print(time_run)

last item: worthington
last item: traverses
last item: midget
last item: meteorite
last item: walpole
1813.5567581653595


In [10]:
repeat_items = None
attempt_count = 10


while attempt_count > 0:
    attempt_count = attempt_count - 1
    best_key_dict_updt = {}
    for item, last_data in best_key_dict.items():
        combo_found = False
        temp_dict = last_data[0]
        combo_len = last_data[1]
        f_letter_use = last_data[2]
        special_key_use = last_data[3]
        char2_attempt = last_data[4]
        f_letter = last_data[5]
        let_rank_dict = last_data[6]
        while combo_found is False:
            #if len(abrevs_list) >= combo_list_limit:
                #combo_found = True
            #else:
            if len(temp_dict) == 0:
                if combo_len == 1 and f_letter_use is True:
                    f_letter_use = False
                    combo_len = 7
                    if len(let_rank_dict) < 8:
                        combo_len = len(let_rank_dict)
                    if char2_attempt is True:
                        combo_len = 2
                elif combo_len <= 1 and special_key_use is False and special_key is not None:
                    f_letter_use = True
                    special_key_use = True
                    combo_len = 7
                    if len(let_rank_dict) < 8:
                        combo_len = len(let_rank_dict)
                    if char2_attempt is True:
                        combo_len = 2
                elif ' ' not in item and combo_len == 1 and char2_attempt is False:
                    char2_attempt = True
                    f_letter_use = True
                    special_key_use = False
                    combo_len = 2
                    item = item + ' '
                    let_rank_dict[' '] = letter_ranks[' ']
                elif combo_len == 1 and f_letter_use is False:
                    combo_found = True
                    #print('no combo for: ', item)
                    break
                if combo_len > 1 or first_attept is True or (special_key_use is True and combo_len == 1):
                    temp_combos = combinations(list(let_rank_dict.keys()), combo_len)
                    combo_len = combo_len - 1
                    first_attept = False
                    for combo in temp_combos:
                        let_scores = 0
                        for let_score in combo:
                            let_scores += let_rank_dict[let_score]
                        temp_dict[combo] = let_scores
                else:
                    combo_found = True
                    break
            try:
                combo = min(temp_dict, key=temp_dict.get)
            except:
                print('failed:', f_letter, item, temp_dict, combo_len)
                #print(len(temp_dict))
                #print(item_abrevs)
            del temp_dict[combo]

            if f_letter_use is True:
                combo = (f_letter, *combo)
            if special_key_use is True:
                combo = (special_key, *combo)
            combo = tuple(sorted(combo))
            check_res = False
            if combo not in abrevs_list:
                check_res = check_combo(combo, item)
            #if item == 'here':
                #print('here attempted:', combo, check_res, special_key_use)
            if check_res is True:
                item_abrevs[combo] = item
                abrevs_list.append(combo)
                combo_found = True
                best_key_dict_updt[item] = [temp_dict, combo_len, f_letter_use, special_key_use, char2_attempt, 
                                            f_letter, let_rank_dict]
                if ' ' not in item:
                    combo = combo + (' ',)
                    combo = tuple(sorted(combo))
                    if combo not in abrevs_list:
                        item_abrevs[combo] = item + ' '
                        abrevs_list.append(combo)

    best_key_dict = best_key_dict_updt.copy()
    print('starting attempt:', attempt_count, '- new combos found:', len(best_key_dict.keys()))


#print(item_abrevs)
time_end = time.time()
time_run = time_end - time_start
print(time_run)

starting attempt: 9 - new combos found: 2838
starting attempt: 8 - new combos found: 1783
starting attempt: 7 - new combos found: 1259
starting attempt: 6 - new combos found: 874
starting attempt: 5 - new combos found: 658
starting attempt: 4 - new combos found: 503
starting attempt: 3 - new combos found: 412
starting attempt: 2 - new combos found: 324
starting attempt: 1 - new combos found: 268
starting attempt: 0 - new combos found: 230
4818.0340168476105


In [11]:
item_abrevs_copy = item_abrevs.copy()
for k, v in item_abrevs_copy.items():
    #if len(abrevs_list) >= combo_list_limit:
        #break
    if v[-1] != ' ':
        combo = k + (',',)
        combo = tuple(sorted(combo))
        item = v + '. '
        if check_combo(combo, item) is True:
            item_abrevs[combo] = item
            abrevs_list.append(combo)
        combo = k + ('.',)
        combo = tuple(sorted(combo))
        item = v + '. '
        if check_combo(combo, item) is True:
            item_abrevs[combo] = item
            abrevs_list.append(combo)
        combo = k + (';',)
        combo = tuple(sorted(combo))
        item = v + '; '
        if check_combo(combo, item) is True:
            item_abrevs[combo] = item
            abrevs_list.append(combo)

i = 0

with open(output_f, "w") as f:
    for key in abrevs_list:
        val = item_abrevs[key]
        i += 1
        combo_id = "0000000" + str(i)
        trim_len = len(combo_id) - 7
        combo_id = "CB" + combo_id[trim_len:]
        combo_keys = ""
        for key_entry in key:
            code = usable_keys[key_entry]['kc']
            combo_keys = combo_keys + code + ", "
        combo_keys = combo_keys[:-2]
        line = "SUBS(" + combo_id + ',  "' + val + '",  ' + combo_keys + ")"
        #print(line)
        f.write(line)
        f.write('\n')