<div class="alert alert-block alert-success">

# Plaut_Model/code/plaut_all_combinations

### Purpose
To generate every possible onset + vowel + coda combination from the Plaut grapheme sets, encode each combination as a grapheme vector, and save the result as a CSV dataset

### Date Created
March 27, 2020
***
### Revisions
* Mar 27, 2020 - File created with initial code


</div>

In [1]:
# import required libraries
import itertools
import numpy as np
import pandas as pd
import time

In [2]:
# Plaut grapheme inventories, one list per slot of a word (onset, vowel, coda).
# The position of a grapheme inside its list is the column it occupies in the
# grapheme vectors built further down, so the ordering must not be changed.

# onset slot
grapheme_onset = [
    # single letters
    'Y', 'S', 'P', 'T', 'K', 'Q', 'C', 'B', 'D', 'G',
    'F', 'V', 'J', 'Z', 'L', 'M', 'N', 'R', 'W', 'H',
    # multi-letter graphemes
    'CH', 'GH', 'GN', 'PH', 'PS', 'RH', 'SH', 'TH', 'TS', 'WH',
]

# vowel slot
grapheme_vowel = [
    # single letters
    'E', 'I', 'O', 'U', 'A', 'Y',
    # multi-letter graphemes
    'AI', 'AU', 'AW', 'AY', 'EA', 'EE', 'EI', 'EU', 'EW', 'EY', 'IE',
    'OA', 'OE', 'OI', 'OO', 'OU', 'OW', 'OY', 'UE', 'UI', 'UY',
]

# coda slot
grapheme_codas = [
    # single letters
    'H', 'R', 'L', 'M', 'N', 'B', 'D', 'G', 'C', 'X',
    'F', 'V', 'J', 'S', 'Z', 'P', 'T', 'K', 'Q',
    # multi-letter graphemes
    'BB', 'CH', 'CK', 'DD', 'DG', 'FF', 'GG', 'GH', 'GN', 'KS', 'LL',
    'NG', 'NN', 'PH', 'PP', 'PS', 'RR', 'SH', 'SL', 'SS', 'TCH', 'TH',
    'TS', 'TT', 'ZZ',
    # trailing entries kept in their original position at the end of the list
    'U', 'E', 'ES', 'ED',
]

In [3]:
# every (onset, vowel, coda) triple, in the order itertools.product yields them
combos = list(itertools.product(grapheme_onset, grapheme_vowel, grapheme_codas))

# spelling of each word: its three graphemes written back to back
orths = ["".join(parts) for parts in combos]

# a word is 'SIMP' only when all three graphemes are single letters,
# otherwise (any grapheme longer than one letter) it is 'CMPLX'
types = ['SIMP' if all(len(part) <= 1 for part in parts) else 'CMPLX'
         for parts in combos]

# one row per word, one column per slot, so each slot can be sliced out below
graphemes = np.array(combos)

# assemble the dataset; every word gets the same constant frequency of 2
n_words = len(orths)
dataset = pd.DataFrame(data={
    'orth': orths,
    'g_onset': graphemes[:, 0],
    'g_vowel': graphemes[:, 1],
    'g_coda': graphemes[:, 2],
    'type': types,
    'freq': np.full(n_words, 2),
    'log_freq': np.full(n_words, np.log(2)),
})

In [4]:
# grapheme sets, and the matching per-word grapheme column for each slot
grapheme_sets = [grapheme_onset, grapheme_vowel, grapheme_codas]
word_graphemes = [dataset['g_onset'], dataset['g_vowel'], dataset['g_coda']]

# Build one 0/1 matrix per slot (rows = words, columns = graphemes of that slot):
#   - for each grapheme in the slot's set
#       > find the words that have that grapheme in this slot
#       > flag the grapheme's column for those words (i.e. set it to 1)
#       > if it is a complex grapheme (more than one letter), also flag the
#         column of each individual letter that is itself a grapheme of the
#         same slot
#   - add the completed matrix to the list
# Finally concatenate the matrices side by side to obtain the grapheme
# vector of every word.
grapheme_codes = []
for g_set, wg in zip(grapheme_sets, word_graphemes):
    grapheme_code = np.zeros((len(wg), len(g_set)), dtype=int)

    for g_index, grapheme in enumerate(g_set):
        # Select rows with a plain boolean mask. The previous
        # np.argwhere(Series) call relied on the Series' own nonzero(), whose
        # availability and behaviour depend on the installed pandas version.
        has_grapheme = np.asarray(wg == grapheme)
        grapheme_code[has_grapheme, g_index] = 1

        # Only multi-letter graphemes have component letters to flag.
        # (This used to test len(grapheme_onset), the size of the onset list,
        # which is always > 1.)
        if len(grapheme) > 1:
            for sg in grapheme:
                # a component letter counts only if it is a grapheme of this
                # slot; letters that are not are skipped explicitly instead of
                # being hidden behind a bare except
                if sg in g_set:
                    grapheme_code[has_grapheme, g_set.index(sg)] = 1

    grapheme_codes.append(grapheme_code)

# one row per word: onset columns, then vowel columns, then coda columns
grapheme_codes = np.hstack(grapheme_codes)

  return bound(*args, **kwds)


In [5]:
# attach each word's grapheme vector (one row of grapheme_codes) as a new column
dataset['graphemes'] = [word_vector for word_vector in grapheme_codes]

In [6]:
# write the full table to disk; the row index is stored under the header 'word_id'
dataset.to_csv(path_or_buf=r'../dataset/all_comb/plaut_all_comb_base.csv',
               index_label='word_id')