In [1]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, mean_absolute_error
import numpy as np
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from collections import Counter
from scipy.sparse import csr_matrix

In [2]:
with open('dict.txt', 'r') as file:
    dictionary = file.read()

In [3]:
def count_vowel_groups(text):
    return sum(map(lambda w: any(map(lambda c: c in 'aeiouy', w)), text.lower().split()))

In [4]:
def get_windows(sequence, window_sizes=(1, 2)):
    result = []
    for window_size in range(window_sizes[0], window_sizes[-1] + 1):
        for i in range(len(sequence) - (window_size - 1)):
            result.append(sequence[i:i + window_size])
    return result

In [5]:
def to_one_hot(word, mapping):
    result = [0] * len(mapping)
    for window in get_windows(word.lower()):
        if window in mapping:
            result[mapping[window]] += 1
    return result

In [6]:
def get_feature_mapping(train_words):
    features = set()
    for word in train_words:
        for window in get_windows(word.lower()):
            features.add(window)
    mapping = {f: i for i, f in enumerate(sorted(features))}
    return mapping

In [7]:
def get_words_and_syllables(dictionary):
    words = []
    syllables = []
    for line in dictionary.split('\n'):
        word, pronunciation = line.split('\t')
        word = word.split('(')[0]
        syllable_count = count_vowel_groups(pronunciation)
        words.append(word)
        syllables.append(syllable_count)
    return words, syllables

In [8]:
words, syllables = get_words_and_syllables(dictionary=dictionary)

In [9]:
train_words, test_words, train_counts, test_counts = train_test_split(words, syllables, test_size=0.01, shuffle=True)

In [10]:
mapping = get_feature_mapping(train_words=train_words)

In [11]:
X_train = csr_matrix([to_one_hot(text, mapping) for text in train_words])
X_test = csr_matrix([to_one_hot(text, mapping) for text in test_words])

y_train = train_counts
y_test = test_counts

In [12]:
clf = MLPRegressor(hidden_layer_sizes=(10,), verbose=2)
clf.fit(X_train, y_train)

Iteration 1, loss = 0.21743553
Iteration 2, loss = 0.07597458
Iteration 3, loss = 0.06243816
Iteration 4, loss = 0.05738227
Iteration 5, loss = 0.05492322
Iteration 6, loss = 0.05350545
Iteration 7, loss = 0.05238199
Iteration 8, loss = 0.05163323
Iteration 9, loss = 0.05093290
Iteration 10, loss = 0.05049456
Iteration 11, loss = 0.04991418
Iteration 12, loss = 0.04935549
Iteration 13, loss = 0.04896825
Iteration 14, loss = 0.04848493
Iteration 15, loss = 0.04805448
Iteration 16, loss = 0.04762846
Iteration 17, loss = 0.04721635
Iteration 18, loss = 0.04682616
Iteration 19, loss = 0.04650857
Iteration 20, loss = 0.04615163
Iteration 21, loss = 0.04596195
Iteration 22, loss = 0.04567895
Iteration 23, loss = 0.04538243
Iteration 24, loss = 0.04522555
Iteration 25, loss = 0.04503857
Iteration 26, loss = 0.04474662
Iteration 27, loss = 0.04455790
Iteration 28, loss = 0.04449934
Iteration 29, loss = 0.04425565
Iteration 30, loss = 0.04411712
Iteration 31, loss = 0.04390123
Iteration 32, los

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=2, warm_start=False)

In [13]:
raw_pred = clf.predict(X_test)
pred = np.round(raw_pred)
print(accuracy_score(y_true=y_test, y_pred=pred))
print(mean_squared_error(y_true=y_test, y_pred=raw_pred))
print(mean_absolute_error(y_true=y_test, y_pred=raw_pred))
print(classification_report(y_true=y_test, y_pred=pred))
print(confusion_matrix(y_true=y_test, y_pred=pred))
print(abs(np.sum(y_test) - np.sum(pred)) / np.sum(pred))

0.8903080390683696
0.10314131582729456
0.19365153167673282
             precision    recall  f1-score   support

        0.0       0.00      0.00      0.00         0
        1.0       0.92      0.94      0.93       165
        2.0       0.93      0.93      0.93       624
        3.0       0.87      0.84      0.86       357
        4.0       0.79      0.83      0.81       129
        5.0       0.73      0.78      0.75        45
        6.0       0.75      0.60      0.67        10
        7.0       0.00      0.00      0.00         1

avg / total       0.89      0.89      0.89      1331

[[  0   0   0   0   0   0   0   0]
 [  1 155   8   1   0   0   0   0]
 [  0  11 582  31   0   0   0   0]
 [  0   1  35 300  21   0   0   0]
 [  0   0   1  12 107   9   0   0]
 [  0   1   0   0   8  35   1   0]
 [  0   0   0   0   0   4   6   0]
 [  0   0   0   0   0   0   1   0]]
0.00243605359317905


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [14]:
from sklearn_porter import Porter
porter = Porter(clf, language='js')
output = porter.export(embed_data=True)
print(len(output))
with open('syllable_counter.js', 'w') as out:
    out.write(output)

196840


In [15]:
word = None
while word != 'q':
    word = input('Enter a word:')
    guess = clf.predict([to_one_hot(word, mapping)])[0]
    print('{} has about {} syllables'.format(word, int(np.round(guess))))
    print('(guess: {:.2f})'.format(guess))

Enter a word:supercalifragilisticexpialidocious
supercalifragilisticexpialidocious has about 14 syllables
(guess: 14.19)
Enter a word:floccinaucinihilipilification
floccinaucinihilipilification has about 12 syllables
(guess: 12.01)
Enter a word:umonoultramicroscopicsilicovolcanoconiosis
umonoultramicroscopicsilicovolcanoconiosis has about 18 syllables
(guess: 17.82)
Enter a word:forty-two
forty-two has about 3 syllables
(guess: 3.18)
Enter a word:q
q has about 1 syllables
(guess: 1.01)
