# Introduction
This file creates the word2vec model for the papers.

In [1]:
from pymongo import MongoClient
import pymongo

import re

import nltk
from nltk.corpus import stopwords

import pandas as pd
import numpy as np

# gensim
from gensim import corpora, models, similarities, matutils
import gensim

# sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import plotly as py
import plotly.graph_objs as go
from plotly import offline
py.offline.init_notebook_mode(connected=True)

import colorlover as cl

import pickle

In [2]:
packages = (('Pymongo', pymongo), ('NLTK', nltk), ('Gensim', gensim),
           ('Regex', re),('Plotly',py))

for package in packages:
    print('{0} version: {1}'.format(package[0],package[1].__version__))
    
!Python -V

Pymongo version: 3.2.2
NLTK version: 3.2.5
Gensim version: 3.3.0
Regex version: 2.2.1
Plotly version: 2.2.3
Python 3.6.4 :: Anaconda custom (64-bit)


# Get Subset of Data From Mongo

In [3]:
client = MongoClient(port=12345) # this is the port set by the SSH tunnel
db = client.research_papers
db.collection_names()

['cs_papers', 'math_papers', 'stat_papers']

# Combining Individual Topic Articles

In [5]:
topic_of_interest = 'math'

databases = {'cs':db.cs_papers.find(), 'stat':db.stat_papers.find(), 'math':db.math_papers.find()}

# to pull all papers in the same area
stat_subj = re.compile(r'stat\.|math\.pr|math\.st')
math_subj = re.compile(r'math\.')
cs_subj = re.compile(r'cs\.')
topic_labels = {'cs':cs_subj, 'stat':stat_subj, 'math':math_subj}

all_papers = []

for topic in databases.keys():
    
    database = databases[topic]
    subset = 0
    
    current_paper = database

    for pape in current_paper:    

        subjects = pape['subject']
        
        in_topic = False
        topic = topic_labels[topic_of_interest]
        
        for subj in subjects:
            if re.search(topic, subj):
                in_topic = True
            
            
        if in_topic:
            article = pape['article']
            if article:
                if len(article) > 5000:
                    all_papers.append(pape)
                    subset+=1
                
print('There are a total of {0} articles.'.format(len(all_papers)))

There are a total of 1132 articles.


# Cleaning!!

The word cid it the top word in all classes and seems to stand for random mathematical things, like: fractions, matrices, not equal to, and the box at the end of a proof to signify it is the end of the proof.  Because of this I will take it out of the set.

Also ligatures appear a lot in the translation, the following set was created from the Wikipedea page on ligatures.

In [6]:
ligatures = {'Ꜳ':'AA', 'ꜳ':'aa', 'Æ':'AE', 'æ':'ae', 'Ꜵ':'AO', 'ꜵ':'ao',
            'Ꜷ':'AU', 'ꜷ':'au', 'Ꜹ':'AV', 'ꜹ':'av', 'Ꜻ':'AV', 'ꜻ':'av',
            'Ꜽ':'AY', 'ꜽ':'ay', '🙰':'et', 'ﬀ':'ff', 'ﬃ':'ffi', 'ﬄ':'ffl', 
            'ﬁ':'fi', 'ﬂ':'fl', 'Œ':'OE', 'œ':'oe', 'Ꝏ':'OO', 'ꝏ':'oo', 
            'ﬆ':'st', 'Ꜩ':'TZ', 'ꜩ':'tz', 'ᵫ':'ue', 'Ꝡ':'VY', 'ꝡ':'vy'}

for lig in ligatures:
    re_lig = re.compile(lig)

    for pape in all_papers:
        pape['article'] = re.sub(re_lig, ligatures[lig], pape['article'])

In [7]:
re_cid = re.compile('cid')
et_al_cid = re.compile('et al')

for pape in all_papers:
    pape['article'] = re.sub(re_cid, ' ', pape['article'])
    pape['article'] = re.sub(et_al_cid, '', pape['article'])

# Word2Vec

Computer Science: 11,743,082 words

Statistics: 7,794,464 words

Mathematics: 9,544,541 words

In [8]:
all_text = [pape['article'] for pape in all_papers]

In [9]:
total_word_count = 0
for text in all_text:
    total_word_count += len(text.split())

print('There are {0} words total.'.format(total_word_count))

There are 9544541 words total.


## Minor Clean Up

In [10]:
for index, text in enumerate(all_text):
    cv = re.compile('crossvalidation')
    quote = re.compile('”')
    
    all_text[index] = re.sub(cv,'cross validation',text)
    all_text[index] = re.sub(quote,'',text)

## Set Up

In [11]:
stoplist = stopwords.words('english')

texts = [[word for word in pape.split() if word not in stoplist] for pape in all_text]

In [14]:
def ngrams(input_list,n):
    '''
        Finds all n-groupings of word in text.  Returns
        a string with word1_word2_...
    '''
    ngrams = list(zip(*[input_list[i:] for i in range(n)]))
      
    return [*map('_'.join, ngrams)]

In [15]:
texts_trigrams = []
for text in texts:
    texts_trigrams.append(text + ngrams(text,2) + ngrams(text,3))

In [31]:
model = gensim.models.Word2Vec(texts_trigrams, size=100, window=10, min_count=30, workers=4)

2018-03-07 21:30:47,496 : INFO : collecting all words and their counts
2018-03-07 21:30:47,496 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-03-07 21:30:54,616 : INFO : collected 7289546 word types from a corpus of 18566082 raw words and 1132 sentences
2018-03-07 21:30:54,616 : INFO : Loading a fresh vocabulary
2018-03-07 21:30:56,940 : INFO : min_count=30 retains 32662 unique words (0% of original 7289546, drops 7256884)
2018-03-07 21:30:56,941 : INFO : min_count=30 leaves 7436304 word corpus (40% of original 18566082, drops 11129778)
2018-03-07 21:30:57,038 : INFO : deleting the raw counts dictionary of 7289546 items
2018-03-07 21:30:57,152 : INFO : sample=0.001 downsamples 29 most-common words
2018-03-07 21:30:57,153 : INFO : downsampling leaves estimated 6812659 word corpus (91.6% of prior 7436304)
2018-03-07 21:30:57,286 : INFO : estimated required memory for 32662 words and 100 dimensions: 42460600 bytes
2018-03-07 21:30:57,287 : INFO : resetting

In [12]:
if topic_of_interest == 'math':
    filename = 'Word2Vec_Math.pkl'
elif topic_of_interest == 'stat':
    filename = 'Word2Vec_Stats.pkl'
else:
    filename = 'Word2Vec_CS.pkl'
    
# with open(filename,'wb') as pickle_out:
#     pickle.dump(model, pickle_out)

with open(filename,'rb') as pickle_in:
    model = pickle.load(pickle_in)

In [13]:
words_wv = list(model.wv.vocab.items())
words_wv[:10]

[('institute', <gensim.models.keyedvectors.Vocab at 0x1a441062e8>),
 ('technology', <gensim.models.keyedvectors.Vocab at 0x1a44106be0>),
 ('cambridge', <gensim.models.keyedvectors.Vocab at 0x1a44106dd8>),
 ('email', <gensim.models.keyedvectors.Vocab at 0x1a4410a5c0>),
 ('c', <gensim.models.keyedvectors.Vocab at 0x1a44108198>),
 ('e', <gensim.models.keyedvectors.Vocab at 0x1a44108080>),
 ('n', <gensim.models.keyedvectors.Vocab at 0x1a4410e668>),
 ('h', <gensim.models.keyedvectors.Vocab at 0x1a4410ceb8>),
 ('v', <gensim.models.keyedvectors.Vocab at 0x1a4410c0b8>),
 ('x', <gensim.models.keyedvectors.Vocab at 0x1a4410ce48>)]

In [14]:
interesting_words = ['ab_testing', 'accuracy', 'activation_function', 'adaboost', 'adaptive_boosting', 'adjusted', 
                     'agglomerative', 'aggregate', 'aggregates', 'aggregating', 'analysis_pca', 'analyst', 
                     'auc', 'average_linkage', 'backpropagation', 'bag_of_words', 'bag_words', 'bagging', 'batch', 
                     'bayesian', 'belief', 'bernoulli', 'bias', 'biased', 'big_data', 'binomial', 
                     'boost', 'bootstrap', 'brute_force', 'cart', 'classification', 'classifier', 'cluster', 'cod', 
                     'complete_linkage', 'complexity', 'component_analysis', 'computational', 'concavity', 
                     'conditional_distribution', 'confidence', 'confusion_matrix', 'consistency', 'constrained', 
                     'convex', 'convex_optimization', 'correlated', 'correlation', 'cosine_distance', 
                     'cosine_similarity', 'cost_function', 'coupling', 'cross_validate', 'cross_validation', 
                     'curse_dimensionality', 'curse_of_dimensionality', 'data', 'data_engineering', 'data_mining', 
                     'data_processing', 'data_set', 'dataset', 'dbscan', 'decision_boundary', 'decision_tree', 
                     'deep_learning', 'derive', 'deterministic', 'dimensionality', 'dimensionality_reduction', 
                     'dirichlet', 'discrete', 'downsampled', 'dx', 'eigenvalue', 'eigenvector', 'elastic_net',
                     'engineer', 'ensemble', 'entropy', 'euclidean_distance', 'f1', 'f1_score', 'f_score', 
                     'factorization', 'fbeta_score', 'feature_extraction', 'feature_representation', 
                     'feature_selection', 'feature_space', 'feature_vector', 'features', 'feedforward', 'fold', 
                     'fpr', 'fscore', 'functionality', 'gaussian', 'gaussian_model', 'gaussians', 'general_model', 
                     'generalized_linear_model', 'generative', 'gradient', 'gradient_boosting', 'gradient_descent', 
                     'greedy', 'hidden_layer', 'hierarchical', 'hierarchical_agglomerative_clustering', 
                     'hierarchical_clustering', 'high_confidence', 'inertia', 'information_entropy', 
                     'interconnected', 'interpolation', 'interpretability', 'jaccard_distance', 'jacobian', 
                     'k_iterations', 'k_means', 'kernel', 'kmeans', 'knn', 'l_norm', 'l_regularization', 
                     'labeled_data', 'lambda', 'language_processing', 'lasso', 'latent_dirichlet_allocation', 
                     'lda', 'learning_algorithms', 'learning_rate', 'likelihood', 'likelihood_estimation', 
                     'linear_approximation', 'linear_combination', 'linear_model', 'linear_regression', 
                     'linear_term', 'linearly_dependent', 'linearly_independent', 'linkage', 'log_e', 
                     'log_likelihood', 'log_odd', 'logarithm', 'logistic_regression', 'logit', 'loss_function', 
                     'machine_learning', 'manhattan_distance', 'markov_chain', 'mathematical_model', 'matrix', 
                     'matrix_factorization', 'maximum_likelihood', 'mean', 'mean_squared_error', 'measure', 'metric', 
                     'mini_batch', 'minibatch', 'minimization', 'missing_data', 'mle', 'model', 'model_based', 
                     'model_complexity', 'model_predictive', 'model_training', 'modeling', 'monte_carlo', 
                     'multiple_correspondence_analysis', 'n_gram', 'naive_bayes', 'natural_language_processing', 
                     'nearest_neighbor', 'nearestneighbor', 'negative_loglikelihood', 'neighborhood', 
                     'neural_network', 'ngram', 'nlp', 'node', 'nonlinear', 'nonlinearity', 'norm', 
                     'normalization', 'normalize', 'ols', 'optimal_solutions', 'optimization', 
                     'ordinary_least_squares', 'orthogonal', 'outlier', 'parameter_estimation', 'parameterization', 
                     'parametric_model', 'pattern_recognition', 'patterns', 'pca', 'performance_metric', 
                     'pipeline', 'poisson', 'polynomial_regression', 'pooling', 'posterior', 'precision', 
                     'prediction', 'predictive', 'predictor', 'principal_component', 
                     'principal_component_analysis', 'prior', 'apriori', 'probability_distribution', 'process', 
                     'projected', 'proof', 'prune', 'python', 'random_forest', 'random_variable', 'recall', 
                     'receiver_operating_characteristic', 'regression_model', 'regularization', 
                     'regularization_parameter', 'ridge_regression', 'roc', 'sampling', 'semisupervised', 
                     'sensitivity', 'sigmoid', 'simple_linear', 'simulation_results', 'single_linkage', 
                     'singular_value', 'singular_value_decomposition', 'skewed', 'space', 'sparse', 
                     'specificity', 'spectral_clustering', 'stack', 'statistical_model', 'stemming', 'step', 
                     'step_size', 'stochastic_gradient', 'stochastic_gradient_descent', 'stop_word', 
                     'stopping_criterion', 'subsampling', 'sufficient', 'supervised_learning', 'support_vector', 
                     'support_vector_machines', 'svd', 'svm', 'target', 'tend', 'test_data', 'threshold', 
                     'time_series', 'time_step', 'tokenization', 'tokenize', 'topic_modeling', 'total_variation', 
                     'tpr', 'train', 'trained_model', 'trained_models', 'training', 'training_data', 
                     'training_model', 'training_validation', 'trajectory', 'tree', 'treelike', 'tsne', 'unbiased', 
                     'uncorrelated', 'uniformly', 'unsupervised', 'unsupervised_learning', 'upsampling', 
                     'validate', 'validation', 'variance_reduction', 'vector_regression', 'visualization', 'ward', 
                     'ward_linkage', 'weight_vector', 'word_embedding', 'wordvec', 'zero']

interesting_words = np.array(interesting_words)

characters = ['𝜞', '𝝏', '∍', '𝛐', '𝝑', 'ι', 'Ν', '𝝌', '𝚪', 'Υ', '𝜑', 'Π', '𝜅', '𝜸', 'ϑ', '𝝆', '⎿', '𝛊', '𝜂', 
              '𝛘', '𝛻', '𝝄', '𝝁', '𝝘', '𝜉', '𝛌', 'β', '𝝐', '𝜷', '≤', '𝜃', '𝜹', 'Λ', '−∞', '∀', '𝛝', 'ς', '𝛿', '𝛏', 
              '∀', '𝝍', 'Ω', '𝛛', '𝝙', '𝛄', '𝛚', '∈', '𝛞', '≪', '⇏', '𝛾', '𝝊', '⊕', 'ǫ', 'θ', ' ', '∞', 'ζ', '𝜌', 
              '∅', 'Γ', 'φ', '𝜿', '𝛽', '𝝕', '𝜚', '𝛟', '𝝓', 'ρ', 'λ', 'σ', '∪', '<', '𝛖', '𝛆', 'α', '𝛷', 'ο', 'χ', 
              '𝜊', '𝝈', '≩', '𝛍', '↔', '𝛡', 'η', 'Μ', '⟾', '⊤', 'wt', '𝜓', 'xi', '∴', '⊆', 'Η', '𝛃', 'Ζ', '𝜘', 
              '⤃', '𝛠', 'κ', 'ξ', 'ω', '≨', 'ℝ', '𝜕', 'Ξ', 'Ψ', '≈', 'τ', '𝛎', '𝛑', '𝝂', '𝜽', '𝛥', '⇔', '𝛇', '∬', 
              '⊃', '𝜒', '𝛂', '𝜙', '𝜼', 'δ', '𝜶', '𝛬', '>', '𝝒', '𝜈', '𝛈', '𝛹', 'Σ', 'ψ', '𝛋', '𝜏', 'Ι', '≡', 'μ', 
              'ν', 'Α', '𝝔', '𝝀', '𝜐', '⤇', '𝛔', '⊇', '⇒', '𝝃', '𝜻', '𝛜', '⊉', '⊈', '∉', '≃', 'st', 'Ρ', '𝛓', '𝜋', 
              '𝜆', 'Τ', '𝜇', '𝜀', 'µ', '𝜾', 'π', '∋', '⊂', '∫', 'Β', '⊅', 'Χ', '⊄', '𝜍', '∵', '𝜛', '±', '·', '𝛉', 
              '𝜁', '𝝎', '𝛙', '∮', '𝝉', 'γ', '𝛼', '∆', 'Δ', '𝜄', '𝛗', 'Ο', '∩', 'Κ', '𝜺', '𝜗', 'rm×n', '𝜎', 'Θ', 
              '∂', 'Φ', '∧', '𝜖', '𝛤', '𝝋', '𝝇', 'ε', 'Ε', 'υ', '∑', '𝛅', '𝛒', '≫', '𝝅', '𝛕' ]

characters = np.array(characters)

In [17]:
# look at all models and topics learned in class
model.wv.most_similar('cost_function', topn=10)

[('solve_problem', 0.9721465706825256),
 ('integrability_condition', 0.9716504812240601),
 ('rigid_body', 0.9677146673202515),
 ('prove_convergence', 0.9664644002914429),
 ('mean_variance', 0.9654554128646851),
 ('differential_operator', 0.965217113494873),
 ('cf_eg', 0.965144693851471),
 ('small_parameter', 0.9648418426513672),
 ('piecewise_constant', 0.964563250541687),
 ('point_process', 0.9642618894577026)]

In [18]:
model.wv.similarity('model', 'system')

0.5237479631530428

In [19]:
translate = ['𝜞', '𝝏', '∍', '𝛐', '𝝑', 'ι', 'Ν', '𝝌', '𝚪', 'Υ', '𝜑', 'Π', '𝜅', '𝜸', 'ϑ', '𝝆', '⎿', '𝛊', '𝜂', 
              '𝛘', '𝛻', '𝝄', '𝝁', '𝝘', '𝜉', '𝛌', 'β', '𝝐', '𝜷', '≤', '𝜃', '𝜹', 'Λ', '∀', '𝛝', 'ς', '𝛿', '𝛏', 
              '∀', '𝝍', 'Ω', '𝛛', '𝝙', '𝛄', '𝛚', '∈', '𝛞', '≪', '⇏', '𝛾', '𝝊', 'ǫ', 'θ', '∞', 'ζ', '𝜌', 
              '∅', 'Γ', 'φ', '𝜿', '𝛽', '𝝕', '𝜚', '𝛟', '𝝓', 'ρ', 'λ', 'σ', '∪', '<', '𝛖', '𝛆', 'α', '𝛷', 'ο', 'χ', 
              '𝜊', '𝝈', '≩', '𝛍', '↔', '𝛡', 'η', 'Μ', '⟾', '⊤' '𝜓', 'xi', '∴', '⊆', 'Η', '𝛃', 'Ζ', '𝜘', 
              '⤃', '𝛠', 'κ', 'ξ', 'ω', '≨', 'ℝ', '𝜕', 'Ξ', 'Ψ', '≈', 'τ', '𝛎', '𝛑', '𝝂', '𝜽', '𝛥', '⇔', '𝛇', '∬', 
              '⊃', '𝜒', '𝛂', '𝜙', '𝜼', 'δ', '𝜶', '𝛬', '>', '𝝒', '𝜈', '𝛈', '𝛹', 'Σ', 'ψ', '𝛋', '𝜏', 'Ι', '≡', 'μ', 
              'ν', 'Α', '𝝔', '𝝀', '𝜐', '⤇', '𝛔', '⊇', '⇒', '𝝃', '𝜻', '𝛜', '⊉', '⊈', '∉', '≃', 'st', 'Ρ', '𝛓', '𝜋', 
              '𝜆', 'Τ', '𝜇', '𝜀', 'µ', '𝜾', 'π', '∋', '⊂', '∫', 'Β', '⊅', 'Χ', '⊄', '𝜍', '∵', '𝜛', '±', '·', '𝛉', 
              '𝜁', '𝝎', '𝛙', '∮', '𝝉', 'γ', '𝛼', '∆', 'Δ', '𝜄', '𝛗', 'Ο', '∩', 'Κ', '𝜺', '𝜗', 'rm×n', '𝜎', 'Θ', 
              '∂', 'Φ', '∧', '𝜖', '𝛤', '𝝋', '𝝇', 'ε', 'Ε', 'υ', '∑', '𝛅', '𝛒', '≫', '𝝅', '𝛕' ]
for symbol in translate:
    if symbol in model.wv.vocab:
        similar = model.wv.most_similar(symbol, topn=10)
        
        print('The top 10 similar words to {0} are.'.format(symbol))
        print(pd.DataFrame(similar, columns = ['word','similarity score']))
        print()

The top 10 similar words to ι are.
           word  similarity score
0            ¯e          0.736334
1            ∼→          0.706884
2    involution          0.698789
3   epimorphism          0.696101
4      commutes          0.693671
5       functor          0.674511
6  monomorphism          0.666959
7     bialgebra          0.659829
8       sending          0.659221
9     bijection          0.653210

The top 10 similar words to ϑ are.
          word  similarity score
0       solves          0.621823
1           ∞ω          0.619247
2     fulfills          0.608248
3  subsolution          0.607115
4           uη          0.605175
5           ωt          0.603780
6           v·          0.600897
7           aψ          0.599313
8   fulfilling          0.596572
9           κt          0.593771

The top 10 similar words to β are.
  word  similarity score
0    α          0.670674
1   sβ          0.662106
2   hβ          0.637006
3   βc          0.633028
4   −β          0.630898
5   aβ

The top 10 similar words to · are.
           word  similarity score
0            o∆          0.540753
1       factors          0.487453
2       reduced          0.485936
3           nωx          0.476819
4     quasinorm          0.454436
5           rkv          0.450011
6    commutator          0.436760
7           −kx          0.436159
8            µg          0.431898
9  equivalently          0.431580

The top 10 similar words to γ are.
  word  similarity score
0   bγ          0.698846
1   γ′          0.670804
2   γ−          0.660261
3   hγ          0.645874
4   cγ          0.641119
5   pγ          0.627400
6   γγ          0.626574
7   fγ          0.626045
8   xγ          0.612566
9   gγ          0.605494

The top 10 similar words to ∆ are.
         word  similarity score
0          ∆−          0.576360
1          ∆′          0.557063
2  simplicial          0.497065
3          h∆          0.491847
4      kleene          0.489392
5          ∆k          0.485938
6       facet       

# PCA Plot

In [21]:
def pca_plot(model, word_set, title):
    labels = []
    tokens = []

    words_not_used = np.array([])
    
    for word in model.wv.vocab:
        if word in word_set:
            tokens.append(model[word])
            labels.append(word)
    
    words_not_used = np.setdiff1d(word_set,np.array(labels))
    
    
    pca = PCA(n_components=2) # Uses SVD
    new_values = pca.fit_transform(tokens)
    pca_ratio = pca.explained_variance_ratio_
    print(pca_ratio)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
        
    points = go.Scatter(
        x = x,
        y = y,
        mode = 'markers+text',
        text = labels,
        textposition = 'top'
    )

    layout = go.Layout(
        autosize=False,
        width=800,
        height=800,
        title=title,
        titlefont=dict(
            size=25
        ),
        xaxis=dict(
            showgrid=False,
            zeroline=True,
            showticklabels=False
        ),
        yaxis=dict(
            showgrid=False,
            zeroline=True,
            showticklabels=False
        )
    )

    data = [points]
    fig = go.Figure(data=data, layout=layout)

    py.offline.iplot(fig)
    
    return words_not_used

In [22]:
words_not_used = pca_plot(model, interesting_words,'Word2Vec: Interesting Words')
print('Words not in interesting list not used:')
print(', '.join(words_not_used))

[0.20706032 0.08863876]



Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



Words not in interesting list not used:
ab_testing, activation_function, adaboost, adaptive_boosting, agglomerative, aggregate, aggregates, aggregating, analysis_pca, analyst, auc, average_linkage, backpropagation, bag_of_words, bag_words, bagging, belief, big_data, boost, brute_force, cart, classifier, cod, complete_linkage, component_analysis, confusion_matrix, cosine_distance, cosine_similarity, cross_validate, cross_validation, curse_dimensionality, curse_of_dimensionality, data_engineering, data_mining, data_processing, dbscan, decision_boundary, decision_tree, deep_learning, dimensionality_reduction, downsampled, elastic_net, engineer, f1, f1_score, f_score, fbeta_score, feature_extraction, feature_representation, feature_selection, feature_space, feature_vector, feedforward, fpr, fscore, functionality, gaussian_model, gaussians, generalized_linear_model, generative, gradient_boosting, gradient_descent, hidden_layer, hierarchical_agglomerative_clustering, hierarchical_clustering,

In [25]:
chars_not_used = pca_plot(model, characters,'Word2Vec: Characters')
print('Characters not used:')
print(', '.join(chars_not_used))

[0.08868276 0.06068073]



Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



Characters not used:
 , <, >, Α, Β, Γ, Δ, Ε, Ζ, Η, Θ, Ι, Κ, Λ, Μ, Ν, Ξ, Ο, Π, Ρ, Σ, Τ, Υ, Φ, Χ, Ψ, Ω, μ, ο, ℝ, ⇏, ∉, ∍, ∫, ∬, ∮, ∴, ∵, ≨, ≩, ⊄, ⊅, ⊈, ⊉, ⊤, ⎿, ⟾, ⤃, ⤇, 𝚪, 𝛂, 𝛃, 𝛄, 𝛅, 𝛆, 𝛇, 𝛈, 𝛉, 𝛊, 𝛋, 𝛌, 𝛍, 𝛎, 𝛏, 𝛐, 𝛑, 𝛒, 𝛓, 𝛔, 𝛕, 𝛖, 𝛗, 𝛘, 𝛙, 𝛚, 𝛛, 𝛜, 𝛝, 𝛞, 𝛟, 𝛠, 𝛡, 𝛤, 𝛥, 𝛬, 𝛷, 𝛹, 𝛻, 𝛼, 𝛽, 𝛾, 𝛿, 𝜀, 𝜁, 𝜂, 𝜃, 𝜄, 𝜅, 𝜆, 𝜇, 𝜈, 𝜉, 𝜊, 𝜋, 𝜌, 𝜍, 𝜎, 𝜏, 𝜐, 𝜑, 𝜒, 𝜓, 𝜕, 𝜖, 𝜗, 𝜘, 𝜙, 𝜚, 𝜛, 𝜞, 𝜶, 𝜷, 𝜸, 𝜹, 𝜺, 𝜻, 𝜼, 𝜽, 𝜾, 𝜿, 𝝀, 𝝁, 𝝂, 𝝃, 𝝄, 𝝅, 𝝆, 𝝇, 𝝈, 𝝉, 𝝊, 𝝋, 𝝌, 𝝍, 𝝎, 𝝏, 𝝐, 𝝑, 𝝒, 𝝓, 𝝔, 𝝕, 𝝘, 𝝙
