In [22]:
import pandas as pd
import numpy as np

import seaborn as sns; sns.set(context='talk')
from matplotlib import pyplot as plt, style; style.use('fivethirtyeight')

import scattertext as st
import spacy

nlp = spacy.load('en')

In [24]:
df.head()

Unnamed: 0,name,primary_position,report,label,pos,parse
0,**Luke Heimlich,LHP,Heimlich is a Level 1 sex offender and wouldn'...,MiLB,SP,"(heimlich, is, a, level, 1, sex, offender, and..."
1,**Noah Song,RHP,The way teams value Song depends on whether or...,MiLB,SP,"(the, way, teams, value, song, depends, on, wh..."
2,A.J. Cole,RHP,"The Nationals have acquired Cole twice, first...",MLB,SP,"(the, nationals, have, acquired, cole, twice, ..."
3,A.J. Cole,RHP,Signed for an above-slot $2 million as a Natio...,MLB,SP,"(signed, for, an, above, -, slot, $, 2, millio..."
4,A.J. Cole,RHP,"It often takes time for those high-ceilinged,...",MLB,SP,"(it, often, takes, time, for, those, high, -, ..."


In [25]:
def fix_pos(x):
    if x in ['LF', 'CF', 'RF']:
        return 'OF'
    elif x == 'INF':
        return 'UTIL'
    elif x == 'DH':
        return '1B'
    
    return x


df = pd.read_csv('https://jacobdanovitch.blob.core.windows.net/datasets/twtc.csv', # 'https://github.com/jacobdanovitch/jdnlp/blob/master/datasets/twtc/twtc.csv?raw=true',
                usecols = ['name', 'primary_position', 'report', 'label', 'text'])
df = df[df.label != -1].reset_index(drop=True)


df['primary_position'] = df['primary_position'].apply(fix_pos)
df['label'] = df['label'].apply(lambda x: 'MLB' if x else 'MiLB')
df['pos'] = df['primary_position'].apply(lambda x: 'SP' if x.endswith('HP') else 'BAT')
df['parse'] = df['report'].apply(st.whitespace_nlp_with_sentences)
df['parse_processed'] = df['text'].apply(st.whitespace_nlp_with_sentences)

print(df.shape)
df.sample(1)

(7778, 8)


Unnamed: 0,name,primary_position,report,label,text,pos,parse,parse_processed
4672,Kyle Tucker,OF,Houston had two of the first five picks in the...,MLB,PERSON had two of the first five picks in the ...,BAT,"(houston, had, two, of, the, first, five, pick...","(person, had, two, of, the, first, five, picks..."


In [37]:
"""
corpus = st.CorpusFromPandas(text_df, 
                             category_col='label', 
                             text_col='report',
                             nlp=nlp).build()
"""

corpus = st.CorpusFromParsedDocuments(df, category_col='label', parsed_col='parse').build()

## Corpus Viz

In [31]:
def corpus_explorer_html(corpus, category, not_category_name, filename, category_name=None, max_terms=100):
    category_name = category_name or category
    html = st.produce_scattertext_html(corpus,
                                       category=category,
                                       category_name=category_name,
                                       not_category_name=not_category_name,
                                       width_in_pixels=1000,
                                       protocol='https',
                                       pmi_threshold_coefficient=8,
                                       minimum_term_frequency=20,
                                       filter_unigrams=True,
                                       max_terms=max_terms
                                      )

    open(filename, 'wb').write(html.encode('utf-8'))

### Label Viz

#### Unprocessed

In [38]:
corpus_explorer_html(corpus, 'MLB', 'MiLB', 'assets/label-viz.html', max_terms=1000)

In [39]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['MLB Score'] = corpus.get_scaled_f_scores('MLB', scaler_algo='percentile')

term_freq_df.to_csv('assets/label_term_freqs.csv')
term_freq_df.nlargest(10, 'MLB Score')

Unnamed: 0_level_0,MiLB freq,MLB freq,MLB Score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alford,0,47,1.0
nix,0,43,0.999147
cecchini,0,38,0.997598
robles,0,35,0.996513
banda,0,34,0.996077
fried,0,31,0.994667
arroyo,0,30,0.994142
ciuffo,0,30,0.994142
tellez,0,29,0.99358
grisham,0,29,0.99358


#### Preprocessed

In [54]:
corpus = st.CorpusFromParsedDocuments(df, category_col='label', parsed_col='parse_processed') \
                                    .build() \
                                    .remove_terms([
    'reid', 
    'foley', 
    'reid foley', 
    'pleskoff', 
    'debut', 
    'major', 
    'league',
    'major league',
    'his major',
    'his big',
    'organization debut',
    'major organization',
    'made his',
    'league debut',
    'the 2012',
    'organization 2011'
])

In [55]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['MLB Score'] = corpus.get_scaled_f_scores('MLB', scaler_algo='percentile')

term_freq_df.to_csv('assets/processed_label_term_freqs.csv')
term_freq_df.nlargest(10, 'MLB Score')

Unnamed: 0_level_0,MiLB freq,MLB freq,MLB Score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
trade,210,218,1.0
the package,20,42,0.999735
at age,144,167,0.999512
as part,87,105,0.99756
traded,116,118,0.997343
youngest,54,78,0.997062
three team,17,35,0.997009
that sent,39,61,0.996948
organization deal,34,56,0.996236
age 20,30,53,0.996151


In [56]:
corpus_explorer_html(corpus, 'MLB', 'MiLB', 'assets/processed-label-viz.html', max_terms=1000)

### Topic Modelling

In [57]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline


unigram_corpus = (corpus.get_stoplisted_unigram_corpus())

topic_model = st.SentencesForTopicModeling(
    unigram_corpus).get_topics_from_model(Pipeline([
        ('tfidf', TfidfTransformer(sublinear_tf=True)),
        ('nmf', (NMF(n_components=50, alpha=.1, l1_ratio=.5, random_state=0)))
    ]),
                                          num_terms_per_topic=10)

topic_feature_builder = st.FeatsFromTopicModel(topic_model)

topic_corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='label',
    parsed_col='parse_processed',
    feats_from_spacy_doc=topic_feature_builder).build()

In [58]:
html = st.produce_scattertext_explorer(
    topic_corpus,
    category='MLB',
    category_name='MLB',
    not_category_name='MiLB',
    width_in_pixels=1000,
    metadata=df['label'],
    use_non_text_features=True,
    use_full_doc=True,
    pmi_threshold_coefficient=0,
    topic_model_term_lists=topic_feature_builder.get_top_model_term_lists(),
    topic_model_preview_size=20)

open('assets/topic_model.html', 'wb').write(html.encode('utf-8'))

11726793

### Position Viz

In [59]:
pos_corpus = corpus.recategorize(df['pos'])
corpus_explorer_html(pos_corpus, 'BAT', 'Pitcher', 'assets/pos-viz.html', category_name='Hitter')

## Embedding Viz

In [60]:
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE

emb_label_corpus = corpus.get_stoplisted_unigram_corpus()
emb_pos_corpus = pos_corpus.get_stoplisted_unigram_corpus()

In [61]:
html = st.produce_projection_explorer(emb_label_corpus,
                                      word2vec_model=Word2Vec(size=100,
                                                              window=5,
                                                              min_count=20,
                                                              workers=4),
                                      projection_model=TSNE(),
                                      category='MLB',
                                      category_name='MLB',
                                      not_category_name='MiLB',
                                      metadata=df['label'],
                                      width_in_pixels=1000,
                                      protocol='https',
                                      pmi_threshold_coefficient=8,
                                      filter_unigrams=True,
                                      max_terms=100
                                     )

open('assets/embedding-label-viz.html', 'wb').write(html.encode('utf-8'))

7926310

In [62]:
html = st.produce_projection_explorer(emb_pos_corpus,
                                      word2vec_model=Word2Vec(size=100,
                                                              window=5,
                                                              min_count=20,
                                                              workers=4),
                                      projection_model=TSNE(),
                                      category='BAT',
                                      category_name='Hitter',
                                      not_category_name='Pitcher',
                                      metadata=df['pos'],
                                      width_in_pixels=1000,
                                      protocol='https',
                                      pmi_threshold_coefficient=8,
                                      filter_unigrams=True,
                                      max_terms=100
                                     )

open('assets/embedding-pos-viz.html', 'wb').write(html.encode('utf-8'))

7916561

In [81]:
from allennlp.nn import util
from allennlp.data.fields import TextField, ListField
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Tokenizer, WordTokenizer, Token
from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary

In [68]:
report = df.sample(1).report.values[0]
report

"  A native of the Dominican Republic, Severino made an impressive full-season debut in 2013 at Class A Hagerstown. He continued that success the next season with Class A Advanced Potomac and in the Arizona Fall League. Severino's arm strength and athleticism behind the plate leave no doubt he has the skills necessary to catch in the big leagues. He blocks balls in the dirt well, already shows an aptitude for pitch framing and is improving as a game-caller.Severino made progress offensively in 2014, but his bat remains well behind his glove. He has some pop and his easy swing gives him a chance to eventually hit for more average. As he moves up to the upper levels of the Minor Leagues, his advanced defensive ability will allow the Nationals to be patient and allow his offense further develop."

In [82]:
tokenizer = WordTokenizer()
splitter = SpacySentenceSplitter()
token_indexers = {"tokens": SingleIdTokenIndexer()}
vocab = Vocabulary()

In [97]:
sentences = splitter.split_sentences(report)
tokenized_sents = [[Token(w) for w in tokenizer.tokenize(sent)] for sent in sentences]

sent_fields = ListField([TextField(s, token_indexers) for s in tokenized_sents])
sent_fields

<allennlp.data.fields.list_field.ListField at 0x1b6a8ada0>

In [98]:
inst = Instance({'tokens': sent_fields})
#vocab.index(inst)

inst.fields['tokens'].index(vocab)
tokens = inst.as_tensor_dict()['tokens']

tokens

{'tokens': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
          0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1]])}

In [107]:
util.get_text_field_mask(tokens, num_wrapping_dims=1) #== tokens['tokens']

ValueError: Expected a tensor with dimension 2 or 3, found 1

In [148]:
df.sample(10)[['text', 'label']].to_json('ai2test.json', orient='records', lines=True)

In [149]:
from allennlp.data.dataset_readers.dataset_reader import DatasetReader

jsr = DatasetReader.by_name("text_classification_json")(segment_sentences=True)
train = jsr.read('ai2test.json')

10it [00:00, 11.86it/s]


In [180]:
#tf = vars(train[0].fields['tokens'])['field_list'][0]
#tf.as_tensor()
inst = train[1].fields['tokens']
inst.index(vocab)
sents = inst.as_tensor(inst.get_padding_lengths())
sents

{'tokens': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 0, 0, 0, 0, 0]])}

In [197]:
import torch
x = {'tokens': torch.stack([sents['tokens'], sents['tokens']], dim=0)}
util.get_text_field_mask(x)#, num_wrapping_dims=1)

tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])

In [200]:
#(util.get_text_field_mask(x).sum(dim=1) > 1).byte()
util.get_text_field_mask({'_': util.get_text_field_mask(x)}).eq(util.get_text_field_mask(x)).all()

tensor(True)

In [232]:
docs = torch.randn(2, 8, 25)
w = torch.randn(2, 8)


docs.size(), w.size()

(torch.Size([2, 8, 25]), torch.Size([2, 8]))

In [265]:
torch.randn(2, 1, 8, 53) * torch.randn(2, 8, 53, 25)

RuntimeError: The size of tensor a (53) must match the size of tensor b (25) at non-singleton dimension 3

In [236]:
# torch.matmul(docs, w.T).size()
# docs * w.unsqueeze(-1)

matrix = torch.randn(1, 9, 25)

matrix.bmm(vector.unsqueeze(-1)).squeeze(-1)

tensor([[[ 8.8707e-01, -1.2449e-02,  2.3112e-01,  1.2071e-02,  2.8455e-01,
          -4.6274e-02,  5.4149e-01,  7.8635e-03, -2.6913e-01, -3.7698e-01,
           1.2731e+00,  4.1280e-01, -1.0382e+00, -1.3867e+00, -1.7997e+00,
          -1.2317e+00, -7.0544e-01, -2.2487e-01, -8.3829e-01,  4.5262e-01,
          -3.7627e-01, -1.1786e-01, -8.9341e-02, -4.8377e-01,  6.2320e-01],
         [ 5.4593e-01,  1.5988e-02,  1.3876e-01,  3.7775e-01, -6.5739e-02,
           3.0363e-01,  8.8584e-01, -3.0559e-01, -4.9869e-01,  9.3277e-01,
           6.1826e-01, -5.6569e-01,  2.5759e-01,  3.2612e-02,  6.7454e-01,
          -8.7706e-01,  1.2436e+00,  7.8087e-01,  4.2654e-02, -5.7773e-01,
          -3.8928e-01,  5.4844e-01, -1.2237e-01,  5.5231e-01, -6.3929e-01],
         [-9.5257e-01, -1.4470e+00, -5.2574e-01, -7.3909e-01,  5.3668e-01,
          -7.4040e-01, -1.0426e+00,  1.0805e-01, -1.4923e-01, -8.9584e-01,
          -6.7993e-01, -2.1030e-01, -2.6861e-04, -1.1963e-01,  9.8023e-02,
           5.3018e-01, 

In [254]:
d, w = torch.nn.MultiheadAttention(25, 1)(docs, docs, docs)
d.size(), w.size()

(torch.Size([2, 8, 25]), torch.Size([8, 2, 2]))

In [143]:
inst = jsr.text_to_instance(text=report)
#inst.fields['tokens'].index(vocab)
#inst.fields['tokens'].as_tensor(inst.get_padding_lengths())
vars(inst.fields['tokens'])

{'field_list': [<allennlp.data.fields.text_field.TextField at 0x1bfc77a58>,
  <allennlp.data.fields.text_field.TextField at 0x1bfc77d68>,
  <allennlp.data.fields.text_field.TextField at 0x1bfc9d4a8>,
  <allennlp.data.fields.text_field.TextField at 0x1bfc9d898>,
  <allennlp.data.fields.text_field.TextField at 0x1bfc9dc88>,
  <allennlp.data.fields.text_field.TextField at 0x1bb328320>,
  <allennlp.data.fields.text_field.TextField at 0x1bb328748>,
  <allennlp.data.fields.text_field.TextField at 0x1bb328dd8>]}

In [145]:
util.get_text_field_mask({'s1': vars(inst.fields['tokens'])['field_list'][0]})

AttributeError: 'TextField' object has no attribute 'dim'