In [88]:
import json
import pickle
import spacy
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc, Span, Token
from spacy.symbols import nsubj, dobj
import neuralcoref
import pandas as pd

## Create spaCy Model and Lexicon

In [3]:
# Load the "en_core_web_lg" spaCy pipeline. Every later cell goes through `nlp`:
# lemmatising the lexicon words, hosting the entity ruler, and parsing articles.
nlp = spacy.load("en_core_web_lg")

In [5]:
# Read the tab-separated lexicon. The file has no header row, so the column names
# are supplied here. The 'std' column is dropped right away (without `inplace`,
# so the statement is a single expression that is safe to re-run).
lexicon = pd.read_csv(
    '/home/jon/Desktop/thesis/nfl.tsv',
    sep='\t',
    header=None,
    names=['word', 'rating', 'std'],
).drop(columns='std')

# Replace every entry with the lemma of its first token, so that lookups can later
# be made with `token.lemma_`. `nlp.pipe` streams all the words through the
# pipeline in one pass instead of calling `nlp` once per row, and one column
# assignment replaces the row-by-row `.iloc` / `.at` updates.
lexicon['word'] = [entry[0].lemma_ for entry in nlp.pipe(lexicon['word'])]

In [9]:
# Turn the lexicon frame into a plain dict keyed by (lemmatised) word:
#   {word: {'rating': value}}
# Lemmatisation can make several rows share the same word; only the first
# occurrence of each word is kept.
lexicon = (
    lexicon
    .drop_duplicates(subset='word', keep='first')
    .set_index('word')
    .to_dict(orient='index')
)

# Every word that has a rating; used by the sentiment getters for membership tests.
sentiment_words = list(lexicon)

## Modify Pipeline

In [12]:
# Articles are read from JSON; later cells index into the result and read each
# item's 'content' field.
with open('/home/jon/Desktop/thesis/2010/2010_processed.json', 'r') as f:
    articles = json.load(f)

# NOTE(security): unpickling can execute arbitrary code, so only load these files
# if they come from a trusted source.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been loaded (the bare `pickle.load(open(...))` form never closed it).
with open('/home/jon/Desktop/thesis/2010/team_names.p', 'rb') as f:
    teams = pickle.load(f)
with open('/home/jon/Desktop/thesis/2010/players_teams_mapping.p', 'rb') as f:
    players_to_teams = pickle.load(f)

# The keys of the player -> team mapping are the player names.
players = list(players_to_teams.keys())

In [13]:
# Rule-based entity matcher that will carry the team / player name patterns.
# Per spaCy's EntityRuler documentation, overwrite_ents=True lets the ruler's
# matches replace overlapping entities that earlier components already set.
ruler = EntityRuler(nlp, overwrite_ents=True)

In [14]:
# One exact-phrase pattern per player name and per team name, all sharing the
# single 'NFL' label. Team patterns come first in `patterns`, followed by the
# player-name patterns.
name_patterns = [dict(label='NFL', pattern=player) for player in players]
patterns = [dict(label='NFL', pattern=team) for team in teams] + name_patterns

In [15]:
# Register the team / player patterns on the ruler.
ruler.add_patterns(patterns)

# Put the ruler into the pipeline. `nlp.add_pipe` raises when a component with the
# same name is already registered, which made this cell fail on a second run.
# If a ruler is already present, swap it for the current one (keeping its position
# in the pipeline); otherwise append it as before.
if nlp.has_pipe(ruler.name):
    nlp.replace_pipe(ruler.name, ruler)
else:
    nlp.add_pipe(ruler)

In [16]:
# Built-in component that merges each entity span into a single token, so that a
# multi-word name can be compared as one `token.text`.
merge_ents = nlp.create_pipe("merge_entities")

# Only add it once: adding a component whose name is already in the pipeline
# raises, which made this cell fail when it was run a second time.
if not nlp.has_pipe("merge_entities"):
    nlp.add_pipe(merge_ents)

In [18]:
# Getter functions for the custom spaCy extension attributes.
# They are plain `def`s (rather than lambdas bound to names) so that they carry a
# real name and a docstring, and the `has_*` variants feed `any` a generator so
# the scan stops at the first hit instead of building a full list first.
# `players`, `teams` and `sentiment_words` are looked up when a getter is called.


def is_player_getter(token):
    """Return True when the token's exact text is one of the names in `players`."""
    return token.text in players


def has_player_getter(obj):
    """Return True when any token in `obj` (a Doc or Span) has its text in `players`."""
    return any(t.text in players for t in obj)


def is_team_getter(token):
    """Return True when the token's exact text is one of the names in `teams`."""
    return token.text in teams


def has_team_getter(obj):
    """Return True when any token in `obj` (a Doc or Span) has its text in `teams`."""
    return any(t.text in teams for t in obj)


def is_sentiment_getter(token):
    """Return True when the token's lemma is listed in `sentiment_words`."""
    return token.lemma_ in sentiment_words


def has_sentiment_getter(obj):
    """Return True when any token in `obj` has its lemma in `sentiment_words`."""
    return any(t.lemma_ in sentiment_words for t in obj)

In [19]:
# Custom attributes, available afterwards as `obj._.<name>`.
# Each row is (class that receives the attribute, attribute name, getter).
# Token gets the "is_*" flag; Doc and Span get the matching "has_*" flag.
# force=True overwrites an earlier registration of the same name, so this cell can
# be re-run. Rows are listed in the order in which they are registered.
_extension_table = [
    (Token, "is_player", is_player_getter),
    (Doc, "has_player", has_player_getter),
    (Span, "has_player", has_player_getter),
    (Token, "is_team", is_team_getter),
    (Doc, "has_team", has_team_getter),
    (Span, "has_team", has_team_getter),
    (Token, "is_sentiment_word", is_sentiment_getter),
    (Doc, "has_sentiment_word", has_sentiment_getter),
    (Span, "has_sentiment_word", has_sentiment_getter),
]

for _owner, _attr_name, _getter in _extension_table:
    _owner.set_extension(_attr_name, getter=_getter, force=True)

## Document Experiments

In [81]:
# Run the pipeline on the 'content' text of one article.
# NOTE(review): 2001 is a hard-coded sample index; nothing here says why this
# article was chosen, so consider naming it as a constant.
doc = nlp(articles[2001]['content'])

In [91]:
# Running sentiment total per entity, starting at 0.
# Only entities labelled 'NFL' are tracked; repeated mentions share one key
# because the dict is keyed by the entity text.
entities = {mention.text: 0 for mention in doc.ents if mention.label_ == 'NFL'}

In [92]:
# Keep only the sentences that mention at least one team or player.
sentences = [
    span for span in doc.sents
    if span._.has_team or span._.has_player
]

In [93]:
# Reduce every kept sentence to just its relevant tokens: player names, team
# names and sentiment words, in their original order. The `sentences` list is
# updated in place, so each element becomes a list of tokens.
sentences[:] = [
    [
        tok for tok in sentence
        if tok._.is_player or tok._.is_team or tok._.is_sentiment_word
    ]
    for sentence in sentences
]

In [94]:
# Add a sentence-level sentiment score to every team / player that is the
# subject (nsubj) or direct object (dobj) of that sentence.
for kept_tokens in sentences:
    # NOTE(review): the [-1:] slice means only the LAST kept token of a sentence
    # can contribute to the score; earlier sentiment words are ignored. Confirm
    # this is intended rather than a leftover from experimenting.
    final_token = kept_tokens[-1:]
    score = sum(
        lexicon[tok.lemma_]['rating']
        for tok in final_token
        if tok._.is_sentiment_word
    )

    for tok in kept_tokens:
        is_tracked = tok._.is_team or tok._.is_player
        if is_tracked and tok.dep in (nsubj, dobj):
            entities[tok.text] += score

In [95]:
# Display the accumulated sentiment total for each tracked entity.
entities

{'Cowboys': -2.05,
 'Colts': 0,
 'Tashard Choice': -2.26,
 'Marion Barber': -2.3200000000000003,
 'Flozell Adams': -2.8,
 'Vikings': 0,
 'Chris Johnson': 0.8,
 'Ray Rice': 0,
 'Jamaal Charles': 0}

In [87]:
# Display the processed article so the scores can be checked against its text.
doc

IRVING, Texas -- Can we put off plans to place a statue of Jason Garrett next to the one of Tom Landry outside of JerryWorld? Let's nitpick a little bit. After all, Jerry Jones basically begged us to expose his interim coach's warts despite the Cowboys being a fumble away from a four-game winning streak since Garrett grabbed the wheel of a 1-7 clunker. "What I do not want to do -- and I want to be real careful -- is to try to identify or couch Jason as, he can step in here and get it done," Jerry said in the visitors' locker room at Lucas Oil Stadium after the Cowboys' comeback overtime win over the defending AFC champion Colts. "Because if we don't get it done and times get hard, which that happens as a head coach, then all of the sudden the bloom is off the rose and everybody says he's overrated. It's a lot of expectations to get used to a guy coming in and winning 75 percent of his games right off the bat." OK, fine, you want to focus on a Garrett flaw? Look no further than the Cowb