In [None]:
import json

# Read the raw records. The `with` block closes the file handle as soon as
# parsing finishes instead of leaving it open until garbage collection.
# JSON interchange files are UTF-8, so the encoding is stated explicitly rather
# than inherited from the platform default.
with open('data.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

In [None]:
len(data)

In [None]:
import pandas as pd

df=pd.DataFrame(data)

In [None]:
df.head()

In [None]:
import spacy

In [None]:
nlp=spacy.load('en')

In [None]:
# Smaller sample for fast iteration. `.copy()` gives `dfs` its own data, so the
# columns assigned to it later (dfs['subsents'], dfs['sub_number']) are not
# written into a slice of `df` and do not trigger SettingWithCopyWarning.
dfs=df.iloc[:100].copy()

# Single sentence analysis

In [None]:
text=df.text[3]
opinions=df.opinions[3]

In [None]:
doc=nlp(text)

doc

In [None]:
opinions

### Entities

In [None]:
[nc for nc in doc.noun_chunks]

In [None]:
[nc.root for nc in doc.noun_chunks]

In [None]:
[token for token in doc if token.pos_=='NOUN']

In [None]:
doc.ents

### Sentiment terms

In [None]:
[token for token in doc
        if not token.is_stop
        if not token.is_punct
        if token.pos_ == 'ADJ' or token.pos_ == 'VERB']

## Splitting

In [None]:
def dep_vis(doc, distance=100):
    """Draw the dependency parse of `doc` inline in the notebook via displaCy.

    Args:
        doc: the parsed object handed straight to `spacy.displacy.render`.
        distance: value of displaCy's 'distance' option (spacing between
            words). Defaults to 100, the value this notebook always used;
            raise it when the arcs of a long sentence overlap.
    """
    spacy.displacy.render(doc, style='dep', jupyter=True, options={'distance': distance})

## Back to task

In [None]:
doc=nlp(text)
sss=SingleSentenceSplitter()

In [None]:
dep_vis(doc)

In [None]:
ss=list(sss(doc))

In [None]:
list(doc.noun_chunks)

In [None]:
t=ss[2][2]

In [None]:
def strategy(token: "Token") -> bool:
    """Decide whether `token` should be treated as the head of a sub-sentence.

    A token qualifies when it is the sentence ROOT, or when it is a verb or
    adjective attached as a clausal complement / conjunct ('ccomp', 'conj')
    whose subtree contains at least one noun.

    The annotation is a string so that defining this function does not require
    the name `Token` to be bound at definition time.

    Args:
        token: object exposing `pos_`, `dep_` and `subtree`
            (presumably a spaCy token - confirm against the splitter).

    Returns:
        True if the token starts a sub-sentence, otherwise False.
    """
    if token.dep_ == 'ROOT':
        return True
    is_clause_head = token.pos_ in {'VERB', 'ADJ'} and token.dep_ in {'ccomp', 'conj'}
    # any() stops at the first noun instead of materialising the whole subtree.
    return is_clause_head and any(t.pos_ == 'NOUN' for t in token.subtree)

In [None]:
sss=SingleSentenceSplitter(strategy)

In [None]:
subsents=list(sss(doc))

In [None]:
subsents

In [None]:
'food' in [token.lemma_ for token in subsents[0]]

In [None]:
def stringize(subsent):
    """Return the `orth_` text of every token in `subsent`, separated by single spaces."""
    words = (tok.orth_ for tok in subsent)
    return ' '.join(words)

In [None]:
'food' in stringize(subsents[0])

In [None]:
def get_subsents(text, splitter=None):
    """Parse `text` with the global `nlp` pipeline and split it into sub-sentences.

    Args:
        text: raw sentence string.
        splitter: callable applied to the parsed document whose result is
            iterated into a list. When omitted, the notebook-level `sss` is
            used; it is looked up at call time, so rebinding `sss` changes the
            behaviour without redefining this function.

    Returns:
        list of whatever the splitter yields for the parsed document.
    """
    if splitter is None:
        splitter = sss
    return list(splitter(nlp(text)))

In [None]:
dfs['subsents']=dfs.text.apply(get_subsents)

In [None]:
dfs['sub_number']=dfs['subsents'].str.len()

In [None]:
(dfs['opinions'].str.len()==dfs['sub_number']).head()

In [None]:
dfs.iloc[1].opinions

In [None]:
dfs.iloc[1].subsents

In [None]:
(dfs['opinions'].str.len()>dfs['sub_number']).sum()

In [None]:
dfs[dfs['opinions'].str.len()>dfs['sub_number']]

In [None]:
dfs.iloc[89].opinions

In [None]:
dfs.iloc[89].text

In [None]:
dfs.iloc[89].subsents

In [None]:
doc=nlp(dfs.iloc[89].text)

In [None]:
dep_vis(doc)

In [None]:
# Same rule as `strategy`, additionally accepting adverbial clauses ('advcl').
def strategy2(token: "Token") -> bool:
    """Decide whether `token` should be treated as the head of a sub-sentence.

    A token qualifies when it is the sentence ROOT, or when it is a verb or
    adjective attached as 'ccomp', 'conj' or 'advcl' whose subtree contains at
    least one noun.

    The annotation is a string so that defining this function does not require
    the name `Token` to be bound at definition time.

    Args:
        token: object exposing `pos_`, `dep_` and `subtree`
            (presumably a spaCy token - confirm against the splitter).

    Returns:
        True if the token starts a sub-sentence, otherwise False.
    """
    if token.dep_ == 'ROOT':
        return True
    is_clause_head = (token.pos_ in {'VERB', 'ADJ'}
                      and token.dep_ in {'ccomp', 'conj', 'advcl'})
    # any() stops at the first noun instead of materialising the whole subtree.
    return is_clause_head and any(t.pos_ == 'NOUN' for t in token.subtree)

In [None]:
# Switch the splitter to the 'advcl'-aware strategy. `get_subsents` (defined
# earlier in this notebook) reads the global `sss` each time it is called, so
# rebinding `sss` is sufficient; defining the function a second time would only
# shadow an identical definition.
sss=SingleSentenceSplitter(strategy2)

In [None]:
dfs['subsents']=dfs.text.apply(get_subsents)

In [None]:
dfs['sub_number']=dfs['subsents'].str.len()

In [None]:
(dfs['opinions'].str.len()>dfs['sub_number']).sum()

In [None]:
dfs[dfs['opinions'].str.len()>dfs['sub_number']]

In [None]:
dfs.iloc[69].opinions

In [None]:
dfs.iloc[69].subsents

In [None]:
dep_vis(nlp(dfs.iloc[69].text))

In [None]:
dfs.iloc[89].opinions, dfs.iloc[89].subsents

In [None]:
doc = nlp(dfs.iloc[89].text)

In [None]:
list(doc.noun_chunks)

In [None]:
def get_matches(full_sent, subsents, opinions):
    """Pair every opinion with the text it should be attached to.

    Args:
        full_sent: the complete sentence text.
        subsents: iterable of sub-sentences; each is rendered with `stringize`.
        opinions: iterable of dicts with at least 'from', 'to' and 'target'.

    Returns:
        list of `(text, opinion)` tuples. An opinion whose 'from' and 'to' are
        both 0 is paired with `full_sent`; any other opinion is paired with
        every rendered sub-sentence that contains its 'target' string.

    NOTE(review): the target test is a plain substring check, so an opinion can
    match several sub-sentences (or none, in which case it is dropped) - confirm
    this is the intended alignment.
    """
    rendered = [stringize(chunk) for chunk in subsents]

    matches = []
    for opinion in opinions:
        start, end = opinion['from'], opinion['to']
        if start == end == 0:
            # No character span: attach the opinion to the whole sentence.
            matches.append((full_sent, opinion))
            continue
        matches.extend(
            (piece, opinion) for piece in rendered if opinion['target'] in piece
        )

    return matches

In [None]:
dfs.subsents[3], dfs.opinions[3]

In [None]:
get_matches(dfs.text[3], dfs.subsents[3], dfs.opinions[3])

In [None]:
dfs.subsents[3][0][0]

In [None]:
aligned=dfs.apply(lambda row: get_matches(row.text, row.subsents, row.opinions), axis=1)

In [None]:
dfs.iloc[24]

In [None]:
pairs=[pair for pairs in list(aligned)
for pair in pairs]

In [None]:
len(pairs)

In [None]:
len([opinion for opinions in list(dfs['opinions'])
for opinion in opinions])

In [None]:
len(pairs)

In [None]:
pairs=[(pair[0], pair[1]['category']) for pair in pairs]

In [None]:
data=pd.DataFrame(pairs, columns=['text', 'label'])

In [None]:
data.fillna('NONE', inplace=True)

In [None]:
labels=list(data['label'].unique())
labels

In [None]:
textcat = nlp.create_pipe(
            "textcat",
            config={
                "exclusive_classes": True,
                "architecture": "bow",
            }
        )

In [None]:
# A freshly loaded pipeline has no 'textcat' component, and remove_pipe raises
# for unknown names. Only drop it when an earlier run of this notebook left one
# behind, so the cell works on a fresh kernel and on re-runs alike.
if 'textcat' in nlp.pipe_names:
    nlp.remove_pipe('textcat')

In [None]:
nlp.add_pipe(textcat, last=True)

In [None]:
for label in labels:
    textcat.add_label(label)

In [None]:
textcat.labels

In [None]:
data.head()

In [None]:
def convert_label(label, all_labels=None):
    """Turn a single label into a textcat-style annotation dict.

    Args:
        label: the label that applies to the example.
        all_labels: iterable of every known label. When omitted, the
            notebook-level `labels` list is used.

    Returns:
        `{'cats': {name: bool}}` with one entry per known label; the entry is
        True only where `name == label`.
    """
    if all_labels is None:
        all_labels = labels
    return {'cats': {name: name == label for name in all_labels}}

In [None]:
convert_label('RESTAURANT#GENERAL')

In [None]:
converted_data = list(data.apply(lambda row: (row.text, convert_label(row.label)), axis=1))

## Model training

In [None]:
import random

def split(converted_data, split=0.8, random_seed=10):
    """Shuffle the examples reproducibly and cut them into train / validation parts.

    The shuffle is done on a copy, so `converted_data` keeps its original order
    and calling this function again with the same arguments returns the same
    split (shuffling the input in place made every re-run of the cell produce a
    different partition despite the fixed seed).

    Args:
        converted_data: sequence of examples.
        split: fraction of the examples that goes into the first part.
        random_seed: seed applied to the module-level `random` generator before
            shuffling; the global generator is seeded on purpose so that later
            `random.shuffle` calls in the notebook stay reproducible too.

    Returns:
        `(train, validate)` tuple of lists.
    """
    shuffled = list(converted_data)
    random.seed(random_seed)
    random.shuffle(shuffled)
    n = int(len(shuffled) * split)
    return (shuffled[:n], shuffled[n:])

In [None]:
train, validate = split(converted_data)

In [None]:
len(train), len(validate)

In [None]:
def evaluate(tokenizer, textcat, valid_data):
    """Score `textcat` on held-out examples with precision, recall and F-score.

    Args:
        tokenizer: callable turning a text into the object fed to `textcat.pipe`.
        textcat: component whose `pipe(docs)` yields objects carrying a `cats`
            mapping of label -> score.
        valid_data: iterable of `(text, annotation)` pairs. `annotation` is
            either `{'cats': {label: truth}}` or the bare `{label: truth}`
            mapping.

    Returns:
        dict with keys 'textcat_p', 'textcat_r' and 'textcat_f'. All three are
        0.0 when `valid_data` is empty.
    """
    valid_data = list(valid_data)
    if not valid_data:
        return {"textcat_p": 0.0, "textcat_r": 0.0, "textcat_f": 0.0}

    texts, labels = zip(*valid_data)
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives (tiny offset avoids division by zero)
    fn = 1e-8  # False negatives (tiny offset avoids division by zero)

    for doc, annotation in zip(textcat.pipe(docs), labels):
        # Annotations wrapped as {'cats': {...}} must be unwrapped first:
        # looking labels up in the wrapper itself never finds them, every label
        # is skipped and the reported scores are always 0.
        y_true = annotation["cats"] if "cats" in annotation else annotation
        for label, score in doc.cats.items():
            if label not in y_true:
                continue
            # NOTE(review): a label literally named "NEGATIVE" is excluded from
            # the counts; looks like a carry-over from a binary
            # POSITIVE/NEGATIVE setup - confirm it is still wanted.
            if label == "NEGATIVE":
                continue
            predicted = score >= 0.5
            actual = y_true[label] >= 0.5
            if predicted and actual:
                tp += 1.0
            elif predicted:
                fp += 1.0
            elif actual:
                fn += 1
            # True negatives do not enter precision, recall or F-score.
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


In [None]:
from spacy.util import minibatch, compounding

def train_model(textcat, train_data, val_data, n_iter):
    """Train the text categorizer of the global `nlp` pipeline and print per-epoch scores.

    Every pipeline component except "textcat" is disabled for the duration of
    the training. Each iteration shuffles `train_data` in place, feeds it to
    `nlp.update` in minibatches of compounding size, then evaluates on
    `val_data` with the optimizer's averaged parameters and prints one row of
    loss / precision / recall / F-score.

    Args:
        textcat: the text categorizer component being trained.
        train_data: list of `(text, annotations)` pairs (shuffled in place).
        val_data: held-out pairs passed to `evaluate`.
        n_iter: number of passes over `train_data`.
    """
    header_fmt = "{:^5}\t{:^5}\t{:^5}\t{:^5}"
    row_fmt = "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}"

    frozen = [name for name in nlp.pipe_names if name != "textcat"]
    with nlp.disable_pipes(*frozen):  # only train textcat
        optimizer = nlp.begin_training()

        print("Training the model...")
        print(header_fmt.format("LOSS", "P", "R", "F"))
        sizes = compounding(4.0, 32.0, 1.001)
        for _ in range(n_iter):
            epoch_losses = {}
            # Reshuffle, then batch up the examples using spaCy's minibatch.
            random.shuffle(train_data)
            for batch in minibatch(train_data, size=sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=epoch_losses)
            # Score the held-out data using the averaged weights.
            with textcat.model.use_params(optimizer.averages):
                scores = evaluate(nlp.tokenizer, textcat, val_data)
            row = row_fmt.format(
                epoch_losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
            print(row)

In [None]:
train_model(textcat, train, validate, 10)

In [None]:
from textblob.sentiments import PatternAnalyzer

In [None]:
analyzer=PatternAnalyzer()

In [None]:
analyzer.analyze(doc.text).polarity

In [None]:
def get_sentiment(text):
    """Return the polarity that the global `analyzer` assigns to `text`.

    The argument itself is analysed; reading the notebook-level `doc` instead
    makes every row receive the polarity of whichever document was parsed last.

    Args:
        text: the string to score.

    Returns:
        the `polarity` attribute of `analyzer.analyze(text)`.
    """
    return analyzer.analyze(text).polarity

In [None]:
data.text.apply(get_sentiment)

In [None]:
# TODO: we need a different lexicon-based method - see the_guardian for the textacy approach, or use nltk.vader

In [None]:
# Or, the better option: collect all ADJs and VERBs and build our own lexicon.