In [3]:
import re
import treetaggerwrapper
import spacy
# spaCy pipeline; polarity() calls it to parse each sentence and group tokens into clauses.
nlp = spacy.load('en_core_web_trf')
# TreeTagger wrapper configured for English; preprocessing() calls tagger.tag_text() per sentence.
# NOTE(review): TAGDIR=r'\tt' looks like a machine-specific, Windows-style path to the
# TreeTagger installation - confirm/adjust before running on another machine.
tagger = treetaggerwrapper.TreeTagger(TAGLANG='en', TAGDIR=r'\tt')

In [4]:
def preprocessing(text):
    """Split raw text into sentences and POS-tag each sentence with TreeTagger.

    The text is normalised with a series of regex substitutions (spacing
    around punctuation, newlines turned into spaces), then cut into
    sentences after every '.', '?' or '!' that is followed by whitespace.
    Each non-empty sentence is passed to the module-level ``tagger``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list[list[str]]
        One ``[plain_sentence, tagged_sentence]`` pair per non-empty
        sentence, where ``tagged_sentence`` is the concatenation of
        ``<word TAG>`` items.
    """
    # input example
    #'To make a conclusion, I would like to underline the variety of book reviews at all and the difference between the given ones in particular.'
    user_input = text

    # prepare text for splitting into sentences
    user_input = re.sub(r'\.', '. ', user_input)
    user_input = re.sub(r'\,', ', ', user_input)
    user_input = re.sub(r'\n', ' ', user_input)
    # Collapse whole runs of whitespace: replacing only pairs left a stray
    # space behind for runs of three or more, which ended up as a leading
    # space on the following sentence.
    user_input = re.sub(r'\s{2,}', ' ', user_input)
    user_input = re.sub(r'\s\:', ':', user_input)
    user_input = re.sub(r'\s\;', ';', user_input)
    user_input = re.sub(r'\s\,', ',', user_input)
    # sentence boundaries become newlines
    user_input = re.sub(r'\.\s', '.\n', user_input)
    user_input = re.sub(r'\?\s', '?\n', user_input)
    user_input = re.sub(r'!\s', '!\n', user_input)

    data = []
    # tagging sentences
    for sentence in user_input.split('\n'):
        if sentence == '':
            continue
        tagged_items = []
        for line in tagger.tag_text(sentence):
            try:
                word, pos, _lemma = line.split('\t')
            except ValueError:
                # Not a word<TAB>tag<TAB>lemma line: skip it instead of
                # reusing the previous word (or crashing when there is none).
                continue
            tagged_items.append('<' + word + ' ' + pos + '>')
        # sent[0] - plaintext sentence, sent[1] - tagged sentence
        data.append([sentence, ''.join(tagged_items)])

    # output example
    # ['To make a conclusion, I would like to underline the variety of book reviews at all and the difference between the given ones in particular.',
    #  '<To TO0><make VVI><a AT0><conclusion NN1><, PUN><I PNP><would VM0><like VVI><to TO0><underline VVI><the AT0><variety NN1><of PRF><book NN1><reviews NN2><at PRP><all DT0><and CJC><the AT0><difference NN1><between PRP><the AT0><given AJ0><ones NN2><in PRP><particular AJ0><. SENT>']
    return data

In [5]:
def polarity(data):
    """Report polarity items that appear in the wrong context.

    For every ``[plain_sentence, tagged_sentence]`` pair in ``data`` the
    sentence is split into clauses with the module-level spaCy pipeline
    ``nlp`` and two checks are made per clause:

    * a negative polarity item (e.g. "at all", "yet", "in years") that is
      not preceded in its clause by a negation or an if/whether, and is not
      preceded in the tagged sentence by a superlative, is reported as
      ``NEGATIVE``;
    * a positive polarity item (e.g. "already", "somewhat", "too") that is
      preceded in its clause by a negation is reported as ``POSITIVE``
      together with the triggering negation.

    Findings are written, one per line and tab-separated, to
    ``output_with_spacy.txt`` in the current working directory (the file is
    overwritten on every call).

    Parameters
    ----------
    data : list[list[str]]
        Pairs as produced by ``preprocessing``: ``sent[0]`` is the plain
        sentence, ``sent[1]`` the ``<word TAG>`` tagged sentence.

    Returns
    -------
    list[list[str]]
        ``data`` itself, unchanged.
    """
    # Checks if any polarity items were used in the wrong context

    negation = r'(?:n|N)ot\b|n\'t|n’t|n`t|(?:n|N)ever(?!theless| the less)|\b(?:N|n)o\b|(?:N|n)owhere|(?:N|n)othing|(?:N|n)one|(?:N|n)oone|(?:N|n)either|'
    negation += r'(?:H|h)ardly|(?:S|s)carcely|(?:B|b)arely|^(?:F|f)ew|[^Aa] (?:F|f)ew|(?:N|n)or|(?:L|l)ack|(?:S|s)top'
    ifclause = r'\b(?:I|i)f\b|(?:W|w)hether'
    superlative = r'<most AV0><\w*? AJ.>|<\w+? AJS>'
    negative_lic = re.compile('|'.join((negation, ifclause)))
    neg_gr = r'\bat all|budg[eding]{0,3}\b|whatsoever|just yet|[^d,;:] yet'
    neg_exp = r'lift[a-z]{0,3} a finger|(sleep[a-z]{0,3}|slept) a wink|bat[a-z]{0,4} an eye|((takes?|took|taking)|(last[a-z]{0,3})) long\b|(drink[a-z]{0,3}|drank|drunk) a drop|(mean|small) feat'
    neg_exp += r'|put( \w+?| ) ?finger on|(thinks?|thought) much '
    temporal_neg_exp = r'in (?:hours|days|weeks(?! [0-9])|months(?! [JFMASOD])|years(?! gone| past| [a-zA-Z]*? ?[0-9])|decades|yonks|eons|a million years|ages(?! [0-9])|donkey\'s years)'
    neg_pol = re.compile('|'.join([neg_gr, neg_exp, temporal_neg_exp]))
    pos_pol = re.compile(r'already|would(n\'t| not)? rather|somewhat|\btoo\b')
    for_too = r'<too AV0><\w+? A..>|<too AV0><much|<too AV0><many'
    for_yet = r'VH[A-Z]>(<.*?>)?<yet ...>(<.*?>)?<.*? V[A-Z]N>|VH[A-Z]>(<.*?>)?<.*? V[A-Z]N>(<.*?>)?<yet ...>|<yet ...>(<.*?>)?VH[A-Z]>(<.*?>)?<.*? V[A-Z]N>|<(?:A|a)nd ...><yet'
    for_at_all = r'at ...><all ...><\w+? N|at ...><all ...><the|<any'
    exc = r'((?:N|n)ot|n\'t) (only|just)'

    def get_clauses(sentence):
        """Group the tokens of ``sentence`` into clause strings.

        Each token is assigned to the nearest ancestor (or itself) whose
        dependency label is ROOT, relcl, advcl, rcmod or ccomp; tokens that
        share such a head form one clause, joined with single spaces.
        """
        clauses = {}
        for token in nlp(sentence):
            headw = token
            # climb the dependency tree up to the clause head
            while headw.dep_ not in ('ROOT', 'relcl', 'advcl', 'rcmod', 'ccomp'):
                headw = headw.head
            clauses.setdefault(headw.i, []).append(token.text)
        allcl = []
        for words in clauses.values():
            clause = ' '.join(words)
            # re-attach clitics that the tokenizer split off
            clause = re.sub(" n't", "n't", clause)
            clause = re.sub(" ' ", "' ", clause)
            clause = re.sub(" 's", "'s", clause)
            allcl.append(clause)
        return allcl

    def superlative_precedes(item, tagged):
        """True if a superlative occurs before ``item`` in the tagged sentence."""
        forms = [re.search(r'<\w+? ', hit).group()[1:-1]
                 for hit in re.findall(superlative, tagged)]
        if not forms:
            return False
        # Between two consecutive words of the tagged string sits ' TAG><',
        # hence the 4-6 character gap. The words are escaped because the
        # item is literal text, not a pattern (it may contain e.g. ')').
        pattern = r'.{4,6}'.join(re.escape(part) for part in item.split())
        located = re.search(pattern, tagged)
        if located is None:
            # The item cannot be located in the tagged string (different
            # tokenisation or tag length): licensing cannot be verified, so
            # the item is still reported rather than silently dropped.
            return False
        prefix = tagged[:located.start()]
        # One licensing superlative is enough; testing each form on its own
        # would report the same sentence once per non-preceding superlative.
        return any(re.search(form, prefix) for form in forms)

    def should_report_negative(item, tagged):
        """Item-specific tag checks applied to an unlicensed negative item."""
        if 'yet' in item and 'just' not in item:
            return bool(re.search(for_yet, tagged))
        if 'at all' in item:
            return not re.search(for_at_all, tagged)
        return True

    # 'with' guarantees the file is closed even if a sentence raises
    with open('output_with_spacy.txt', 'w', encoding = 'utf-8') as output:
        for sent in data:
            sentence = sent[0]
            clauses = None  # parsed lazily, at most once per sentence

            # if there is a negative polarity item but no prior negation
            if not sentence.endswith('?') and re.search(neg_pol, sentence):
                clauses = get_clauses(sentence)
                for clause in clauses:
                    neg = re.search(neg_pol, clause)
                    if not neg:
                        continue
                    if re.search(negative_lic, clause[:neg.start()]):
                        continue
                    item = neg.group()
                    # check if there is a superlative adjective licensing the negation
                    if superlative_precedes(item, sent[1]):
                        continue
                    if should_report_negative(item, sent[1]):
                        output.write(sentence+'\tNEGATIVE:'+item+'\n')

            # if there is a positive polarity item and negation before it
            if re.search(pos_pol, sentence):
                if clauses is None:
                    clauses = get_clauses(sentence)
                for clause in clauses:
                    pol = re.search(pos_pol, clause)
                    if not pol:
                        continue
                    before = clause[:pol.start()]
                    trigger = re.search(negation, before)
                    if not trigger or re.search(r'\b(?:A|a)ny|'+exc, before):
                        continue
                    if 'too' in pol.group() and re.search(for_too, sent[1]):
                        continue
                    output.write(sentence+'\tPOSITIVE:'+pol.group()+'\tTRIGGER:'+trigger.group()+'\n')

    return data

In [7]:
# Read the whole corpus and turn it into [plain_sentence, tagged_sentence] pairs.
# A forward slash is used as the separator because it works on every platform;
# a backslash is a directory separator on Windows only.
with open('full_corpus/complete_corpus.txt', encoding = 'utf-8') as f:
    text = f.read()
data = preprocessing(text)

In [8]:
# polarity() writes its findings to output_with_spacy.txt and returns its input list.
final = polarity(data)