In [4]:
import re
import treetaggerwrapper
# Module-level TreeTagger instance shared by preprocessing() (it calls tagger.tag_text).
# NOTE(review): TAGDIR is presumably the local TreeTagger installation directory and
# is machine-specific — confirm and adjust before running on another machine.
tagger = treetaggerwrapper.TreeTagger(TAGLANG='en', TAGDIR=r'\\tt')

In [5]:
def preprocessing(text):
    """Split raw text into sentences and tag every sentence with the module-level tagger.

    The text is first normalised (a space is forced after every '.' and ',',
    newlines become spaces, whitespace runs are collapsed, spaces before
    ':', ';' and ',' are dropped). It is then cut into sentences after every
    '.', '?' or '!' that is followed by whitespace.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of [str, str]
        One entry per non-empty sentence. Element 0 is the plain-text
        sentence, element 1 is the tagged sentence: a concatenation of
        ``<word TAG>`` tokens built from the tagger output.
    """
    user_input = text
    # prepare text for splitting into sentences
    user_input = re.sub(r'\.', '. ', user_input)
    user_input = re.sub(r'\,', ', ', user_input)
    user_input = re.sub(r'\n', ' ', user_input)
    # Collapse whole whitespace runs: replacing only pairs left runs of three or
    # more partly intact, which produced sentences with leading blanks.
    user_input = re.sub(r'\s+', ' ', user_input)
    user_input = re.sub(r'\s\:', ':', user_input)
    user_input = re.sub(r'\s\;', ';', user_input)
    user_input = re.sub(r'\s\,', ',', user_input)
    user_input = re.sub(r'\.\s', '.\n', user_input)
    user_input = re.sub(r'\?\s', '?\n', user_input)
    user_input = re.sub(r'!\s', '!\n', user_input)

    data = []
    # tagging sentences
    for sentence in user_input.split('\n'):
        if sentence == '':
            continue
        tokens = []
        for line in tagger.tag_text(sentence):
            parts = line.split('\t')
            if len(parts) != 3:
                # Not a word/TAG/lemma triple: skip it instead of reusing the
                # values left over from the previous token (or failing on the
                # first token, where no previous values exist).
                continue
            word, pos_tag, _lemma = parts
            tokens.append('<' + word + ' ' + pos_tag + '>')
        # sent[0] - plaintext sentence, sent[1] - tagged sentence
        data.append([sentence, ''.join(tokens)])
    return data

In [10]:
def polarity(data):
    """Check whether polarity items are used in a context that does not license them.

    Two checks are run on every sentence:

    * NEGATIVE - a negative polarity item ("at all", "yet", "ever", ...) occurs
      in a non-question sentence, and the text between the start of its clause
      and the item contains no negation or if-clause marker, and no superlative
      precedes the item in the tagged sentence.
    * POSITIVE - a positive polarity item ("already", "somewhat", "too", ...)
      is preceded, within its clause, by a negation (and not by "any").

    Parameters
    ----------
    data : list of [str, str]
        Pairs of (plain-text sentence, tagged sentence made of ``<word TAG>``
        tokens), as returned by ``preprocessing``.

    Returns
    -------
    list
        ``data`` itself, unchanged.

    Side effects
    ------------
    Overwrites ``output_sans_spacy.txt`` in the current working directory with
    one tab-separated line per finding:
    ``sentence<TAB>NEGATIVE<TAB>item`` or
    ``sentence<TAB>POSITIVE<TAB>item<TAB>TRIGGER<TAB>negation``.
    """
    negation = r'(?:n|N)ot\b|n\'t|n’t|n`t|(?:n|N)ever|\b(?:N|n)o\b|(?:N|n)owhere|(?:N|n)othing|(?:N|n)one|(?:N|n)oone|(?:N|n)either|'
    negation += r'(?:H|h)ardly|(?:S|s)carcely|(?:B|b)arely|^(?:F|f)ew|[^Aa] (?:F|f)ew|(?:N|n)or|(?:L|l)ack|(?:S|s)top'
    ifclause = r'\b(?:I|i)f\b|(?:W|w)hether'
    superlative = r'<most AV0><\w*? AJ.>|<\w+? AJS>|<\w+? ORD>'
    negative_lic = re.compile('|'.join((negation, ifclause)))
    neg_gr = r'\bat all|budg[eding]{0,3}\b|\bany ?more|\bever(?!-|\w)|whatsoever|just yet|[^d,;:] yet'
    neg_exp = r'lift[a-z]{0,3} a finger|(sleep[a-z]{0,3}|slept) a wink|bat[a-z]{0,4} an eye|((takes?|took|taking)|(last[a-z]{0,3})) long\b|(drink[a-z]{0,3}|drank|drunk) a drop|(mean|small) feat'
    neg_exp += r'|put( \w+?| ) ?finger on|(thinks?|thought) much of'
    temporal_neg_exp = r'in (?:hours|days|weeks(?! [0-9])|months(?! [JFMASOD])|years(?! gone| past| [a-zA-Z]*? ?[0-9])|decades|yonks|eons|a million years|ages(?! [0-9])|donkey\'s years)'
    neg_pol = re.compile('|'.join([neg_gr, neg_exp, temporal_neg_exp]))
    pos_pol = re.compile(r'already|would(n\'t| not) rather|somewhat|\btoo\b')
    for_too = r'<too AV0><\w+? A..>|<too AV0><much|<too AV0><many'
    for_ever = r'VH[A-Z]>(<.*?>)?<ever ...>(<.*?>)?<.*? V[A-Z]N>|VH[A-Z]>(<.*?>)?<.*? V[A-Z]N>(<.*?>)?<ever ...>|<ever ...>(<.*?>)?VH[A-Z]>(<.*?>)?<.*? V[A-Z]N>|<ever ...><since|<ever ...><before'
    for_yet = r'VH[A-Z]>(<.*?>)?<yet ...>(<.*?>)?<.*? V[A-Z]N>|VH[A-Z]>(<.*?>)?<.*? V[A-Z]N>(<.*?>)?<yet ...>|<yet ...>(<.*?>)?VH[A-Z]>(<.*?>)?<.*? V[A-Z]N>|<(?:A|a)nd ...><yet'
    for_at_all = r'at ...><all ...><\w+? N|at ...><all ...><the'
    # NOTE(review): `exc` is compiled but never used below; kept for reference.
    exc = re.compile(r'((?:N|n)ot|n\'t) (only|just)')
    clause_marker = re.compile(r'<[^ ]*? PUN>|<\w+? CJ[A-Z]?>|<\w+? PNQ>')
    word = r'<[^ ]+? '
    coord_conj1 = r'([A-Z0-9]{3})>'
    coord_conj2= r'<\w+? \1>'

    def find_beginnings_of_clauses(sentence, tagged):
        """Return offsets into ``sentence`` at which a clause starts (always includes 0).

        Every punctuation / conjunction / PNQ token in ``tagged`` is treated as
        a clause marker unless it sits between two tokens carrying the same
        tag (coordination of two like items, not of two clauses).
        """
        clause_beginnings = [0]
        # finditer yields the real position of every marker, so a repeated
        # marker (e.g. a second comma) is checked in its own context.
        for marker_match in clause_marker.finditer(tagged):
            cm = marker_match.group()
            markstart = marker_match.start()
            word_match = re.search(word, cm)
            if word_match is None:
                continue
            # The marker text is data, not a pattern: escape it so '.', '?'
            # etc. are matched literally.
            marker_text = re.escape(word_match.group()[1:-1])
            # The 4 characters before a token are the previous token's "TAG>".
            if markstart >= 4 and re.match(coord_conj1 + re.escape(cm) + coord_conj2,
                                           tagged[markstart - 4:]):
                continue
            # Prefer the marker followed by a non-word character; fall back to
            # the bare marker (e.g. sentence-final punctuation).
            found = re.search(marker_text + r'\W', sentence) or re.search(marker_text, sentence)
            if found is None:
                # marker text does not occur in the plain sentence - skip it
                continue
            clause_beginnings.append(1 + clause_beginnings[-1] + found.start())
            # continue the search behind this marker; offsets stay cumulative
            sentence = sentence[found.start() + 1:]
        if 1 in clause_beginnings:
            clause_beginnings.remove(1)
        return clause_beginnings

    def is_reportable(item, tagged):
        """Apply the item-specific tag patterns that decide whether ``item`` is reported."""
        if 'yet' in item and 'just' not in item:
            return bool(re.search(for_yet, tagged))
        if 'ever' in item:
            return not re.search(for_ever, tagged)
        if 'at all' in item:
            return not re.search(for_at_all, tagged)
        return True

    # `with` guarantees the file is closed even if a sentence raises.
    with open('output_sans_spacy.txt', 'w', encoding = 'utf-8') as output:
        for sent in data:
            sentence = sent[0]
            tagged = sent[1]

            # if there is a negative polarity item but no prior negation
            neg = re.search(neg_pol, sentence)
            if neg and not sentence.endswith('?'):
                negstart = neg.start()
                clause_beginnings = find_beginnings_of_clauses(sentence, tagged)
                cln = sorted(clause_beginnings + [negstart])
                needed_clause = cln[cln.index(negstart) - 1]
                if needed_clause == 0:
                    needed_clause = 1
                if not re.search(negative_lic, sentence[needed_clause-1:negstart]):
                    # check if there is a superlative adjective licensing the negation
                    superlative_forms = [
                        re.search(r'<\w+? ', slt).group()[1:-1]
                        for slt in re.findall(superlative, tagged)
                    ]
                    licensed_by_superlative = False
                    if superlative_forms:
                        # Locate the item in the tagged sentence; consecutive
                        # words are separated by " TAG><" (4-6 characters).
                        item_pattern = r'.{4,6}'.join(
                            re.escape(part) for part in neg.group().split())
                        located = re.search(item_pattern, tagged)
                        if located is not None:
                            before_item = tagged[:located.start()]
                            # one preceding superlative is enough to license
                            licensed_by_superlative = any(
                                re.search(form, before_item) for form in superlative_forms)
                    # report at most once per sentence
                    if not licensed_by_superlative and is_reportable(neg.group(), tagged):
                        output.write(sentence+'\tNEGATIVE\t'+neg.group()+'\n')

            # if there is a positive polarity item and negation before it
            pol = re.search(pos_pol, sentence)
            if pol:
                polstart = pol.start()
                clause_beginnings = find_beginnings_of_clauses(sentence, tagged)
                clp = sorted(clause_beginnings + [polstart])
                needed_clause = clp[clp.index(polstart) - 1]
                if needed_clause == 0:
                    needed_clause = 1
                span = sentence[needed_clause-1:polstart]
                # Keep the match object: the reported trigger must come from
                # the same span that the check itself was run on.
                trigger = re.search(negation, span)
                if trigger and not re.search(r'\b(?:A|a)ny', span):
                    if 'too' not in pol.group() or not re.search(for_too, tagged):
                        output.write(sentence+'\tPOSITIVE\t'+pol.group()+'\tTRIGGER\t'+trigger.group()+'\n')

    return data

In [7]:
# Read the whole input document into memory, then split it into sentences
# and tag each one.
with open(r'input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()
data = preprocessing(text)

In [11]:
# Run the polarity check on the tagged sentences. The findings are written to
# 'output_sans_spacy.txt'; polarity() returns the list it was given, so `final`
# refers to the same object as `data`.
final = polarity(data)