In [110]:
import pandas as pd
from scipy.stats import chi2_contingency
from stanfordcorenlp import StanfordCoreNLP
import json, string
import collections
from nltk.corpus import stopwords

nlp = StanfordCoreNLP('http://localhost', port=9000, timeout=30000)

def lemmatize_corenlp(sentence, conn_nlp):
    props = {
        'annotators': 'pos,lemma',
        'pipelineLanguage': 'en',
        'outputFormat': 'json'
    }
    # tokenize into words
    sents = conn_nlp.word_tokenize(sentence)
    # remove punctuations from tokenised list
    sents_no_punct = [s for s in sents if s not in string.punctuation]
    # form sentence
    sentence2 = " ".join(sents_no_punct)
    # annotate to get lemma
    parsed_str = conn_nlp.annotate(sentence2, properties=props)
    parsed_dict = json.loads(parsed_str)
    # extract the lemma for each word
    lemma_list = [v for d in parsed_dict['sentences'][0]['tokens'] for k,v in d.items() if k == 'lemma']
    # form sentence and return it
    return " ".join(lemma_list)

def ngrams(text, n):
    text_list = text.split()
    # remove stop words
    filtered_text = [word for word in text_list if word not in stopwords.words('english')]
    return zip(*[filtered_text[i:] for i in range(n)])

def chi2(data, string):
    str_in_sentence = data['sentence'].apply(lambda s: string in s)
    str_in_sentence.name = string
    ct = pd.crosstab(data['gold_label'], str_in_sentence, margins=True)
    p_value = chi2_contingency(ct)[1]
    return p_value

def str_chi2(data, string):
    str_in_sentence = data['sentence'].apply(lambda s: string in s)
    str_in_sentence.name = string
    ct = pd.crosstab(data['gold_label'], str_in_sentence, margins=True)
    print(ct)
    p_value = chi2_contingency(ct)[1]
    print("P-value: %s" % p_value)
    return p_value

def find_relevant_ngrams(counts):
    strings_list = []
    for lemmas, count in counts.most_common(50):
        if len(lemmas) == 1:
            strings_list.append(lemmas[0])
        elif lemmas[1].startswith("'"):
            strings_list.append((''.join(lemmas)))
        else:
            strings_list.append((' '.join(lemmas)))
            
    relevant_ngrams = []        
    for ngram in strings_list:
        if float(chi2(df, ngram)) <= 0.01:
            relevant_ngrams.append(ngram)
            print(str_chi2(df, ngram))
    return relevant_ngrams

path = "C:/Users/Léo/Desktop/semeval-2020-task-5/data/"
train1 = pd.read_csv(path + 'train/subtask1.csv')
train1.set_index('sentenceID', inplace=True)

train2 = pd.read_csv(path + 'train/data_train_subtask2.csv')
train2 = train2[["sentenceID", "sentence"]]
train2.set_index('sentenceID', inplace=True)
train2.insert(1, "gold_label", "1")

trial2 = pd.read_csv(path + 'trial/subtask2-c770635c39b2f26ed7f10041444ea6b230fb5ff6.csv')
trial2 = trial2[["ID", "gold_label", "sentence"]]
trial2.set_index('ID', inplace=True)

trial1 = pd.read_csv(path + 'trial/subtask1-c770635c39b2f26ed7f10041444ea6b230fb5ff6.csv')
trial1 = trial1[["ID", "gold_label", "sentence"]]
trial1.set_index('ID', inplace=True)
df['gold_label'] = df['gold_label'].apply(lambda x: str(x))

# concatenate data and remove duplicates
df = pd.concat([train1, train2, trial1, trial2])
df = df.reset_index(drop=True)
df_gpby = df.groupby(list(df.columns))
idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]
df = df.reindex(idx)
df['gold_label'] = df['gold_label'].apply(str)    

df['sentence'] = df['sentence'].apply(lemmatize_corenlp, conn_nlp=nlp)

sentences_list = df['sentence'].tolist()

unigram_counts = collections.Counter()
bigram_counts = collections.Counter()
trigram_counts = collections.Counter()

for sentence in sentences_list:
    unigram_counts += collections.Counter(ngrams(sentence, 1))
    bigram_counts += collections.Counter(ngrams(sentence, 2))
    trigram_counts += collections.Counter(ngrams(sentence, 3))
    
print(find_relevant_ngrams(unigram_counts))
print(find_relevant_ngrams(bigram_counts))
print(find_relevant_ngrams(trigram_counts))

# would       False  True    All
# gold_label                    
# 0            8805  4033  12838
# 1            1736  3824   5560
# All         10541  7857  18398
# P-value: 0.0
# 0.0
# say         False  True    All
# gold_label                    
# 0            9268  3570  12838
# 1            4238  1322   5560
# All         13506  4892  18398
# P-value: 1.6627570035001988e-06
# 1.6627570035001988e-06
# I           False  True    All
# gold_label                    
# 0           10827  2011  12838
# 1            4020  1540   5560
# All         14847  3551  18398
# P-value: 8.635364367034171e-77
# 8.635364367034171e-77
# 's          False  True    All
# gold_label                    
# 0           10771  2067  12838
# 1            4776   784   5560
# All         15547  2851  18398
# P-value: 0.018501068274006806
# 0.018501068274006806
# may         False  True    All
# gold_label                    
# 0           10307  2531  12838
# 1            5326   234   5560
# All         15633  2765  18398
# P-value: 8.663263620485288e-157
# 8.663263620485288e-157
# Mr.         False  True    All
# gold_label                    
# 0           11813  1025  12838
# 1            5318   242   5560
# All         17131  1267  18398
# P-value: 1.9246988525183267e-16
# 1.9246988525183267e-16
# even        False  True    All
# gold_label                    
# 0           11193  1645  12838
# 1            5070   490   5560
# All         16263  2135  18398
# P-value: 2.2418756854616065e-12
# 2.2418756854616065e-12
# year        False  True    All
# gold_label                    
# 0           11951   887  12838
# 1            5083   477   5560
# All         17034  1364  18398
# P-value: 0.0033550280330856786
# 0.0033550280330856786
# get         False  True    All
# gold_label                    
# 0           11637  1201  12838
# 1            5194   366   5560
# All         16831  1567  18398
# P-value: 9.86166778181408e-08
# 9.86166778181408e-08
# take        False  True    All
# gold_label                    
# 0           11903   935  12838
# 1            5271   289   5560
# All         17174  1224  18398
# P-value: 1.841654604216465e-05
# 1.841654604216465e-05
# Trump       False  True    All
# gold_label                    
# 0           12062   776  12838
# 1            5351   209   5560
# All         17413   985  18398
# P-value: 4.336188067930147e-08
# 4.336188067930147e-08
# company     False  True    All
# gold_label                    
# 0           12209   629  12838
# 1            5367   193   5560
# All         17576   822  18398
# P-value: 0.0009660975847616988
# 0.0009660975847616988
# see         False  True    All
# gold_label                    
# 0           11914   924  12838
# 1            5246   314   5560
# All         17160  1238  18398
# P-value: 0.005023701689810834
# 0.005023701689810834
# market      False  True    All
# gold_label                    
# 0           12259   579  12838
# 1            5398   162   5560
# All         17657   741  18398
# P-value: 3.847547511983186e-05
# 3.847547511983186e-05
# use         False  True    All
# gold_label                    
# 0           10993  1845  12838
# 1            4915   645   5560
# All         15908  2490  18398
# P-value: 4.082925739593689e-05
# 4.082925739593689e-05
# also        False  True    All
# gold_label                    
# 0           12294   544  12838
# 1            5430   130   5560
# All         17724   674  18398
# P-value: 5.09975221902103e-08
# 5.09975221902103e-08
# wish        False  True    All
# gold_label                    
# 0           12514   324  12838
# 1            5190   370   5560
# All         17704   694  18398
# P-value: 2.2847877379037654e-38
# 2.2847877379037654e-38
# Mr          False  True    All
# gold_label                    
# 0           11379  1459  12838
# 1            5125   435   5560
# All         16504  1894  18398
# P-value: 9.960058411948781e-11
# 9.960058411948781e-11
# need        False  True    All
# gold_label                    
# 0           12284   554  12838
# 1            5457   103   5560
# All         17741   657  18398
# P-value: 5.092773363595326e-14
# 5.092773363595326e-14
# ask         False  True    All
# gold_label                    
# 0           12288   550  12838
# 1            5398   162   5560
# All         17686   712  18398
# P-value: 0.0006022313083456218
# 0.0006022313083456218
# new         False  True    All
# gold_label                    
# 0           12134   704  12838
# 1            5385   175   5560
# All         17519   879  18398
# P-value: 1.8961247354517506e-09
# 1.8961247354517506e-09
# come        False  True    All
# gold_label                    
# 0           11889   949  12838
# 1            5247   313   5560
# All         17136  1262  18398
# P-value: 0.0008348489459534442
# 0.0008348489459534442
# much        False  True    All
# gold_label                    
# 0           12491   347  12838
# 1            5322   238   5560
# All         17813   585  18398
# P-value: 2.5760631078039388e-06
# 2.5760631078039388e-06
# want        False  True    All
# gold_label                    
# 0           12396   442  12838
# 1            5417   143   5560
# All         17813   585  18398
# P-value: 0.048533379611431365
# 0.048533379611431365
# government  False  True    All
# gold_label                    
# 0           12382   456  12838
# 1            5438   122   5560
# All         17820   578  18398
# P-value: 0.00010045416947170458
# 0.00010045416947170458
# patient     False  True    All
# gold_label                    
# 0           12444   394  12838
# 1            5461    99   5560
# All         17905   493  18398
# P-value: 5.787546608867359e-05
# 5.787546608867359e-05
# risk        False  True    All
# gold_label                    
# 0           12430   408  12838
# 1            5483    77   5560
# All         17913   485  18398
# P-value: 7.060107920704806e-10
# 7.060107920704806e-10
# bank        False  True    All
# gold_label                    
# 0           12343   495  12838
# 1            5440   120   5560
# All         17783   615  18398
# P-value: 5.61474057410212e-07
# 5.61474057410212e-07
# change      False  True    All
# gold_label                    
# 0           12362   476  12838
# 1            5443   117   5560
# All         17805   593  18398
# P-value: 1.933495561948113e-06
# 1.933495561948113e-06
# ['would', 'say', 'I', "'s", 'may', 'Mr.', 'even', 'year', 'get', 'take', 'Trump', 'company', 'see', 'market', 'use', 'also', 'wish', 'Mr', 'need', 'ask', 'new', 'come', 'much', 'want', 'government', 'patient', 'risk', 'bank', 'change']
# number would  False  True    All
# gold_label                      
# 0             12836     2  12838
# 1              5549    11   5560
# All           18385    13  18398
# P-value: 0.001101243893872374
# 0.001101243893872374
# ['number would']
# []

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




would       False  True    All
gold_label                    
0            8805  4033  12838
1            1736  3824   5560
All         10541  7857  18398
P-value: 0.0
0.0
say         False  True    All
gold_label                    
0            9268  3570  12838
1            4238  1322   5560
All         13506  4892  18398
P-value: 1.6627570035001988e-06
1.6627570035001988e-06
I           False  True    All
gold_label                    
0           10827  2011  12838
1            4020  1540   5560
All         14847  3551  18398
P-value: 8.635364367034171e-77
8.635364367034171e-77
may         False  True    All
gold_label                    
0           10307  2531  12838
1            5326   234   5560
All         15633  2765  18398
P-value: 8.663263620485288e-157
8.663263620485288e-157
Mr.         False  True    All
gold_label                    
0           11813  1025  12838
1            5318   242   5560
All         17131  1267  18398
P-value: 1.9246988525183267e-16
1.92469885251

I wish I    False  True    All
gold_label                    
0           12837     1  12838
1            5460   100   5560
All         18297   101  18398
P-value: 3.7598872252703154e-48
3.7598872252703154e-48
health care provider  False  True    All
gold_label                              
0                     12807    31  12838
1                      5560     0   5560
All                   18367    31  18398
P-value: 0.009280371823436426
0.009280371823436426
I I would   False  True    All
gold_label                    
0           12838     0  12838
1            5552     8   5560
All         18390     8  18398
P-value: 0.0009940843493853
0.0009940843493853
wish I could  False  True    All
gold_label                      
0             12838     0  12838
1              5535    25   5560
All           18373    25  18398
P-value: 8.391782459656328e-12
8.391782459656328e-12
think I would  False  True    All
gold_label                       
0              12836     2  12838
1           