In [17]:
# evaluating the cause/effect pairs extracted from the pipeline. 
# We need to implement some post-processing on the textbook and squad as well as 
# set up a manual evaluation pipeline
# Filter out questions, figure references , etc from being included in causal extraction
import numpy as np
import pandas as pd
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from collections import Counter
import csv
import codecs
textbook_ce = pd.read_csv('textbook_ce.csv')
squad_ce = pd.read_csv('squad_ce.csv')
#preprocess squad data
squad_ce.rename({'Pattern_id': 'PatternID'}, axis=1, inplace=True)

def add_space_period(txt):
    return txt.replace('.', '. ')

squad_ce['Text'] = squad_ce['Text'].transform(add_space_period)
# df = pd.read_csv('squad_ce.csv')
# patterns = pd.read_csv('CE_extractor--Patterns_Based/CE_extractor/patterns.csv')
patterns = pd.read_csv('patterns.csv')
#pid join PatternID	

def view_df(df):
    for i, row in df.iterrows():
        print(i,"-------------------------")
        print('Text:')
        print(row['Text'])
        print('Cause:')
        print(row['Cause'])
        print('Effect:')
        print(row['Effect'])

def is_question(txt):
    sents = sent_tokenize(txt)
    if len(sents) >1:
        return "?" in sents[1]
    return False

def cue_is_CC_IN(txt, cues):
    # is so is conjunction or subordinate conjunction
    sents = sent_tokenize(txt)
    first_slen = len(word_tokenize(sents[0]))
    seen_cue = False
    if len(sents) ==3:
        word_lst = word_tokenize(txt)
        tags = pos_tag(word_lst)
        for i,tup in enumerate(tags):#('So', 'CC')
            if tup[0] in cues and i>=first_slen:
                if not seen_cue and tup[1] in ['CC', 'IN'] and i>=first_slen:
                    return True
                seen_cue = True
    elif len(sents) ==2:
        word_lst = word_tokenize(txt)
        tags = pos_tag(word_lst)
        for i,tup in enumerate(tags):#('So', 'CC')
            if tup[0] in cues:
                if not seen_cue and tup[1] in ['CC', 'IN']:
                    return True
                seen_cue = True
    return False

def as_at_start_CE(txt, iscause):
    #new as pattern 144: As &C, &R
    sents = sent_tokenize(txt)
    target = ''
    if len(sents) ==3:
        target = sents[1]
    else:
        for sent in sents:
            if "As" in sent or 'as' in sent:
                target = sent 
    if "As" in word_tokenize(target) and ',' in word_tokenize(target):
        ind = target.find(',')
        cause, effect = ' '.join(word_tokenize(target[:ind])[1:]), target[ind+1:].strip() #todo
        return cause if iscause else effect
    return ''

def as_at_start(txt): #(169/475)
    #returns if as at start, filter out cause that are NP
    sents = sent_tokenize(txt)
    target = ''
    if len(sents) ==3:
        target = sents[1]
    else:
        for sent in sents:
            if "As" in sent or 'as' in sent:
                target = sent 
    if "As" in word_tokenize(target) and ',' in word_tokenize(target):
        return True
    return False

def process_df(ce):
    print("original", len(ce))
    df = pd.merge(ce, patterns, left_on="PatternID", right_on="pid")
    # merged 3977

    #-cause figure: 3851, -effect3724
    df = df[~df['Cause'].str.contains("Figure")&~df['Effect'].str.contains("Figure")]
    print("no figure", len(df))
    # merged[merged['maintoken'].str.contains("'so'")]
    #as: pattern 80 has 603 rows
    # merged[merged['pid']==80]

    #When question generated is the same for both cause and effect, 
    #potential indication of poor causal relation extraction
    #188 this is evaluation
    # df = df[~(df['cause_question'] == df['effect_question'])]

    #filter out questions from CE 3887 !!!!! if middle sentence has ? instead
    # df = df[~df['Cause'].str.contains("\?")&~df['Effect'].str.contains("\?")]
    df = df[df['Text'].apply(is_question) == False]
    print("no question", len(df))
    #effect and cause are the same 0
    # df[df['Cause'] == df['Effect']]
    # print(df['pid'].value_counts())
    # df['textbook_freq'] = df.groupby('pid')['pid'].transform('count')
    
    #SO pattern 0 POS filter
    df = df[(df['pid']!=0) | (df['Text'].apply(cue_is_CC_IN, cues=['So', 'so']) == True)] #669-399
    print("process so", len(df))
    #AS pattern 80 POS filter
    df = df[(df['pid']!=80)|(df['Text'].apply(cue_is_CC_IN, cues=['As', 'as']) == True)] #493-475
    print("process as", len(df))
    #Since pattern 79 78 POS filter
    df = df[((df['pid']!=79) & (df['pid']!=78)) | (df['Text'].apply(cue_is_CC_IN, cues=['Since', 'since']) == True)] #
    print("process since", len(df))
    return df

def as_new_df(df, newpid):
    #takes in just pattern 80
    df = df[df['pid']==80]
    df = df[df['Text'].apply(as_at_start) == True] #
    df['Cause'] = df['Text'].apply(as_at_start_CE, iscause=True)
    df['Effect'] = df['Text'].apply(as_at_start_CE, iscause=False)
    df['PatternID'] = newpid
    return df[['PatternID', 'Text','Cause', 'Effect']]

processed_tce = process_df(textbook_ce)
processed_sce = process_df(squad_ce)

as_squad_ce = as_new_df(processed_sce, 144)
print('as new pattern in squad', len(as_squad_ce))
as_textbook_ce = as_new_df(processed_tce, 144)
print('as new pattern in textbook', len(as_textbook_ce))


# pd.set_option('display.max_rows', None)
# new_df.to_csv("new_as_pattern_CE_textbook.csv")
# print(textbook_ce['PatternID'].max())

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emily/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
original 3977
no figure 3724
no question 3621
process so 3378
process as 3360
process since 3359
original 1106
no figure 1106
no question 1106
process so 1060
process as 1056
process since 1054
as new pattern in squad 50
as new pattern in textbook 169


In [248]:
def replace_as_new(ce, newpid):
#     ce[(ce['pid']==80) &(ce['Text'].apply(as_at_start) == True)]['Cause'] = 
#     ce['Cause'] = ce['Text'].apply(as_at_start_CE, iscause=True)
#     ce['Effect'] = ce['Text'].apply(as_at_start_CE, iscause=False)
#     df['PatternID'] = newpid
    for i, row in ce.iterrows():
        if ce['pid'][i]==80 and as_at_start(ce['Text'][i]):
            ce['Cause'][i] = as_at_start_CE(ce['Text'][i], True)
            ce['Effect'][i] = as_at_start_CE(ce['Text'][i], False)
            ce['PatternID'][i] = newpid
    return ce

# print(len(as_squad_ce))
    
# print(squad_ce['PatternID'].value_counts())
# processed_sce['PatternID'].value_counts()
# as_squad_ce.to_csv("new_as_pattern_CE_squad.csv")
final_textbook = replace_as_new(processed_tce, 144)
final_textbook[['PatternID', 'Text', 'Cause', 'Effect', 'cause_question','effect_question']].to_csv("textbook_ce_processed.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [249]:
#view_df(final_textbook[final_textbook['PatternID']==144])
len(final_textbook)
final_squad = replace_as_new(processed_sce, 144)
final_squad[['PatternID', 'Text', 'Cause', 'Effect', 'cause_question','effect_question']].to_csv("squad_ce_processed.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [251]:
#TESTTTT
view_df(squad_ce[squad_ce['PatternID']==68])
# view_df(processed_sce[processed_sce['PatternID']==68])
squad_ce[squad_ce['PatternID']==68]
patterns[patterns['pid'] ==79]
view_df(final_squad[final_squad['PatternID']==144])
t = 'The contemporaneous French philosopher Voltaire referred to Canada disparagingly as nothing more than a few acres of snow .The British , for their part , were happy to take New France , as defence of their North American colonies would no longer be an issue and also because they already had ample places from which to obtain sugar .Spain , which traded Florida to Britain to regain Cuba , also gained Louisiana , including New Orleans , from France in compensation for its losses .'
print(t)

cue_is_CC_IN(add_space_period(t), ['As', 'as'])
print(squad_ce['PatternID'].value_counts())


The contemporaneous French philosopher Voltaire referred to Canada disparagingly as nothing more than a few acres of snow .The British , for their part , were happy to take New France , as defence of their North American colonies would no longer be an issue and also because they already had ample places from which to obtain sugar .Spain , which traded Florida to Britain to regain Cuba , also gained Louisiana , including New Orleans , from France in compensation for its losses .
80     237
15      66
0       62
79      58
68      58
14      53
76      42
44      39
2       39
72      32
81      32
56      32
67      30
1       29
78      29
75      24
48      22
86      19
71      18
29      17
47      16
77      15
55      10
82      10
5       10
95       8
115      7
17       7
135      7
4        6
90       6
16       5
60       5
100      5
94       5
59       4
83       4
98       3
112      3
64       2
88       2
3        2
6        2
7        2
89       2
24       2
113      2


# sample from processed CE

In [41]:
import numpy as np
import pandas as pd
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from collections import Counter
import csv
import codecs
import matplotlib.pyplot as plt

patterns = pd.read_csv('CE_extractor--Patterns_Based/CE_extractor/patterns.csv')
# give manov textbook_ce_processed, squad_ce_processed
# give katie sample 100 (2ppl/data) and sample 50% stratified sampling on typology

textbook_ce = pd.read_csv('textbook_ce_processed.csv')
textbook_ce = textbook_ce[['PatternID', 'Text','Cause', 'Effect']]
textbook_ce = pd.merge(textbook_ce, patterns, left_on="PatternID", right_on="pid")
textbook_ce.groupby(['table', 'line', 'col']).size()
# textbook_ce['PatternID'].value_counts().plot(kind='bar')
# plt.show()

def stratified_sample_df(df, cols, n_samples):
    n = min(n_samples, df[cols].value_counts().min())
    df_ = df.groupby(cols).apply(lambda x: x.sample(n))
    df_.index = df_.index.droplevel(0)
    return df_

# small_sample = stratified_sample_df(textbook_ce, ['table', 'line', 'col'], 100)
# small_sample

# textbook_small_sample = textbook_ce.groupby(['table', 'line', 'col']).apply(lambda x: x.sample(min(len(x), 100)))
# n = textbook_ce.shape[0]
# textbook_small_sample = textbook_ce.groupby(['table', 'line', 'col']).apply(lambda x: x.sample(frac=len(x)/n))
# textbook_small_sample = textbook_ce.sample(n=100, weights=[['table', 'line', 'col']])
N = 100

textbook_small_sample_init = textbook_ce.groupby(['table', 'line', 'col']).apply(lambda x: x.sample(min(len(x), 2)))
print(textbook_small_sample_init.groupby(['table', 'line', 'col']).size())

# textbook_ce_rest = textbook_ce[~textbook_ce.isin(textbook_small_sample_init)].dropna()
textbook_ce_rest = pd.concat([textbook_ce, textbook_small_sample_init, textbook_small_sample_init]).drop_duplicates(keep=False)

textbook_small_sample = textbook_ce_rest.groupby(['table', 'line', 'col']).apply(lambda x: x.sample(int(np.rint((N-len(textbook_small_sample_init))*len(x)/len(textbook_ce_rest))))).sample(frac=1)

textbook_small_sample = textbook_small_sample.append(textbook_small_sample_init, ignore_index=False)

print(textbook_ce.groupby(['table', 'line', 'col']).size())
print(textbook_small_sample.groupby(['table', 'line', 'col']).size())

textbook_small_sample = textbook_small_sample.sample(frac=1)[['Text','Cause', 'Effect']]
textbook_small_sample = textbook_small_sample.reset_index(level='table', drop=True)
textbook_small_sample = textbook_small_sample.reset_index(level='line', drop=True)
textbook_small_sample = textbook_small_sample.reset_index(level='col', drop=True)
print(len(textbook_small_sample))

manav_tce=textbook_small_sample.iloc[0:50]
katie_tce=textbook_small_sample.iloc[25:75]
emily_tce=textbook_small_sample.iloc[50:100]
tony_tce=textbook_small_sample.iloc[pd.np.r_[0:25, 75:100]]

manav_tce.to_csv("CausalQG/CSVs/annotations/manov_tce.csv")
katie_tce.to_csv("CausalQG/CSVs/annotations/katie_tce.csv")
emily_tce.to_csv("CausalQG/CSVs/annotations/emily_tce.csv")
tony_tce.to_csv("CausalQG/CSVs/annotations/tony_tce.csv")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emily/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
table  line  col
1      1     1      2
             2      2
             3      2
       2     1      2
2      1     1      2
3      1     1      2
       2     1      2
       3     1      2
4      1     1      2
             2      2
             3      2
             6      1
       2     1      2
             2      2
dtype: int64
table  line  col
1      1     1      1309
             2        28
             3        94
       2     1         3
2      1     1       470
3      1     1       977
       2     1        57
       3     1        70
4      1     1        99
             2        33
             3        29
             6         1
       2     1        20
             2       169
dtype: int64
table  line  col
1      1     1      31
             2       3
             3       4
 

Defaulting to column but this will raise an ambiguity error in a future version
Defaulting to column but this will raise an ambiguity error in a future version
Defaulting to column but this will raise an ambiguity error in a future version
Defaulting to column but this will raise an ambiguity error in a future version
Defaulting to column but this will raise an ambiguity error in a future version
Defaulting to column but this will raise an ambiguity error in a future version


In [42]:
test = pd.read_csv('CausalQG/CSVs/annotations/manov_tce.csv')
test
# manav_tce.index.names = ['table', 'line', 'col', 'index']
# manav_tce.reset_index(level='table', drop=True)
# manav_tce.reset_index(level='table', drop=True)

Unnamed: 0.1,Unnamed: 0,Text,Cause,Effect
0,2083,Purchase products made from recycled materials...,Reduce pollution,resources are maintained .
1,490,Gravity within such an enormous body squeezes ...,Earths internal pressure grew,its temperature also rose.
2,1907,"In fact, it wont stop moving unless another un...",moving to the other end and pushing in the opp...,"For example , Lauren can stop the rolling skat..."
3,3320,The infected person cannot fight infections an...,The infected person can not fight infections a...,people do not die from HIV .
4,1854,Some echinoderms find other ways of moving. Fo...,attaching to the sides of fish,some sea cucumbers move
5,656,Many parasitic diseases caused by roundworms r...,Contributing factors may include lack of a cle...,"inadequate sanitation measures , crowded livin..."
6,869,"Neptune, shown in Figure 1.1, is the only majo...",Uranus did not always appear exactly where it ...,Scientists predicted the existence of Neptune ...
7,3034,The thick atmosphere causes a strong greenhous...,thick atmosphere causes a strong greenhouse ef...,the hottest planet
8,485,"In the first few moments after the Big Bang, t...",the universe expanded,it became less dense and began to cool.
9,220,(Figure 1.5). Tourism is down in the region as...,beach goers find other ways to spend their time .,Tourism is down in the region
