In [79]:
import pandas as pd
import utils

In [80]:
# Coordinating conjunctions covered by the analysis
CONJUNCTIONS = [
    'and',
    'or',
    'but',
    'nor',
]

# Category labels covered by the analysis, grouped by family.
# Each family lists its word-level tags first, then its phrase-level labels.
NOUN_CATEGORIES = [
    'NN', 'NNS', 'NNP', 'NNPS',
    'NP', 'NX',
]
VERB_CATEGORIES = [
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'VP',
]
ADJ_CATEGORIES = [
    'JJ', 'JJR', 'JJS',
    'ADJP',
]
ADV_CATEGORIES = [
    'RB', 'RBR', 'RBS',
    'ADVP',
]

# Phrase-level labels only (one per family above)
PHRASAL_CATEGORIES = [
    'NP',
    'VP',
    'ADJP',
    'ADVP',
]

In [81]:
def likes_df(df):
    '''
    Collects the like coordinations found in the given DataFrame,
    i.e. rows whose two conjunct categories belong to the same
    category family (noun, verb, adjective or adverb).

    Rows are returned grouped by family, in the order noun, verb,
    adjective, adverb, and the result is re-indexed from zero.

    Keyword Arguments:
        df -- DataFrame containing coordinations
    Return:
        Dataframe of like coordinations
    '''
    first = df['1st Conjunct Category']
    second = df['2nd Conjunct Category']

    families = (NOUN_CATEGORIES, VERB_CATEGORIES,
                ADJ_CATEGORIES, ADV_CATEGORIES)

    # One sub-frame per family: both conjuncts must fall inside it
    per_family = [df[first.isin(family) & second.isin(family)]
                  for family in families]

    return pd.concat(per_family, axis=0, ignore_index=True)


def unlikes_df(df):
    '''
    Collects the unlike coordinations found in the given DataFrame,
    i.e. rows where both conjuncts carry a phrasal category and the
    two categories differ.

    Keyword Arguments:
        df -- DataFrame containing coordinations
    Return:
        Dataframe of unlike coordinations
    '''
    first = df['1st Conjunct Category']
    second = df['2nd Conjunct Category']

    # Both conjuncts must be phrasal ...
    both_phrasal = first.isin(PHRASAL_CATEGORIES) & second.isin(
        PHRASAL_CATEGORIES)
    # ... and their categories must not match
    mismatched = first != second

    return df.loc[both_phrasal & mismatched]


def filter_conjlength(df, length):
    '''
    Returns a DataFrame of coordinations contained in the given
    DataFrame where each conjunct is at most the given length.

    Conjunct length is the number of whitespace-separated pieces in
    the conjunct text. The 'Sentence Text' column of the result is
    cast to str. The given DataFrame itself is never modified; the
    cast is applied to a new frame.

    Keyword Arguments:
        df -- DataFrame containing coordinations
        length -- integer length to filter coordinations
    Return:
        Dataframe of filtered coordinations
    '''
    # Number of whitespace-separated pieces in each conjunct
    len1 = df['1st Conjunct Text'].str.split().str.len()
    len2 = df['2nd Conjunct Text'].str.split().str.len()

    kept = df.loc[(len1 <= length) & (len2 <= length)]

    # assign() builds a new frame, so the caller's DataFrame is left
    # untouched (the previous in-place column write mutated the input
    # and triggered SettingWithCopyWarning on sliced frames).
    return kept.assign(
        **{'Sentence Text': kept['Sentence Text'].astype('str')})

In [82]:
# Load CSV file with coordination samples
samples = pd.read_csv("csv/samples.csv", index_col=None, header=0)

# Load CSV files with raters' judgments
rater1 = pd.read_csv("csv/raters/rater1.csv", index_col=None, header=0)
rater2 = pd.read_csv("csv/raters/rater2.csv", index_col=None, header=0)
rater3 = pd.read_csv("csv/raters/rater3.csv", index_col=None, header=0)

# Take majority of three raters' judgments: a sample is kept when at
# least two of the three raters marked it correct. (The previous
# `r1 & r2 & r3` required all three raters to agree, which is
# unanimity rather than a majority.)
# NOTE(review): assumes the 'Correct?' columns are boolean and that the
# rater files are row-aligned with samples.csv -- confirm.
r1 = rater1['Correct?']
r2 = rater2['Correct?']
r3 = rater3['Correct?']
samples['Correct? (Majority)'] = (r1 & r2) | (r1 & r3) | (r2 & r3)
correct = samples[samples['Correct? (Majority)']]
correct.to_csv('csv/correct_samples.csv')

In [85]:
# Split the correct samples into like / unlike coordinations and keep
# only those whose conjuncts are both at most length 5
likes = filter_conjlength(likes_df(correct), 5)
unlikes = filter_conjlength(unlikes_df(correct), 5)

unlikes

Unnamed: 0,1st Conjunct Category,1st Conjunct Text,2nd Conjunct Category,2nd Conjunct Text,Conjunction,Sentence Text,Sentence Parse Tree,uid,Correct? (Majority)
8,NP,taxes for rich people,VP,being over - taxed,or,"say something like , you 're not worried about...",(S (VP (VB say) (NP (NP (NN something)) (SBAR ...,8,True
14,NP,families in Cite Soleil,VP,held there,and,"Most kidnapping victims , families in Cite Sol...",(NP (NP (JJS Most) (NN kidnapping) (NNS victim...,14,True
17,NP,lots of it,VP,quickly done,and,"It comes to me , too , that sex , lots of it a...",(S (NP (NP (PRP It))) (VP (VBZ comes) (PP (TO ...,17,True
19,NP,nothing,VP,criticize Bush administration foreign policy,but,"I find the whole argument relatively amusing ,...",(S (S (NP (PRP I)) (VP (VBP find) (S (NP (DT t...,19,True
22,NP,coastline erosion,VP,filter river - borne debris,and,"The interlocking roots of these trees , which ...",(S (NP (NP (DT The) (VBG interlocking) (NNS ro...,22,True
...,...,...,...,...,...,...,...,...,...
2223,ADVP,globally,ADJP,locally invasive,or,"However , greater species richness does not al...","(S (ADVP (RB However)) (, ,) (NP (JJR greater)...",2223,True
2230,ADVP,Not a hundred pages through,ADJP,dead sick of whales,and,Not a hundred pages through and dead sick of w...,(S (UCP (ADVP (NP (QP (RB Not) (DT a) (CD hund...,2230,True
2232,ADVP,First,ADJP,most important,and,"First and most important , no child left behin...",(S (UCP (ADVP (JJ First)) (CC and) (ADJP (RBS ...,2232,True
2241,ADVP,down,ADJP,bare - bones,and,Recruiting trips to New England were stripped ...,(S (NP (NP (NN Recruiting) (NNS trips)) (PP (T...,2241,True
