In [1]:
import pandas as pd
import ast
from collections import Counter

In [2]:
# Widen text columns so full claims stay readable when frames are displayed.
pd.set_option("display.max_colwidth", 100)

In [3]:
import spacy
import en_core_web_sm
from collections import OrderedDict

In [4]:
# Utils 

In [96]:
class BaseTask():
    """Shared spaCy helpers for the feature-extraction task classes.

    Holds one loaded ``en_core_web_sm`` pipeline in ``self.nlp`` and exposes
    part-of-speech and named-entity lookups built on top of it.
    """

    def __init__(self):
        # The pipeline is loaded once here and reused by every method via self.nlp.
        self.nlp = en_core_web_sm.load()

    def get_POS_tags_list(self, text):
        """Return ``(token_text, coarse_pos_tag)`` pairs for every token in ``text``."""
        return [(token.text, token.pos_) for token in self.nlp(text)]

    def get_enitities_list(self, text):
        """Return a dict mapping each named entity found in ``text`` to its label.

        The keys are the entity objects taken from ``doc.ents`` (not plain
        strings); callers that need text should apply ``str()`` to them.
        """
        # Use the instance's own pipeline. A bare ``nlp`` only resolves when some
        # other cell happened to leave a global of that name in the kernel.
        doc = self.nlp(text)
        return {entity: entity.label_ for entity in doc.ents}



In [97]:
class PosCountsTask(BaseTask):
    """Add per-claim part-of-speech count columns to a DataFrame.

    The frame passed in must have a ``claim`` column of text. One column is
    produced for each tag in ``desired_pos`` plus a single ``other_pos_counts``
    column that pools every tag listed in ``grammar_pos``.
    """

    def __init__(self,df):
        self.df = df
        # Tags that each get their own count column.
        self.desired_pos = ["NOUN", "PROPN", "VERB", "ADJ"]
        # Tags that are summed together into "other_pos_counts".
        self.grammar_pos = ["INTJ", "CCONJ", "AUX", "PUNCT","PART","SCONJ","DET","SYM", "NUM"]
        super().__init__()

    def _count_pos(self, text):
        """Parse ``text`` once and return a Counter of POS tag -> token count."""
        return Counter(token.pos_ for token in self.nlp(text))

    def _lookup_count(self, counts, pos_tag):
        """Read the value for ``pos_tag`` out of an already-built POS Counter.

        ``"other_pos_counts"`` resolves to the pooled total of the grammar
        tags when any are present; any other tag (or an absent one) resolves
        to its own count, which is 0 when the tag did not occur.
        """
        other_total = sum(n for pos, n in counts.items() if pos in self.grammar_pos)
        if pos_tag == "other_pos_counts" and other_total:
            return other_total
        # Counter indexing yields 0 for a missing key without inserting it.
        return counts[pos_tag]

    def make_pos_counts(self, text, pos_tag):
        """Return how many tokens of ``text`` carry ``pos_tag``.

        Pass ``"other_pos_counts"`` as ``pos_tag`` to get the combined count of
        all tags in ``self.grammar_pos``. Returns 0 when nothing matches.
        """
        return self._lookup_count(self._count_pos(text), pos_tag)

    def make_counts_fields(self):
        """Write the POS count columns into ``self.df`` and return that frame.

        The frame is modified in place (the returned object is ``self.df``).
        Column names are ``spacy.explain(tag) + "_counts"`` for each desired
        tag, plus ``"other_pos_counts"``.
        """
        df = self.df
        # Parse every claim exactly once; all columns below are read from these
        # cached counters instead of re-running the pipeline per column.
        claim_counts = [self._count_pos(text) for text in df["claim"]]

        for pos in self.desired_pos:
            field_name = spacy.explain(pos) + "_counts"
            df[field_name] = [self._lookup_count(counts, pos) for counts in claim_counts]

        df["other_pos_counts"] = [
            self._lookup_count(counts, "other_pos_counts") for counts in claim_counts
        ]
        return df
        
    

In [140]:
class KeywordsAndEntityTask(BaseTask):
    """Keyword/entity based features for claims and candidate evidence sentences.

    A text's keywords are its named entities plus every remaining token whose
    POS tag is listed in ``key_pos``.
    """

    def __init__(self,df):
        self.df = df
        # POS tags whose tokens count as keywords when not already part of an entity.
        self.key_pos = ["NOUN", "PROPN", "SYM", "NUM","ADJ"]
        super().__init__()

    def extract_keyword_list(self, claim):
        """Return the keywords of ``claim`` as a list of strings.

        Entities come first (as their full text), followed by the key-POS
        tokens of the claim whose text is not one of the entity tokens.
        """
        ents = self.get_enitities_list(claim)
        ents_list = [str(entity) for entity in ents]

        # Re-tokenize the entity texts so multi-word entities are compared
        # token by token. This must go through ``self``: the helper is a method
        # inherited from BaseTask, not a module-level function.
        entity_tokens = {
            word for word, _ in self.get_POS_tags_list(" ".join(ents_list))
        }

        # Key-POS tokens of the claim that are not already covered by an entity.
        other_keywords = [
            word
            for word, tag in self.get_POS_tags_list(claim)
            if tag in self.key_pos and word not in entity_tokens
        ]

        return ents_list + other_keywords

    def caliculate_keywords_len(self):
        """Add a ``keyword_count`` column to ``self.df`` and return that frame.

        The frame is modified in place; the value is the number of keywords
        extracted from each row's ``claim``.
        """
        df = self.df

        df["keyword_count"] = df["claim"].apply(lambda x: len(self.extract_keyword_list(x)))
        return df

    def keywords_similarity(self, claim, candidate_sent):
        """Return the spaCy similarity between the two texts' keywords only.

        Each text is reduced to its space-joined keyword list before being
        parsed and compared.
        """
        # NOTE(review): en_core_web_sm is documented as shipping without static
        # word vectors, so this score may be of limited quality - confirm.
        clm = self.nlp(" ".join(self.extract_keyword_list(claim)))
        evdc = self.nlp(" ".join(self.extract_keyword_list(candidate_sent)))
        return clm.similarity(evdc)

    def common_keywords_count(self, claim, candidate_sent):
        """Return how many distinct keywords the two texts share.

        Keywords are compared after lower-casing and stripping whitespace.
        """
        clm = {item.lower().strip() for item in self.extract_keyword_list(claim)}
        evdc = {item.lower().strip() for item in self.extract_keyword_list(candidate_sent)}
        return len(evdc & clm)
    
        
    

# Reading the training data and adding the support length

In [130]:
import json

In [131]:
# Load the training split; orient='index' makes each top-level JSON key one row of the frame.
train_df = pd.read_json('dev_train_1.json', orient='index')

In [14]:
# Sanity check: (rows, columns) of the loaded frame.
train_df.shape

(5001, 4)

In [16]:
# Eyeball the last ten rows in index order to check the loaded columns.
train_df.sort_index().tail(10)

Unnamed: 0,Support_length,claim,evidence,label
228344,0,Island Records is a music school.,[],NOT ENOUGH INFO
228348,0,Island Records was reviewed by Chris Blackwell.,[],NOT ENOUGH INFO
228349,0,Island Records was founded by an American singer Graeme Goodall.,[],NOT ENOUGH INFO
228431,0,The Wallace (poem) was written by an English person.,[],NOT ENOUGH INFO
228432,1,The Wallace (poem) is historically accurate.,"[[The_Wallace_-LRB-poem-RRB-, 2]]",REFUTES
229305,3,A working animal is incapable of being trained.,"[[Working_animal, 0], [Working_animal, 1], [Working_animal, 21]]",REFUTES
229312,1,A working animal is wild only.,"[[Working_animal, 0]]",REFUTES
229316,0,A working animal is trained to perform life saving tasks.,[],NOT ENOUGH INFO
229317,1,A working animal is anything but an animal.,"[[Working_animal, 0]]",REFUTES
229319,1,A working animal is a living thing.,"[[Working_animal, 0]]",SUPPORTS


In [17]:

# Add the POS count columns. make_counts_fields writes them into the frame it
# was given and returns that same frame, so `df` and `train_df` are one object.
pos_counts = PosCountsTask(df=train_df)
df = pos_counts.make_counts_fields()

In [80]:
# Persist the features computed so far.
# NOTE(review): in file order this cell comes before keyword_count is added
# below (execution counts 80 vs 78), so a top-to-bottom run writes the CSV
# without that column - confirm which is intended.
df.to_csv("initial_features-devset.csv")

In [78]:
# Add the keyword_count column; the method writes into `df` and returns it.
key_counts = KeywordsAndEntityTask(df=df)
df = key_counts.caliculate_keywords_len()


In [135]:
# NOTE(review): this repeats the instantiation from the previous cell, which
# loads the spaCy model again; presumably left over from re-running after the
# class was edited - confirm whether it can be dropped.
key_counts = KeywordsAndEntityTask(df=df)

In [141]:
# Hand-picked claim / evidence-sentence pair used to spot-check the keyword features below.
claim = "Chris Hemsworth appeared in A Perfect Getaway."
evidence = "Hemsworth has also appeared in the science fiction action film Star Trek -LRB- 2009 -RRB- , the thriller adventure A Perfect Getaway -LRB- 2009 -RRB- , the horror comedy The Cabin in the Woods -LRB- 2012 -RRB- , the dark-fantasy action film Snow White and the Huntsman -LRB- 2012 -RRB- , the war film Red Dawn -LRB- 2012 -RRB- , and the biographical sports drama film Rush -LRB- 2013 -RRB- ."

In [142]:
# Spot check: similarity of the sample pair computed from their keywords only.
key_counts.keywords_similarity(claim,evidence)

0.5100020158785087

In [144]:
# Spot check: number of distinct (lower-cased) keywords the sample pair shares.
key_counts.common_keywords_count(claim,evidence)

2

In [145]:
# Spot check: keywords extracted from the sample claim (entities first, then other key-POS tokens).
key_counts.extract_keyword_list(claim)

['Chris Hemsworth', 'Perfect', 'Getaway']

In [147]:
# Spot check: raw (token, POS) pairs spaCy assigns to one of the training claims.
key_counts.get_POS_tags_list("A working animal is incapable of being trained.")

[('A', 'DET'),
 ('working', 'VERB'),
 ('animal', 'NOUN'),
 ('is', 'VERB'),
 ('incapable', 'ADJ'),
 ('of', 'ADP'),
 ('being', 'VERB'),
 ('trained', 'VERB'),
 ('.', 'PUNCT')]