In [1]:
import pandas as pd
import ast
from collections import Counter

In [2]:
# Let long text cells (claims, evidence lists) show up to 100 characters before truncation.
pd.set_option("display.max_colwidth", 100)

In [3]:
import spacy
import en_core_web_sm
from collections import OrderedDict

In [4]:
# Utils 

In [5]:
class PosCountsTask():
    """Add per-claim part-of-speech (POS) count columns to a DataFrame.

    For every tag in ``desired_pos`` one ``<spacy.explain(tag)>_counts`` column
    is produced, plus a single ``other_pos_counts`` column holding the summed
    counts of the tags listed in ``grammar_pos``. Tags that appear in neither
    list are not reported in any column.
    """

    # Column name / lookup key under which all ``grammar_pos`` tags are summed.
    OTHER_KEY = "other_pos_counts"

    def __init__(self,df):
        """Store the frame to annotate and load the spaCy pipeline.

        Args:
            df: DataFrame with a "claim" column of text. It is modified in
                place by ``make_counts_fields``.
        """
        self.df = df
        self.nlp = en_core_web_sm.load()
        self.desired_pos = ["NOUN", "PROPN", "VERB", "ADJ"]
        self.grammar_pos = ["INTJ", "CCONJ", "AUX", "PUNCT","PART","SCONJ","DET","SYM", "NUM"]

    def _count_pos(self, text):
        """Run the pipeline once on ``text`` and tally its POS tags.

        Returns:
            Counter mapping each POS tag found to its number of tokens, with an
            extra ``OTHER_KEY`` entry summing the tags in ``grammar_pos``.
            Being a Counter, lookups of absent tags yield 0.
        """
        counts = Counter(token.pos_ for token in self.nlp(text))
        counts[self.OTHER_KEY] = sum(
            n for tag, n in counts.items() if tag in self.grammar_pos
        )
        return counts

    def make_pos_counts(self, text, pos_tag):
        """Return how many tokens of ``text`` carry ``pos_tag``.

        Args:
            text: Text to run through the spaCy pipeline.
            pos_tag: A POS tag (e.g. "NOUN"), or "other_pos_counts" for the
                combined count of all tags in ``grammar_pos``.

        Returns:
            The count as an int; 0 when the tag does not occur.
        """
        return self._count_pos(text)[pos_tag]

    def make_counts_fields(self):
        """Add the POS count columns to ``self.df`` and return that frame.

        The frame is modified in place; the returned object is ``self.df``.
        """
        df = self.df
        # Parse each claim only once and reuse the tallies for every column;
        # running the pipeline is by far the most expensive step here.
        per_claim = [self._count_pos(claim) for claim in df["claim"]]

        for pos in self.desired_pos:
            field_name = spacy.explain(pos) + "_counts"
            df[field_name] = [counts[pos] for counts in per_claim]

        df[self.OTHER_KEY] = [counts[self.OTHER_KEY] for counts in per_claim]
        return df
        
    

# Reading the train data and adding support length

In [6]:
import json

In [7]:
# Load the training set; with orient='index' each top-level JSON key becomes one row label.
train_df = pd.read_json(
    'train_1.json',
    orient='index',
)

In [8]:
# (rows, columns) of the loaded training frame.
train_df.shape

(145449, 4)

In [9]:
# Preview the ten rows with the smallest index labels.
train_df.sort_index().head(10)

Unnamed: 0,Support_length,claim,evidence,label
3,1,Chris Hemsworth appeared in A Perfect Getaway.,"[[Chris_Hemsworth, 2]]",SUPPORTS
4,0,Chris Hemsworth disappeared in A Perfect Getaway.,[],NOT ENOUGH INFO
7,1,Roald Dahl is a writer.,"[[Roald_Dahl, 0]]",SUPPORTS
8,1,Roald Dahl is a governor.,"[[Roald_Dahl, 0]]",REFUTES
9,1,Ireland has relatively low-lying mountains.,"[[Ireland, 10]]",SUPPORTS
10,1,Ireland does not have relatively low-lying mountains.,"[[Ireland, 10]]",REFUTES
13,0,David Thewis has had many notable performances.,[],NOT ENOUGH INFO
14,2,There have been many notable performances by David Thewlis.,"[[David_Thewlis, 2], [David_Thewlis, 1]]",SUPPORTS
17,1,Edward I of England responded to a second rebellion in 1282.,"[[Edward_I_of_England, 16]]",SUPPORTS
18,0,Edward I of England responded to a second rebellion in 1282 and he was a dictator.,[],NOT ENOUGH INFO


In [10]:
# Take an independent copy of the first 100 training rows so the POS columns
# are added to the sample only and train_df itself stays untouched.
df = train_df.head(100).copy()
pos_counts = PosCountsTask(df=df)
# make_counts_fields adds the count columns and returns the annotated frame.
df = pos_counts.make_counts_fields()

In [11]:
# Display the sample frame together with the newly added POS count columns.
df

Unnamed: 0,Support_length,claim,evidence,label,noun_counts,proper noun_counts,verb_counts,adjective_counts,other_pos_counts
75397,2,Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.,"[[Fox_Broadcasting_Company, 0], [Nikolaj_Coster-Waldau, 7]]",SUPPORTS,0,6,1,0,3
150448,2,Roman Atwood is a content creator.,"[[Roman_Atwood, 1], [Roman_Atwood, 3]]",SUPPORTS,2,2,1,0,2
214861,1,"History of art includes architecture, dance, sculpture, music, painting, poetry literature, thea...","[[History_of_art, 2]]",SUPPORTS,14,0,1,1,11
156709,1,Adrienne Bailon is an accountant.,"[[Adrienne_Bailon, 0]]",REFUTES,1,2,1,0,2
83235,0,System of a Down briefly disbanded in limbo.,[],NOT ENOUGH INFO,3,0,1,1,2
129629,2,Homeland is an American television spy thriller based on the Israeli television series Prisoners...,"[[Prisoners_of_War_-LRB-TV_series-RRB-, 0], [Homeland_-LRB-TV_series-RRB-, 0]]",SUPPORTS,5,3,2,2,3
149579,0,Beautiful reached number two on the Billboard Hot 100 in 2003.,[],NOT ENOUGH INFO,1,2,1,1,5
229289,0,Neal Schon was named in 1954.,[],NOT ENOUGH INFO,0,2,2,0,2
33078,1,The Boston Celtics play their home games at TD Garden.,"[[Boston_Celtics, 3]]",SUPPORTS,3,3,1,1,2
6744,2,The Ten Commandments is an epic film.,"[[The_Ten_Commandments_-LRB-1956_film-RRB-, 20], [The_Ten_Commandments_-LRB-1956_film-RRB-, 0]]",SUPPORTS,1,2,1,1,3
