In [1]:
import glob
import pandas as pd
import udapi
import numpy as np
import matplotlib.pyplot as plt

#### 1. Retrieves HDTB treebank data

In [2]:
def get_data(files):
    """Load CoNLL-U files with UDAPI and flatten them into one list of nodes.

    Args:
        files: iterable of paths to ``.conllu`` files.

    Returns:
        A list holding every node of every document, in the order the files
        are given and the nodes are yielded by ``doc.nodes``.
    """
    all_nodes = []
    for path in files:
        # Each file becomes one udapi Document; its nodes are appended in order.
        all_nodes.extend(udapi.Document(path).nodes)
    return all_nodes

# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable DATA_DIR so the notebook runs on other machines.
# glob returns files in arbitrary order, so sort them to keep node order (and
# therefore DataFrame row order and tie-breaking in later sorts) reproducible.
# The pattern has no "**", so recursive=True had no effect and is dropped.
files = sorted(glob.glob("/NLU22/hindi_treebank/*.conllu"))
if not files:
    # Fail early instead of silently producing empty tables further down.
    raise FileNotFoundError("no .conllu files found in /NLU22/hindi_treebank/")
data = get_data(files)

#### 2. Retrieve nouns occurring with the 'do' and 'be' verbs in Hindi
1. creates a list of all nouns
2. creates a dictionary mapping each noun to its corpus frequency with a particular verb

In [3]:
def verb_noun_compound(dataset):
    """Collect nouns standing in a 'compound' relation with /kar/ and /ho/.

    Args:
        dataset: iterable of nodes exposing ``upos``, ``lemma`` and
            ``children``; children expose ``deprel``, ``xpos`` and ``form``.

    Returns:
        A tuple ``(nk_list, nh_list)``: the forms of all NN children attached
        as 'compound' to a VERB with lemma 'कर' (/kar/) and to a VERB with
        lemma 'हो' (/ho/) respectively. Forms are repeated once per occurrence.
    """
    nouns_by_verb = {'कर': [], 'हो': []}

    # Iterate the argument, not the notebook-level `data` global, so the
    # function works on whatever node list the caller passes in.
    for vnode in dataset:
        if vnode.upos != "VERB" or vnode.lemma not in nouns_by_verb:
            continue
        collected = nouns_by_verb[vnode.lemma]
        for nnode in vnode.children:
            if nnode.deprel == 'compound' and nnode.xpos == 'NN':
                collected.append(nnode.form)

    return nouns_by_verb['कर'], nouns_by_verb['हो']

In [4]:
def get_frequency(noun_list):
    """Count how often each noun appears in ``noun_list``.

    Args:
        noun_list: iterable of hashable items (noun forms).

    Returns:
        A dict mapping each distinct item to its number of occurrences, keyed
        in order of first appearance.
    """
    counts = {}
    for entry in noun_list:
        if entry in counts:
            counts[entry] += 1
        else:
            counts[entry] = 1
    return counts

In [5]:
# Extract the noun occurrences for /kar/ and /ho/, then tally each list.
nk_list, nh_list = verb_noun_compound(data)
nk_dict, nh_dict = map(get_frequency, (nk_list, nh_list))

# Token counts (every occurrence) ...
print('no. of nouns occuring with /kar/:', len(nk_list))
print('no. of nouns occuring with /ho/ :' , len(nh_list))
# ... and type counts (distinct nouns).
print('unique nouns occuring with /kar/:', len(nk_dict))
print('unique nouns occuring with /ho/ :' , len(nh_dict))

no. of nouns occuring with /kar/: 4183
no. of nouns occuring with /ho/ : 792
unique nouns occuring with /kar/: 616
unique nouns occuring with /ho/ : 295


#### 3. Creates a df
1. calculates the C/A ratio
2. sorts the df by C/A ratio in ascending order
3. filters cases where both the causative and anticausative are more than 1

In [6]:
# Build the base table from nk_dict: one row per noun seen with /kar/,
# with the noun in 'Noun' and its count in 'Causative'.
df = pd.DataFrame(
    nk_dict.items(),
    columns=['Noun', 'Causative'],
)
df.head()

Unnamed: 0,Noun,Causative
0,वर्णन,1
1,खोज,5
2,सैर,5
3,प्रवेश,19
4,स्वागत,33


In [7]:
# Compute the causative / anticausative (C/A) corpus-frequency ratio per noun.

# Look up each noun's /ho/ count; nouns never seen with /ho/ get NaN.
df['Anticausative'] = df['Noun'].map(nh_dict)

# Keep only nouns attested with both verbs. The explicit .copy() makes
# df_filtered an independent frame, so adding a column below no longer
# triggers pandas' SettingWithCopyWarning.
df_filtered = df.dropna().copy()

df_filtered['C/A'] = df_filtered['Causative'] / df_filtered['Anticausative']

df_filtered.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Causative,Anticausative,C/A
count,133.0,133.0,133.0
mean,16.676692,3.488722,8.410878
std,26.341737,10.362824,15.832311
min,1.0,1.0,0.166667
25%,2.0,1.0,1.0
50%,6.0,1.0,3.0
75%,20.0,3.0,7.0
max,158.0,117.0,105.0


In [8]:
# Sort by the 'C/A' column in ascending order (NaN last).
# Rebinding the sorted result instead of using inplace=True avoids the
# SettingWithCopyWarning and keeps the cell idempotent on re-run.
df_filtered = df_filtered.sort_values("C/A", axis=0, ascending=True, na_position='last')
df_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Noun,Causative,Anticausative,C/A
255,अहसास,1,6.0,0.166667
549,देरी,1,5.0,0.2
537,प्रारंभ,1,4.0,0.25
508,बहस,1,4.0,0.25
403,शक,1,3.0,0.333333


In [9]:
# Keep only nouns whose causative AND anticausative frequency are both > 1.
# Both conditions are evaluated on df_filtered itself: the original mixed in
# a mask built from the unfiltered `df`, which only worked through implicit
# index alignment. The trailing .copy() gives an independent frame, because
# later cells add columns to df_find.
df_find = df_filtered.loc[
    (df_filtered['Causative'] > 1) & (df_filtered['Anticausative'] > 1)
].copy()

print(df_find.describe())

df_find.head()

        Causative  Anticausative        C/A
count   54.000000      54.000000  54.000000
mean    25.962963       6.777778   5.693458
std     31.786412      15.740426   6.324489
min      2.000000       2.000000   0.400000
25%      6.000000       2.000000   1.602273
50%     13.000000       3.500000   3.416667
75%     28.000000       5.750000   6.954545
max    158.000000     117.000000  31.500000


Unnamed: 0,Noun,Causative,Anticausative,C/A
126,वापसी,2,5.0,0.4
42,हाथ,2,5.0,0.4
278,आरंभ,2,4.0,0.5
493,समाधान,2,3.0,0.666667
318,वृद्धि,6,9.0,0.666667


#### 4. Corpus Analysis
1. finds the frequency of nouns from the filtered df in corpus for all verbs
2. finds the frequency of nouns from the filtered df in corpus for all verbs except 'do', 'give', 'take'
3. finds the frequency of nouns from the filtered df where the subject is agentive

In [10]:
def corpus_freq_nouns(dataset, df):
    """Count, for each noun in ``df.Noun``, its 'compound' NN tokens in the corpus.

    Args:
        dataset: iterable of nodes exposing ``lemma``, ``deprel`` and ``xpos``.
        df: object with a ``Noun`` attribute/column listing the nouns to count
            (assumed to hold distinct values, as produced from dict keys).

    Returns:
        A dict with one key per noun in ``df.Noun`` (in that order) mapped to
        the number of nodes whose lemma equals the noun, whose deprel is
        'compound' and whose xpos is 'NN'. Nouns with no match map to 0, also
        when ``dataset`` is empty.
    """
    # Start every requested noun at 0 so unmatched nouns are still reported.
    corpus_noun_freq = dict.fromkeys(df.Noun, 0)

    # One pass over the corpus instead of re-scanning it once per noun.
    # NOTE(review): nouns are matched against node.lemma here, while the noun
    # lists elsewhere in this notebook are built from node.form - confirm that
    # comparing forms with lemmas is intended.
    for node in dataset:
        if (node.deprel == 'compound' and node.xpos == 'NN'
                and node.lemma in corpus_noun_freq):
            corpus_noun_freq[node.lemma] += 1
    return corpus_noun_freq

In [11]:
def filter_frequent_verbs(dataset):
    """Count 'compound' NN nouns under verbs other than /kar/, /de/, /le/.

    Args:
        dataset: iterable of nodes exposing ``upos``, ``lemma`` and
            ``children``; children expose ``deprel``, ``xpos`` and ``form``.

    Returns:
        A dict mapping each noun form to the number of times it occurs as a
        'compound' NN child of a VERB whose lemma is not 'कर', 'दे' or 'ले'.
    """
    excluded_lemmas = ['कर', 'दे', 'ले']
    tally = {}

    for verb in dataset:
        if verb.upos != "VERB" or verb.lemma in excluded_lemmas:
            continue
        for child in verb.children:
            if child.deprel == 'compound' and child.xpos == 'NN':
                surface = child.form
                tally[surface] = tally.get(surface, 0) + 1

    return tally

In [12]:
def agentive_nouns(dataset, df):
    """Count agentive (ने-marked) subjects co-occurring with each noun.

    For every node whose lemma is one of ``df.Noun``, whose deprel is
    'compound' and whose xpos is 'NN', the descendants of the node's parent
    are inspected; each descendant with deprel 'nsubj' that is immediately
    followed by the form 'ने' adds one to that noun's count.

    Args:
        dataset: iterable of nodes exposing ``lemma``, ``deprel``, ``xpos``
            and ``parent``; ``parent.descendants`` yields nodes exposing
            ``deprel`` and ``next_node``.
        df: object with a ``Noun`` attribute/column listing the nouns of
            interest (assumed to hold distinct values).

    Returns:
        A dict, ordered like ``df.Noun``, mapping each noun that has at least
        one matching compound token to its agentive-subject count (possibly
        0). Nouns with no matching token are absent from the dict.
    """
    wanted = set(df.Noun)
    tallies = {}

    # Single pass over the corpus instead of one full scan per noun.
    for node in dataset:
        if node.deprel != 'compound' or node.xpos != 'NN' or node.lemma not in wanted:
            continue
        # Register the noun even if no agentive subject is found for it.
        tallies.setdefault(node.lemma, 0)
        for arg in node.parent.descendants:
            if arg.deprel != 'nsubj':
                continue
            follower = arg.next_node
            # next_node can be None (no following token); treat that as
            # "not followed by ने" instead of raising AttributeError.
            if follower is not None and follower.form == 'ने':
                tallies[node.lemma] += 1

    # Report in the order of df.Noun, matching the original key order.
    return {noun: tallies[noun] for noun in df.Noun if noun in tallies}

In [13]:
# Run the three corpus analyses defined above.
# corpus_noun_freq: per noun in df_find, the count of 'compound' NN tokens
# in the corpus whose lemma matches the noun.
corpus_noun_freq = corpus_freq_nouns(data,df_find)
# filtered_verb_noun: counts of 'compound' NN noun forms under verbs other
# than 'कर', 'दे', 'ले'. Computed over the whole corpus, not only df_find.
filtered_verb_noun = filter_frequent_verbs(data)
# corpous_agentive_subject: per noun in df_find, the count of 'nsubj'
# descendants followed by 'ने'. The (misspelled) name is used by later
# cells, so it is kept as is.
corpous_agentive_subject = agentive_nouns(data, df_find)
# Report the number of entries (distinct nouns) in each dictionary.
print('corpus frequency for ',len(corpus_noun_freq),' predicative nouns')
print('predicative nouns (filtered for frequent verbs): ', len(filtered_verb_noun))
print('corpus frquency of nouns occuring with agentive subject: ', len(corpous_agentive_subject))

corpus frequency for  54  predicative nouns
predicative nouns (filtered for frequent verbs):  749
corpus frquency of nouns occuring with agentive subject:  53


#### 5. Calculates the ratio of:
1. corpus frequency of agentive nouns / corpus frequency of predicative nouns (for all the verbs) as A/P_corpus
2. corpus frequency of agentive nouns / corpus frequency of predicative nouns (for filtered verbs) as A/P_filter

In [14]:
# Ratio of agentive-subject frequency to the corpus frequency of each
# predicative noun (all verbs), stored in df_find as 'A/P_corpus'.

a_ratio = {}

for item in corpous_agentive_subject:
    try:
        a_ratio[item] = corpous_agentive_subject[item] / corpus_noun_freq[item]
    except ZeroDivisionError:
        # The original wrote to a_ratio[x], but `x` is never defined, so this
        # fallback would itself have raised NameError. Use the loop key.
        a_ratio[item] = 0

# Nouns missing from a_ratio (no agentive count available) map to NaN.
df_find['A/P_corpus'] = df_find['Noun'].map(a_ratio)

df_find.head()

Unnamed: 0,Noun,Causative,Anticausative,C/A,A/P_corpus
126,वापसी,2,5.0,0.4,0.142857
42,हाथ,2,5.0,0.4,0.130435
278,आरंभ,2,4.0,0.5,
493,समाधान,2,3.0,0.666667,0.0
318,वृद्धि,6,9.0,0.666667,0.066667


In [15]:
# Ratio of agentive-subject frequency to the frequency of each predicative
# noun under verbs other than /kar/, /de/, /le/, stored in df_find as
# 'A/P_filter'.

s_ratio = {}

for item in corpous_agentive_subject:
    try:
        # A noun that never occurs with a non-frequent verb has no entry in
        # filtered_verb_noun. Treat it as frequency 0 so it takes the same
        # zero-division fallback instead of raising KeyError.
        s_ratio[item] = corpous_agentive_subject[item] / filtered_verb_noun.get(item, 0)
    except ZeroDivisionError:
        # The original wrote to s_ratio[x], but `x` is never defined, so this
        # fallback would itself have raised NameError. Use the loop key.
        s_ratio[item] = 0

# Nouns missing from s_ratio (no agentive count available) map to NaN.
df_find['A/P_filter'] = df_find['Noun'].map(s_ratio)

df_find.head()

Unnamed: 0,Noun,Causative,Anticausative,C/A,A/P_corpus,A/P_filter
126,वापसी,2,5.0,0.4,0.142857,0.2
42,हाथ,2,5.0,0.4,0.130435,0.166667
278,आरंभ,2,4.0,0.5,,
493,समाधान,2,3.0,0.666667,0.0,0.0
318,वृद्धि,6,9.0,0.666667,0.066667,0.111111


In [16]:
# Sort the final table by C/A ratio in ascending order (NaN last).
# Rebinding the result instead of sorting in place keeps the cell idempotent
# and avoids mutating a frame that earlier cells already displayed.
df_find = df_find.sort_values("C/A", axis=0, ascending=True, na_position='last')
df_find.head()

Unnamed: 0,Noun,Causative,Anticausative,C/A,A/P_corpus,A/P_filter
126,वापसी,2,5.0,0.4,0.142857,0.2
42,हाथ,2,5.0,0.4,0.130435,0.166667
278,आरंभ,2,4.0,0.5,,
493,समाधान,2,3.0,0.666667,0.0,0.0
318,वृद्धि,6,9.0,0.666667,0.066667,0.111111


In [84]:
# df_find.to_csv('project_results.csv', index= True)