### Import

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
import os
from itertools import chain
import ast
from nltk.stem import PorterStemmer

# WordNet interface; used further down for noun synset lookups and Wu-Palmer similarity.
from nltk.corpus import wordnet as wn
# Presumably the model data behind nltk.pos_tag; the call is a no-op when the package is already installed.
# NOTE(review): wn.synsets presumably also needs the 'wordnet' corpus, which is not downloaded here —
# confirm this cell is sufficient on a fresh environment.
nltk.download('averaged_perceptron_tagger')

import inflect
# Shared inflect engine; get_nouns uses it to turn plural nouns into their singular form.
p = inflect.engine()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\thrdl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Code

In [2]:
def get_words(df):
    """Flatten the per-row token sequences of ``df.text`` into one list.

    Args:
        df: DataFrame whose ``text`` column holds an iterable of tokens per row.

    Returns:
        A single list with every token, in row order and then in-row order.
    """
    return [token for row_tokens in df.text.to_list() for token in row_tokens]

In [3]:
def get_nouns(words):
    """Extract nouns from POS-tagged tokens, normalising plurals to singular.

    Args:
        words: iterable of ``(word, tag)`` pairs, e.g. the output of
            ``nltk.pos_tag``.

    Returns:
        List of nouns (tokens whose tag starts with ``'NN'``) in input order.
        A token is replaced by its singular form whenever the module-level
        inflect engine ``p`` produces one; otherwise it is kept unchanged.
    """
    nouns = []
    for word, tag in words:
        if not tag.startswith('NN'):
            continue
        # singular_noun yields a falsy value when it has no singular form to
        # offer, so fall back to the original token. Call it only once per word.
        singular = p.singular_noun(word)
        nouns.append(singular if singular else word)
    return nouns

In [4]:
def get_frequency(words):
    """Count how many times each item occurs in ``words``.

    Args:
        words: iterable of hashable items.

    Returns:
        Dict mapping each distinct item to its number of occurrences, keyed in
        order of first appearance.
    """
    counts = {}
    for token in words:
        counts[token] = counts.get(token, 0) + 1
    return counts

In [5]:
def get_used_noun_frequency(dic_1, dic_2, threshold_significance = 10):
    """Merge two frequency dicts and keep nouns that are frequent in at least one.

    Args:
        dic_1: noun -> count for the first corpus.
        dic_2: noun -> count for the second corpus.
        threshold_significance: a noun is dropped only when both of its counts
            are below this value.

    Returns:
        Dict mapping noun -> ``[count_in_dic_1, count_in_dic_2]`` (0 when the
        noun is absent from a corpus). Also prints how many nouns survived.
    """
    # Start from the first corpus, then fill in / append the second one.
    merged = {noun: [count, 0] for noun, count in dic_1.items()}
    for noun, count in dic_2.items():
        first_count = merged[noun][0] if noun in merged else 0
        merged[noun] = [first_count, count]

    significant_nouns = {
        noun: counts
        for noun, counts in merged.items()
        if not (counts[0] < threshold_significance and counts[1] < threshold_significance)
    }

    print(f"number of significant nouns: {len(significant_nouns)}")
    return significant_nouns

In [6]:
def get_imbalanced_nouns(dic, coef = 1.8):
    """Select nouns whose two counts differ by more than a factor of ``coef``.

    Args:
        dic: noun -> ``[count_1, count_2]``.
        coef: the larger count must exceed the smaller count times this factor.

    Returns:
        List of qualifying nouns in ``dic`` order. Also prints the list length.
    """
    def is_imbalanced(counts):
        larger = max(counts[0], counts[1])
        smaller = min(counts[0], counts[1])
        return larger - smaller * coef > 0

    nouns_of_interest = [noun for noun, counts in dic.items() if is_imbalanced(counts)]
    print(f"number of nouns of interest: {len(nouns_of_interest)}")
    return nouns_of_interest

In [7]:
def get_nouns_and_their_synonims(nouns_of_interest, dic_all_nouns, threshold = 0.92):
    """Find, for each noun of interest, the other nouns that are near-synonyms.

    Two nouns count as near-synonyms when any pair of their WordNet noun
    synsets has a Wu-Palmer similarity strictly greater than ``threshold``.

    Args:
        nouns_of_interest: iterable of nouns to look up.
        dic_all_nouns: candidate nouns (only the keys are used).
        threshold: minimum Wu-Palmer similarity (exclusive).

    Returns:
        Dict mapping noun -> set of near-synonym nouns. Nouns without any
        match are omitted.
    """
    synset_cache = {}

    def noun_synsets(word):
        # Each word is compared many times; query WordNet for it only once.
        if word not in synset_cache:
            synset_cache[word] = wn.synsets(word, 'n')
        return synset_cache[word]

    def is_close(meanings, target_meanings):
        for meaning in meanings:
            for target_meaning in target_meanings:
                similarity = meaning.wup_similarity(target_meaning)
                # wup_similarity can return None when no path links the two
                # synsets; comparing None with a float would raise TypeError.
                if similarity is not None and similarity > threshold:
                    return True
        return False

    dic_of_noun_differences = {}
    for noun in nouns_of_interest:
        word_meanings = noun_synsets(noun)
        if not word_meanings:
            continue
        for key in dic_all_nouns:
            if key == noun:
                continue
            if is_close(word_meanings, noun_synsets(key)):
                dic_of_noun_differences.setdefault(noun, set()).add(key)
    return dic_of_noun_differences

In [8]:
def nouns_synonyms_set_of_significance(dic_of_noun_differences, dic_noun_frequency):
    """Drop synonyms that lean towards the same corpus as their head noun.

    Args:
        dic_of_noun_differences: noun -> set of synonym nouns; the sets are
            pruned in place.
        dic_noun_frequency: noun -> ``[count_1, count_2]``.

    Returns:
        The same ``dic_of_noun_differences`` object, after pruning.
    """
    def leans_same_way(head, synonym):
        head_first, head_second = dic_noun_frequency[head][0], dic_noun_frequency[head][1]
        syn_first, syn_second = dic_noun_frequency[synonym][0], dic_noun_frequency[synonym][1]
        both_towards_first = head_first >= head_second and syn_first >= syn_second
        both_towards_second = head_first <= head_second and syn_first <= syn_second
        return both_towards_first or both_towards_second

    for head, synonyms in dic_of_noun_differences.items():
        # Collect first, then remove, so the set is not modified while iterating.
        redundant = {synonym for synonym in synonyms if leans_same_way(head, synonym)}
        synonyms.difference_update(redundant)
    return dic_of_noun_differences

In [9]:
def print_nouns_stats(dic_of_noun_differences, dic_noun_frequency):
    """Print one line per (noun, synonym) pair with both nouns' corpus counts.

    Args:
        dic_of_noun_differences: noun -> iterable of synonym nouns.
        dic_noun_frequency: noun -> ``[count_1, count_2]``.
    """
    for head, synonyms in dic_of_noun_differences.items():
        for synonym in synonyms:
            head_counts = dic_noun_frequency[head]
            synonym_counts = dic_noun_frequency[synonym]
            print(f"{head}: {head_counts[0]} - {head_counts[1]}, {synonym}: {synonym_counts[0]} - {synonym_counts[1]}")

In [10]:
# Run the whole noun-comparison pipeline once per pair of opposing-stance datasets.
for pair in [["abortion_pro_choice.csv", "abortion_pro_life.csv"]]:
    # Column index 2 is parsed with ast.literal_eval; presumably it is the `text`
    # column holding a stringified list of tokens — verify against the CSV schema.
    df_1 = pd.read_csv(f"./dataset_processed/{pair[0]}", converters={2:ast.literal_eval})
    df_2 = pd.read_csv(f"./dataset_processed/{pair[1]}", converters={2:ast.literal_eval})
    
    # Flatten each corpus into one token list and show the first few tokens as a sanity check.
    get_words_1 = get_words(df_1)
    get_words_2 = get_words(df_2)
    print(get_words_1[0:5])
    print(get_words_2[0:5])
    
    
    # POS-tag the flattened token stream of each corpus.
    tags_1 = nltk.pos_tag(get_words_1)
    tags_2 = nltk.pos_tag(get_words_2)
    
    # Keep only the nouns (plurals reduced to singular).
    nouns_1 = get_nouns(tags_1)
    nouns_2 = get_nouns(tags_2)
    
    # Per-corpus noun counts.
    frequency_nouns_1 = get_frequency(nouns_1)
    frequency_nouns_2 = get_frequency(nouns_2)
    
    # noun -> [count in corpus 1, count in corpus 2], limited to nouns frequent in at least one corpus.
    significance_nouns = get_used_noun_frequency(frequency_nouns_1, frequency_nouns_2)
    
    # Nouns used markedly more often in one corpus than in the other.
    imbalanced_nouns = get_imbalanced_nouns(significance_nouns)
    
    # For every imbalanced noun, collect the significant nouns that WordNet rates as near-synonyms.
    dic_of_noun_differences = get_nouns_and_their_synonims(imbalanced_nouns, significance_nouns)
    

    # Keep only synonym pairs whose members lean towards opposite corpora.
    dic_of_noun_differences = nouns_synonyms_set_of_significance(dic_of_noun_differences, significance_nouns)
    
    # Output format: "<noun>: <count 1> - <count 2>, <synonym>: <count 1> - <count 2>".
    print_nouns_stats(dic_of_noun_differences, significance_nouns)
    print("########################################")

['the', 'government', 'has', 'no', 'place']
['i', 'choose', 'life', 'reagan', 'once']
number of significant nouns: 254
number of nouns of interest: 83
decision: 109 - 56, result: 10 - 12
theory: 10 - 5, conception: 36 - 49
form: 11 - 24, word: 26 - 20
form: 11 - 24, person: 366 - 335
form: 11 - 24, sort: 11 - 3
form: 11 - 24, state: 58 - 37
form: 11 - 24, body: 122 - 105
form: 11 - 24, course: 33 - 18
form: 11 - 24, type: 12 - 4
form: 11 - 24, kind: 12 - 10
order: 11 - 6, act: 9 - 11
movement: 12 - 6, action: 25 - 31
attempt: 11 - 4, crime: 14 - 21
constitution: 22 - 10, beginning: 2 - 14
course: 33 - 18, form: 11 - 24
course: 33 - 18, action: 25 - 31
organism: 10 - 22, person: 366 - 335
organism: 10 - 22, animal: 31 - 19
organism: 10 - 22, parent: 68 - 45
organism: 10 - 22, individual: 30 - 25
term: 41 - 18, subject: 7 - 12
type: 12 - 4, form: 11 - 24
job: 17 - 5, place: 33 - 34
sort: 11 - 3, form: 11 - 24
being: 6 - 20, person: 366 - 335
being: 6 - 20, animal: 31 - 19
being: 6 - 20, 