In [1]:
import pandas as pd
import numpy as np
import ast

from collections import Counter, defaultdict 

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn
nltk.download('averaged_perceptron_tagger')


from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

import inflect
p = inflect.engine()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def get_words(df):
    sentences = df.text.to_list()    
    flat_list = []
    for sublist in sentences:
        for item in sublist:
            flat_list.append(item)           
    return flat_list

def get_nouns(words):
    nouns = []
    porter = PorterStemmer()
    for word, tag in words:
        if tag.startswith('NN'):
            if p.singular_noun(word):
                nouns.append(p.singular_noun(word)) #plural nouns to singular nouns
            else:
                nouns.append(word)
            #nouns.append(porter.stem(word)) #convert words to their basic form
    return nouns

def bow(sentence, corpus):
    bag_vector = np.zeros(len(corpus))
    for w in sentence:            
        for i,word in enumerate(corpus):
            if word == w:
                bag_vector[i] += 1
    return pd.Series(bag_vector)

def get_used_noun_frequency(dic_1, dic_2, threshold_significance = 10):
    significant_nouns = {}
    for key, val in dic_1.items():
        significant_nouns[key] = [val, 0]
    
    for key, val in dic_2.items():
        if key in significant_nouns:
            significant_nouns[key] = [significant_nouns[key][0], val]
        else:
            significant_nouns[key] = [0, val]
            
    for key, val in significant_nouns.copy().items():
        if (val[0] < threshold_significance) and (val[1] < threshold_significance):
            del significant_nouns[key]
        
    print(f"number of significant nouns: {len(significant_nouns.keys())}")
    return significant_nouns

def get_imbalanced_nouns(dic, coef = 2.2):
    nouns_of_interest = []
    
    for key, val in dic.items():
        if max(val[0], val[1]) - min(val[0], val[1]) * coef > 0:
            nouns_of_interest.append(key)
    print(f"number of nouns of interest: {len(nouns_of_interest)}")
    return nouns_of_interest

def get_nouns_and_their_synonims(nouns_of_interest, dic_all_nouns, threshold = 0.9):
    dic_of_noun_differences = {}
    
    for noun in nouns_of_interest:

            word_meanings = wn.synsets(noun, 'n')
            for word_meaning in word_meanings:
                for key, val in dic_all_nouns.items():
                    if key != noun:

                            word_meanings_target = wn.synsets(key, 'n')
                            for word_meaning_target in word_meanings_target:
                                if word_meaning.wup_similarity(word_meaning_target) > threshold:
                                    if noun not in dic_of_noun_differences:
                                        dic_of_noun_differences[noun] = set()
                                        dic_of_noun_differences[noun].add(key)
                                    else:
                                        dic_of_noun_differences[noun].add(key)
    return dic_of_noun_differences

def nouns_synonyms_set_of_significance(dic_of_noun_differences, dic_noun_frequency):
    for key, val in dic_of_noun_differences.items():
        #key = word, val = set of synonyms
        for noun in val.copy():
            if (dic_noun_frequency[key][0] >= dic_noun_frequency[key][1] and dic_noun_frequency[noun][0] >= dic_noun_frequency[noun][1]) or (dic_noun_frequency[key][0] <= dic_noun_frequency[key][1] and dic_noun_frequency[noun][0] <= dic_noun_frequency[noun][1]):
                val.remove(noun)
    return dic_of_noun_differences

def print_nouns_stats(dic_of_noun_differences, dic_noun_frequency):
    for key, val in dic_of_noun_differences.items():
        for noun in val:
            print(f"{key}: {dic_noun_frequency[key][0]} - {dic_noun_frequency[key][1]}, {noun}: {dic_noun_frequency[noun][0]} - {dic_noun_frequency[noun][1]}")

In [3]:
def filter_dict(freq_dict, thr):
     return {x : freq_dict[x] for x in freq_dict.keys() if freq_dict[x] >= thr}

def optimal_threshold(freq_dict, plot=False):
    
    opt_thr, n_words = 0, len(freq_dict)
    
    if plot:
        fig = plt.figure(figsize=(15, 10))
        fig.subplots_adjust(hspace=0.75, wspace=0.5)
    
    
    for ix, thr in enumerate(range(2, 11)):
        fil_frequency = filter_dict(freq_dict, thr)
        
        if (n_words - len(fil_frequency)) >= 50:
            n_words = len(fil_frequency)
            opt_thr = thr 
        else:
            n_words = len(fil_frequency)
        
        if plot:
            x = np.arange(len(fil_frequency))
            y = np.array(list(fil_frequency.values()))  
            ax = fig.add_subplot(5, 2, ix+1)
            ax.plot(x, y)
            ax.set_title('Threshold = '+ str(thr))
            ax.set_ylabel('frequency')
            ax.set_xlabel('# of words')
    
    if plot:
        plt.show()
    
    return opt_thr

In [4]:
def lr_model(X_train, y_train):
    pipe = Pipeline([('classifier' , LogisticRegression())])

    param_grid = [
        {'classifier' : [LogisticRegression()],
         'classifier__penalty' : ['l1', 'l2'],
        'classifier__C' : np.logspace(-4, 4, 20),
        'classifier__solver' : ['liblinear']}
    ]

    clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, n_jobs=-1)

    best_clf = clf.fit(X_train, y_train)
    return best_clf

In [5]:
def threshold_similarity(word_meanings_1, word_meanings_2, threshold):
    for meaning_1 in word_meanings_1:
        for meaning_2 in word_meanings_2:
            sim = meaning_1.wup_similarity(meaning_2)
            if sim > threshold:
                return True
    return False

In [6]:
def noun_disparity(noun_1, noun_2, freq_dict_1, freq_dict_2, threshold):
        
    freq_1_1 = freq_dict_1[noun_1]/(freq_dict_1[noun_1] + freq_dict_2[noun_1])
    freq_1_2 = freq_dict_2[noun_1]/(freq_dict_1[noun_1] + freq_dict_2[noun_1])
    
    freq_2_1 = freq_dict_1[noun_2]/(freq_dict_1[noun_2] + freq_dict_2[noun_2])
    freq_2_2 = freq_dict_2[noun_2]/(freq_dict_1[noun_2] + freq_dict_2[noun_2])
    
    if ((freq_1_1 - freq_1_2) > threshold and (freq_2_2 - freq_2_1) > threshold) or ((freq_1_2 - freq_1_1) > threshold and (freq_2_1 - freq_2_2) > threshold):
            return True
    return False
        

In [7]:
def get_similar_pairs(freq_dict_1, freq_dict_2):
    pairs = []

    for noun_1 in freq_dict_1.keys():
        word_meanings_1 = wn.synsets(noun_1, 'n')
        for noun_2 in freq_dict_2.keys():
            if noun_1 != noun_2:
                word_meanings_2 = wn.synsets(noun_2, 'n')
                if threshold_similarity(word_meanings_1, word_meanings_2, 0.95) and noun_disparity(noun_1, noun_2, fil_frequency_nouns_1, fil_frequency_nouns_2, 0.10):
                    if (noun_1, noun_2) not in pairs and (noun_2, noun_1) not in pairs:
                        pairs.append((noun_1, noun_2))
    
    return pairs

In [8]:
topics = { "abortion": ["abortion_pro_choice.csv", "abortion_pro_life.csv"], 
           "gay_marriage":["gay_marriage_for.csv", "gay_marriage_against.csv"],
           "darwin_theory_of_evolution" :["darwin_theory_of_evolution_for.csv", "darwin_theory_of_evolution_against.csv"],
         }

for key, pair in topics.items():

    df_1 = pd.read_csv(f"./dataset_processed/{pair[0]}", converters={2:ast.literal_eval})
    df_2 = pd.read_csv(f"./dataset_processed/{pair[1]}", converters={2:ast.literal_eval})

    tr_1 = df_1.sample(frac=0.8, random_state= 34)
    tr_2 = df_2.sample(frac=0.8, random_state= 34)

    words_1 = get_words(tr_1)
    words_2 = get_words(tr_2)

    tags_1 = nltk.pos_tag(words_1)
    tags_2 = nltk.pos_tag(words_2)

    nouns_1 = get_nouns(tags_1)
    nouns_2 = get_nouns(tags_2)

    frequency_nouns_1 = dict(Counter(nouns_1).most_common())
    frequency_nouns_2 = dict(Counter(nouns_2).most_common()) 

    common_1 = list(frequency_nouns_1.keys())[:20]
    common_2 = list(frequency_nouns_1.keys())[:20]

    common = list(set([w for w in common_1] + [w for w in common_2]))

    tr_1['bow_tokens_common'] = tr_1.apply(lambda t: bow(t.text, common), axis=1).values.tolist()
    tr_1[common] = pd.DataFrame(tr_1.bow_tokens_common.values.tolist(), index= tr_1.index)

    tr_2['bow_tokens_common'] = tr_2.apply(lambda t: bow(t.text, common), axis=1).values.tolist()
    tr_2[common] = pd.DataFrame(tr_2.bow_tokens_common.values.tolist(), index= tr_2.index)

    X_train_1 = np.append(tr_1[common], tr_2[common], axis = 0)
    y_train_1 = np.append(np.zeros(len(tr_1)), np.ones(len(tr_2)))
    
    model_1 = lr_model(X_train_1, y_train_1)
    print(f"Training Accuracy LR {key} - Common Words: {model_1.score(X_train_1, y_train_1):.2f}")
    
    thr_1 = optimal_threshold(frequency_nouns_1)
    thr_2 = optimal_threshold(frequency_nouns_2)
    thr = max(thr_1, thr_2)

    fil_frequency_nouns_1 = filter_dict(frequency_nouns_1, thr)
    fil_frequency_nouns_2 = filter_dict(frequency_nouns_2, thr)
    
    for word in fil_frequency_nouns_1.keys():
        if word not in fil_frequency_nouns_2:
            fil_frequency_nouns_2[word] = 0
        
    for word in fil_frequency_nouns_2.keys():
        if word not in fil_frequency_nouns_1:
            fil_frequency_nouns_1[word] = 0
            
    pairs = get_similar_pairs(fil_frequency_nouns_1, fil_frequency_nouns_2)
    sim_words = list(set([ f1 for f1,f2 in pairs] + [ f2 for f1,f2 in pairs]))
    
    tr_1['bow_tokens_sim'] = tr_1.apply(lambda t: bow(t.text, sim_words), axis=1).values.tolist()
    tr_1[sim_words] = pd.DataFrame(tr_1.bow_tokens_sim.values.tolist(), index= tr_1.index)

    tr_2['bow_tokens_sim'] = tr_2.apply(lambda t: bow(t.text, sim_words), axis=1).values.tolist()
    tr_2[sim_words] = pd.DataFrame(tr_2.bow_tokens_sim.values.tolist(), index= tr_2.index)
    
    X_train_2 = np.append(tr_1[sim_words], tr_2[sim_words], axis = 0)
    y_train_2 = np.append(np.zeros(len(tr_1)), np.ones(len(tr_2)))
    
    model_2 = lr_model(X_train_2, y_train_2)
    print(f"Training Accuracy LR {key} - Similar Words: {model_2.score(X_train_2, y_train_2):.2f}")

Training Accuracy LR abortion - Common Words: 0.59
Training Accuracy LR abortion - Similar Words: 0.58
Training Accuracy LR gay_marriage - Common Words: 0.66
Training Accuracy LR gay_marriage - Similar Words: 0.64
Training Accuracy LR darwin_theory_of_evolution - Common Words: 0.57
Training Accuracy LR darwin_theory_of_evolution - Similar Words: 0.64
