### Imports

In [1]:
import import_ipynb
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
#from baseline_models import get_words, get_nouns, get_noun_pairs, get_used_noun_frequency

### old imports from baseline models
import pandas as pd
import numpy as np
import ast
import random
import re

from collections import Counter, defaultdict 

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn
nltk.download('averaged_perceptron_tagger')


from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import matplotlib.pyplot as plt

import inflect
p = inflect.engine()

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\thrdl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Using TensorFlow backend.


### Code

In [2]:
def prepare_text(df):
    """Convert the token lists in ``df.text`` into lower-cased sentence strings.

    Every entry of ``df.text`` is walked token by token:

    * tokens are lower-cased, and any token containing ``n't`` becomes ``not``;
    * tokens containing a quote character (``"``, ``'`` or a backtick) are dropped;
    * tokens containing one of ``. ? ! ; ,`` close the running sentence, which
      is emitted only if it holds more than one word;
    * tokens containing a non-ASCII character are dropped;
    * whatever is left at the end of an entry is emitted if it is non-empty.

    Args:
        df: object exposing ``df.text.to_list()`` that yields iterables of
            string tokens.

    Returns:
        list[str]: the sentences, words joined by single spaces.
    """
    quote_re = re.compile(r'[\"\'`]')
    boundary_re = re.compile(r'[\.\?\!;,]')

    sentences = []
    for tokens in df.text.to_list():
        current = []
        for token in tokens:
            token = token.lower()
            if "n't" in token:
                token = "not"

            # Quote-bearing tokens are ignored outright.
            if quote_re.search(token):
                continue

            # Punctuation ends the running sentence; one-word fragments are
            # discarded here (but not at the end of the entry, see below).
            if boundary_re.search(token):
                if len(current) > 1:
                    sentences.append(" ".join(current))
                current = []
                continue

            if all(ord(ch) <= 127 for ch in token):
                current.append(token)

        # Trailing words without closing punctuation are kept, even a single one.
        if current:
            sentences.append(" ".join(current))

    return sentences

def tensor_label_array(embed,arrays_of_sentences_1, arrays_of_sentences_2):
    """Pair each embedding with a 0/1 label derived from sentence membership.

    ``embed`` and ``arrays_of_sentences_1`` are walked in lockstep (stopping at
    the shorter of the two). An embedding gets label ``1`` when its sentence
    also occurs in ``arrays_of_sentences_2`` and ``0`` otherwise.

    Returns:
        list: ``[embedding, label]`` two-element lists, in input order.
    """
    return [
        [vector, 1 if text in arrays_of_sentences_2 else 0]
        for vector, text in zip(embed, arrays_of_sentences_1)
    ]

def get_words(df):
    """Flatten the per-row token lists of ``df.text`` into a single list.

    Args:
        df: object exposing ``df.text.to_list()`` that yields iterables.

    Returns:
        list: every item of every entry, in original order.
    """
    return [token for tokens in df.text.to_list() for token in tokens]

def get_noun_pairs(words):
    """Split POS-tagged tokens into standalone nouns and two-word noun phrases.

    A token qualifies when it is at least three characters long, its tag
    starts with ``NN`` and it is pure ASCII (a token containing ``n't`` is
    first replaced by ``not``). A qualifying token whose predecessor is tagged
    ``NN*``, ``JJ*`` or ``PRP*`` is emitted as the phrase
    ``"<previous word> <word>"``; otherwise it is emitted as a single noun.

    Args:
        words: sequence of ``(word, tag)`` pairs, e.g. the output of
            ``nltk.pos_tag``. Must support indexing.

    Returns:
        tuple[list[str], list[str]]: ``(nouns, noun_pairs)`` in input order.
    """
    nouns = []
    noun_pairs = []
    for i, (word, tag) in enumerate(words):
        if "n't" in word:
            word = "not"

        if len(word) < 3 or not tag.startswith('NN'):
            continue
        if any(ord(char) > 127 for char in word):
            continue

        # The first token has no predecessor, so it can only be a lone noun.
        prev_tag = words[i-1][1] if i > 0 else ""
        if prev_tag.startswith(('NN', 'JJ', 'PRP')):
            # Non-ASCII characters are stripped from the preceding word rather
            # than rejecting the pair; the result may therefore be shortened
            # (or even empty) on the left-hand side.
            prev_word = "".join(ch for ch in words[i-1][0] if ord(ch) < 128)
            noun_pairs.append(prev_word+" "+word)
        else:
            nouns.append(word)
    return nouns, noun_pairs

def evaluate(tensor_label, df, label, hub_layer=None):
    """Count the rows of ``df`` whose nearest-neighbour vote matches ``label``.

    For every row, the row's own tokens are POS-tagged, noun phrases (or, when
    there are none, single nouns) are extracted with ``get_noun_pairs`` and
    embedded with ELMo. Each phrase votes with the label of the reference
    embedding that has the highest cosine similarity to it: ``-1`` for label
    ``0`` and ``+1`` for any other label. A row counts as correct when
    ``label == 0`` and the vote sum is negative, or ``label == 1`` and the vote
    sum is non-negative (so ties and rows without nouns fall to label ``1``).

    Args:
        tensor_label: iterable of ``[embedding, label]`` pairs, as built by
            ``tensor_label_array``. Embeddings must have the same width as
            the sliced ELMo vectors (300).
        df: DataFrame with a ``text`` column; assumes each cell is a sequence
            of token strings, as the other helpers in this notebook do.
        label: expected class of every row in ``df`` (``0`` or ``1``).
        hub_layer: optional already-loaded embedding layer. When omitted the
            ELMo model is loaded from TF-Hub, as before.

    Returns:
        int: number of correctly classified rows (a count, not a ratio).
    """
    if hub_layer is None:
        model = "https://tfhub.dev/google/elmo/3"
        hub_layer = hub.KerasLayer(model, output_key="default",
                               trainable=False)

    # Normalise the reference embeddings once so that each cosine similarity
    # is a plain dot product. The previous implementation used a stateful
    # Keras metric that was never reset, so its result was a running mean over
    # all earlier pairs rather than the similarity of the current pair.
    eps = 1e-12
    references = list(tensor_label)
    ref_labels = [entry[1] for entry in references]
    ref_unit = None
    if references:
        ref_vectors = np.stack([np.asarray(entry[0]) for entry in references])
        ref_norms = np.linalg.norm(ref_vectors, axis=1, keepdims=True)
        ref_unit = ref_vectors / np.maximum(ref_norms, eps)

    accuracy = 0
    # Iterate over the rows of the frame that was passed in; the old loop
    # ignored its row and re-read a global validation frame every time.
    for ctn, tokens in enumerate(df.text):
        print(f"word: {ctn}")

        tags = nltk.pos_tag(list(tokens))
        nouns, pairs = get_noun_pairs(tags)
        phrases = pairs if len(pairs) > 0 else nouns

        vote = 0
        # Rows without any usable noun cast no votes (vote stays 0).
        if phrases and ref_unit is not None:
            mixture = np.array(phrases, dtype=bytes)
            embed_all = hub_layer(mixture)
            embed_all = tf.slice(embed_all, [0, 0], [len(phrases), 300])

            queries = np.asarray(embed_all)
            query_norms = np.linalg.norm(queries, axis=1, keepdims=True)
            query_unit = queries / np.maximum(query_norms, eps)

            # argmax returns the first maximum, matching a strict ">" scan.
            nearest = (query_unit @ ref_unit.T).argmax(axis=1)
            for idx in nearest:
                if ref_labels[int(idx)] == 0:
                    vote -= 1
                else:
                    vote += 1
        elif phrases:
            # No reference embeddings at all: every phrase defaults to label 0.
            vote -= len(phrases)

        if (label == 0 and vote < 0) or (label == 1 and vote >= 0):
            accuracy += 1
    return accuracy


In [3]:
# Topic name -> [csv of stance labelled 0, csv of stance labelled 1].
topics = { "abortion": ["abortion_pro_choice.csv", "abortion_pro_life.csv"], 
           #"gay_marriage":["gay_marriage_for.csv", "gay_marriage_against.csv"],
           #"darwin_theory_of_evolution" :["darwin_theory_of_evolution_for.csv", "darwin_theory_of_evolution_against.csv"],
          #"marijuana_legalization" :["marijuana_legalization_against.csv", "marijuana_legalization_for.csv"],
         }

EMBED_DIM = 300      # leading ELMo components kept per sentence
FIRST_BATCH = 100    # size of the initial embedding batch
BATCH_SIZE = 500     # size of every following embedding batch
HELD_BACK = 2500     # trailing sentences that are never embedded
                     # NOTE(review): presumably a runtime/memory cap - confirm
SHUFFLE_SEED = 42    # fixed seed so the sentence order is reproducible


for key, pair in topics.items():

    df_1 = pd.read_csv(f"./dataset_processed/{pair[0]}", converters={2:ast.literal_eval})
    df_2 = pd.read_csv(f"./dataset_processed/{pair[1]}", converters={2:ast.literal_eval})
    
    # 60 / 20 / 20 train / validation / test split per stance.
    tr_1, ts_1 = train_test_split(df_1, test_size=0.2, random_state=42)
    tr_2, ts_2 = train_test_split(df_2, test_size=0.2, random_state=42)
    
    tr_1, val_1 = train_test_split(tr_1, test_size=0.25, random_state=25)
    tr_2, val_2 = train_test_split(tr_2, test_size=0.25, random_state=25)
    
    arrays_of_sentences_1 = prepare_text(tr_1)
    arrays_of_sentences_2 = prepare_text(tr_2)
    
    # From here on arrays_of_sentences_1 holds the sentences of BOTH stances;
    # membership in arrays_of_sentences_2 is what later yields label 1.
    arrays_of_sentences_1.extend(arrays_of_sentences_2)
    random.Random(SHUFFLE_SEED).shuffle(arrays_of_sentences_1)
    
    mixture = np.array(arrays_of_sentences_1, dtype=bytes)
    
    print(len(mixture))
    
    model = "https://tfhub.dev/google/elmo/3"
    hub_layer = hub.KerasLayer(model, output_key="default",
                           trainable=False)
    
    # Embed in batches and concatenate once at the end. A size of -1 keeps
    # every row of a batch, so short inputs no longer break the slice.
    batches = [tf.slice(hub_layer(mixture[:FIRST_BATCH]), [0, 0], [-1, EMBED_DIM])]
    
    for i in range(FIRST_BATCH, len(mixture)-HELD_BACK, BATCH_SIZE):
        print(i)
        embed = hub_layer(mixture[i:i+BATCH_SIZE])
        batches.append(tf.slice(embed, [0, 0], [-1, EMBED_DIM]))
    embed_all = tf.concat(batches, 0)
    print("executed")

    # Use ALL embeddings, not just the last batch: embed_all[k] belongs to
    # arrays_of_sentences_1[k], and zip() inside tensor_label_array stops at
    # the last embedded sentence.
    tensor_label = tensor_label_array(embed_all, arrays_of_sentences_1, arrays_of_sentences_2)
    
    acc_1 = evaluate(tensor_label, val_1, 0)
    acc_2 = evaluate(tensor_label, val_2, 1)
    
    # evaluate() returns counts of correct rows, so this ratio is an accuracy.
    print(f"validation accuracy: {(acc_1+acc_2)/(len(val_1.index)+len(val_2.index))}")

6986
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


100
600
1100
1600
2100
2600
3100
3600
4100
executed
word: 0
word: 1
word: 2
word: 3
word: 4
word: 5
word: 6
word: 7
word: 8
word: 9
word: 10
word: 11
word: 12
word: 13
word: 14
word: 15
word: 16
word: 17
word: 18
word: 19
word: 20
word: 21
word: 22
word: 23
word: 24
word: 25
word: 26
word: 27
word: 28
word: 29
word: 30
word: 31
word: 32
word: 33
word: 34
word: 35
word: 36
word: 37
word: 38
word: 39
word: 40
word: 41
word: 42
word: 43
word: 44
word: 45
word: 46
word: 47
word: 48
word: 49
word: 50
word: 51
word: 52
word: 53
word: 54
word: 55
word: 56
word: 57
word: 58
word: 59
word: 60
word: 61
word: 62
word: 63
word: 64
word: 65
word: 66
word: 67
word: 68
word: 69
word: 70
word: 71
word: 72
word: 73
word: 74
word: 75
word: 76
word: 77
word: 78
word: 79
word: 80
word: 81
word: 82
word: 83
word: 84
word: 85
word: 86
word: 87
word: 88
word: 89
word: 90
word: 91
word: 92
word: 93
word: 94
word: 95
word: 96
word: 97
word: 98
word: 99
word: 100
word: 101
word: 102
word: 103
word: 104
word: 10