In [1]:
import json
import numpy as np
import random
from tqdm import tqdm
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re

In [2]:
def rand_emot():
    e = ["(o_o)",":-)",":P",":D","x)","ᓚᘏᗢ","╯°□°）╯︵ ┻━┻",":)",
         "*<:-)","^_^","(⌐■_■)","¯\_(ツ)_/¯", "(T_T)",":o","OwO",
        "( ͡❛ ͜ʖ ͡❛)","(̶◉͛‿◉̶)","( ≖.≖)","(ㆆ_ㆆ)","ʕ•́ᴥ•̀ʔっ","( ◡́.◡̀)","(^◡^ )"]
    return random.choice(e)

def load_files():
    text_pairs = [] #Would be nice to have as np.array
    labels = []
    fandom = []
    
    pair_id = []
    true_id = []
    
    #Load truth JSON
    for line in open('data/modified/train_truth.jsonl'):
        d = json.loads(line.strip())
        labels.append(int(d['same']))
        true_id.append(d['id'])

    #Load actual fanfic.
    print("loading fanfic...",rand_emot())
    for line in tqdm(open('data/modified/train_pair.jsonl')):
        d = json.loads(line.strip())
        text_pairs.append(d['pair'])
        fandom.append(d['fandoms'])
        pair_id.append(d['id'])

    print("done loading",rand_emot())
    
    return text_pairs, labels, fandom, pair_id, true_id

In [3]:
text_pairs, labels, fandom, pair_id, true_id = load_files()

596it [00:00, 2946.74it/s]

loading fanfic... :)


26300it [00:09, 2899.40it/s]

done loading (T_T)





In [4]:
def word_freq(text_pair):
    fdist0 = nltk.FreqDist(text_pair[0])
    fdist1 = nltk.FreqDist(text_pair[1])
    
    return [fdist0, fdist1]

def tokenize(text_pair):
    return [nltk.word_tokenize(text_pair[0]),nltk.word_tokenize(text_pair[1])]

def vector_freq_dist(freq_dists): #I don't think this works...
    return [list(freq_dists[0].values()), list(freq_dists[1].values())]

def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [5]:
def create_corpus(text_pairs):
    '''input all text pairs to create a corpus'''
    corpus = [x[i] for x in text_pairs for i in range(len(x))]
    return corpus

def fit_tfidf(corpus):
    vectorizer = TfidfVectorizer()
    print("training vectorizer...",rand_emot())
    X = vectorizer.fit_transform(corpus)
    print("vectorizer fitted!", rand_emot())
    
    
    df = pd.DataFrame(X[0].T.todense(), index=vectorizer.get_feature_names(), columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False)
    
    return df

In [6]:
corpus = create_corpus(text_pairs)

In [24]:
#tf-idf on the raw text. Likely not useful, as you can see, it is sesnitive to the fandom.
tfidf = fit_tfidf(corpus)

training vectorizer... :D
vectorizer fitted! x)


In [25]:
tfidf.head()

Unnamed: 0,TF-IDF
asgore,0.75609
the,0.319951
traitre,0.286041
and,0.163852
toriel,0.153138


In [8]:
#Attempting to perform tf-idf on only symbols.
def isolate_symbols(corpus):
    #Add \d to omit digits too.
    return re.findall("[^a-zA-Z\s:]", ' '.join(corpus))

symbols = isolate_symbols(corpus)

#Okay, tf-idf doesn't work with symbols. I'll convert them to made-up words

In [9]:
#Whack function that converts symbols to letters...
def symbol2word(symbols):
    '''Takes in set of symbols and convert them to unique words'''
    symbols = list(symbols)
    sym2word = {}
    chars = [0,0,0]
    
    for i in range(len(symbols)):
        chars[2] = add_one(chars[2])
        
        if chars[2] % 58 == 0:
            chars[1] = add_one(chars[1])
            chars[2] = 0
        
        if chars[1] % 58 == 0 and chars[1] != 0:
            chars[0] = add_one(chars[0])
            chars[1] = 0
        
        
        sym2word[symbols[i]] = 'XX'+chr(chars[0]+65)+chr(chars[1]+65)+chr(chars[2]+65)
    
    return sym2word

def add_one(x):
    if x == 91-66:
        return x + 7
    return x + 1

def replace_symbols(symwords, corpus):
    for i, text in tqdm(enumerate(corpus)):
        text = nltk.word_tokenize(text)
        text = replace(text, symwords)
        corpus[i] = text
    return corpus

def replace(tokens, dictionary):
    return [dictionary.get(item, item) for item in tokens]

def chunk_replace(corpus):
    start = 0
    for i in range(0,int(len(corpus)/100),100):
        corpus[start:i] = replace_symbols(symwords, corpus[start:i])
        start = i
    corpus[start:] = replace_symbols(symwords, corpus[start:])
    
    return corpus

In [10]:
#Convert symbols to words
symwords = symbol2word(set(symbols))

#DESTROY corpus by replacing symbols with their word.
corpus = replace_symbols(symwords, corpus)

52600it [29:51, 29.36it/s]


In [None]:
start = 0
for i in range(0,int(len(corpus)/100),100):
    print(start,i)
    start = i
print(start)

In [19]:
print("joining corpus...",rand_emot())
tfidf_sym = fit_tfidf([' '.join(x) for x in corpus])

training vectorizer... ^_^
vectorizer fitted! :P


In [27]:
tfidf_sym.head(10)

Unnamed: 0,TF-IDF
asgore,0.690402
xxadt,0.344774
the,0.292154
traitre,0.261191
xxajc,0.215103
and,0.149617
toriel,0.139833
to,0.137902
of,0.092324
sans,0.0919


In [26]:
key_list = list(symwords.keys())
val_list = list(symwords.values())

key_list[val_list.index('XXADT')], key_list[val_list.index('XXAJC')]

('ψ', ',')