In [32]:
import json
import numpy as np
import random
from tqdm import tqdm
import nltk
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
import re
import matplotlib.pyplot as plt
import sklearn
import scipy
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPClassifier

In [33]:
def rand_emot():
    """Return one randomly chosen emoticon string (used to decorate log messages)."""
    # The shrug is a raw string: "\_" is not a valid escape sequence and newer
    # Python versions warn about it. The runtime value is unchanged.
    emoticons = ["(o_o)",":-)",":P",":D","x)","ᓚᘏᗢ","╯°□°）╯︵ ┻━┻",":)",
                 "*<:-)","^_^","(⌐■_■)",r"¯\_(ツ)_/¯", "(T_T)",":o","OwO",
                 "( ͡❛ ͜ʖ ͡❛)","(̶◉͛‿◉̶)","( ≖.≖)","(ㆆ_ㆆ)","ʕ•́ᴥ•̀ʔっ","( ◡́.◡̀)","(^◡^ )"]
    return random.choice(emoticons)

def load_files():
    """Load the training pairs and their ground-truth records from disk.

    Reads two JSON-lines files:

    * ``data/modified/train_truth.jsonl`` - keys ``same`` and ``id`` are used.
    * ``data/modified/train_pair.jsonl`` - keys ``pair``, ``fandoms`` and ``id``
      are used.

    Returns
    -------
    tuple
        ``(text_pairs, labels, fandom, pair_id, true_id)``, five lists.
        ``text_pairs``, ``fandom`` and ``pair_id`` follow the line order of the
        pair file; ``labels`` (``same`` cast to int) and ``true_id`` follow the
        line order of the truth file. No check is made here that the two id
        lists agree - compare ``pair_id`` with ``true_id`` if in doubt.
    """
    text_pairs = []
    labels = []
    fandom = []

    pair_id = []
    true_id = []

    # Load truth JSON. The context manager closes the handle again, which the
    # bare open() used previously never did.
    with open('data/modified/train_truth.jsonl') as truth_file:
        for line in truth_file:
            d = json.loads(line.strip())
            labels.append(int(d['same']))
            true_id.append(d['id'])

    # Load actual fanfic.
    print("loading fanfic...",rand_emot())
    with open('data/modified/train_pair.jsonl') as pair_file:
        for line in tqdm(pair_file):
            d = json.loads(line.strip())
            text_pairs.append(d['pair'])
            fandom.append(d['fandoms'])
            pair_id.append(d['id'])

    print("done loading",rand_emot())

    return text_pairs, labels, fandom, pair_id, true_id

In [34]:
# Read the raw data once; the five results are parallel lists.
(text_pairs, labels, fandom, pair_id, true_id) = load_files()

509it [00:00, 2503.64it/s]

loading fanfic... (o_o)


15780it [00:06, 2532.27it/s]

done loading :o





In [35]:
# Registry of feature blocks. Every slot starts out as None and is filled in
# further down once the corresponding feature has been computed.
features = dict.fromkeys([
    # paired feature vectors
    'character_bigram',
    'skip_bigram',
    'pos_skipgram',
    'pos_bigram',
    # cosine similarity between the two texts of a pair
    'character_bigram_cossim',
    'skip_bigram_cossim',
    'pos_skipgram_cossim',
    'pos_bigram_cossim',
])

In [36]:
# Position of the pair that is excluded from the data set.
# TODO: record how this index was identified (original note: "FIND OUT WHICH INDEX IS WEIRD").
dumb_text = 11550

# Drop the entry from *every* parallel list returned by load_files(), not only
# from text_pairs/labels - otherwise fandom, pair_id and true_id are shifted by
# one relative to the texts and labels from this position onwards.
# NOTE: this cell is not idempotent; re-running it removes one more entry.
for _parallel_list in (text_pairs, fandom, pair_id, true_id):
    _parallel_list.pop(dumb_text)
labels.pop(dumb_text)

1

In [37]:
def create_corpus(text_pairs):
    '''Flatten the text pairs into one list of texts.

    The members of each pair are appended in order, so the texts of pair ``i``
    sit next to each other in the result.'''
    corpus = []
    for pair in text_pairs:
        for position in range(len(pair)):
            corpus.append(pair[position])
    return corpus

In [38]:
# Flat list of texts: the members of each pair are adjacent.
# (The former `labels = labels` self-assignment did nothing and was removed.)
corpus = create_corpus(text_pairs)

## PCA Function

In [39]:
def show_me_pca(vector, labels, is_pairs=False, title=""):
    '''Scatter-plot a two-component projection of the pair features, one series per class.

    Parameters
    ----------
    vector : array-like or scipy sparse matrix
        Feature rows. With ``is_pairs=False`` (the default) rows ``2i`` and
        ``2i+1`` are treated as the two documents of pair ``i`` and are joined
        side by side into one row per pair. With ``is_pairs=True`` the rows are
        used unchanged, i.e. every row must already describe one pair.
    labels : sequence
        One label per pair. Rows labelled 0 and rows labelled 1 are drawn as
        two separate scatter series.
    is_pairs : bool
        Set to True when ``vector`` already holds one row per pair.
    title : str
        Title of the plot.

    Raises
    ------
    ValueError
        If ``is_pairs`` is False and the number of rows is odd.

    Sparse input is reduced with TruncatedSVD, dense input with PCA. The shape
    of the projected data is printed before plotting.'''

    #Convert labels to np.array (might be a list.)
    labels = np.array(labels)

    #Join the two documents of each pair into one row if necessary.
    if not is_pairs:
        if scipy.sparse.issparse(vector):
            rows = vector.tocsr()
            if rows.shape[0] % 2:
                raise ValueError("need an even number of rows to build pairs")
            # Stay sparse: np.hstack cannot join sparse rows, and a Python list
            # of rows would never reach the TruncatedSVD branch below.
            vector = scipy.sparse.hstack([rows[0::2], rows[1::2]], format="csr")
        else:
            rows = np.asarray(vector)
            if rows.ndim == 1:
                # one scalar per document -> one column
                rows = rows.reshape(-1, 1)
            if rows.shape[0] % 2:
                raise ValueError("need an even number of rows to build pairs")
            vector = np.hstack([rows[0::2], rows[1::2]])

    #Get that PCA - Use SVD if vector is sparse.
    if scipy.sparse.issparse(vector):
        pca = TruncatedSVD(n_components=2)
    else:
        pca = PCA(n_components=2)
    pcs = pca.fit_transform(vector)

    #Printing pcs shape - remember the row count is halved when pairs were built here.
    print(pcs.shape)

    #Group PC's into two, according to label indices.
    group1 = pcs[labels==0]
    group2 = pcs[labels==1]

    #Plot both classes.
    plt.scatter(group1[:,0], group1[:,1], s=5)
    plt.scatter(group2[:,0], group2[:,1], s=5)
    plt.title(title)

## Function Word Feature Exploration

In [40]:
#Isolate function words

# Read the function-word list; the file content is split on whitespace.
with open('data/function_words_clean.txt', "r") as word_file:
    raw_function_words = word_file.read()
func_words = raw_function_words.split()

def isolate_fw(corpus, f_words):
    '''Reduce every text to its function words.

    Parameters
    ----------
    corpus : iterable of str
        The texts themselves (not a path to a file).
    f_words : iterable of str
        The function words to keep.

    Returns
    -------
    list of str
        One string per text: the whitespace-separated tokens of the text that
        occur in ``f_words``, in their original order, joined by single spaces.
        A text without any function word yields an empty string.'''
    # Build the lookup once; membership tests on a list are linear in its length.
    lookup = frozenset(f_words)

    fw_in_data = []
    for text in tqdm(corpus):
        kept = [word for word in text.split() if word in lookup]
        fw_in_data.append(" ".join(kept))

    return fw_in_data

In [41]:
#function_words = isolate_fw(corpus, func_words)
#fw_tokens = [nltk.word_tokenize(x) for x in function_words]
#unigram_freqdists = [nltk.FreqDist(x) for x in fw_tokens]
#bigrams = [nltk.bigrams(x) for x in fw_tokens]
#bigram_freqdists = [nltk.FreqDist(x) for x in bigrams]

In [42]:
def get_mat_from_fdists(fdists):
    '''Turn a list of frequency distributions into a dense count matrix.

    Parameters
    ----------
    fdists : list of mappings
        One mapping ``key -> count`` per document (e.g. ``nltk.FreqDist``,
        ``collections.Counter`` or a plain ``dict``).

    Returns
    -------
    numpy.ndarray
        Shape ``(len(fdists), number of distinct keys)``. Row ``i`` is the
        feature vector of ``fdists[i]``; a key missing from a document counts
        as 0. Columns are ordered by first appearance of the key, so the layout
        is reproducible between runs.'''
    # dict.fromkeys de-duplicates while keeping first-seen order (a set would
    # give an arbitrary, run-dependent column order).
    all_keys = list(dict.fromkeys(key for fd in fdists for key in fd))
    column_of = {key: j for j, key in enumerate(all_keys)}
    matrix = np.zeros((len(fdists), len(all_keys)))

    for i, fd in enumerate(tqdm(fdists)):
        # Only visit the keys the document actually has; absent keys stay 0
        # and plain dicts no longer raise KeyError.
        for key, count in fd.items():
            matrix[i, column_of[key]] = count

    return matrix

In [43]:
#v1 = normalize(get_mat_from_fdists(unigram_freqdists))
#v1 = normalize(get_mat_from_fdists(bigram_freqdists))

## Character ngram

In [44]:
def create_character_ngram(n, corpus):
    '''TF-IDF of character n-grams for every entry of ``corpus``.

    Only n-grams of exactly length ``n`` are used and the vocabulary is capped
    at 3000 features. The result is a dense array with one row per entry of
    ``corpus``; joining the rows into pairs is left to the caller.'''
    tfidf = TfidfVectorizer(analyzer='char', ngram_range=(n, n), max_features=3000)
    sparse_weights = tfidf.fit_transform(corpus)
    return sparse_weights.toarray()

In [45]:
char_bigrams = create_character_ngram(2, corpus)

# Rows 2i and 2i+1 are the two texts of pair i. Joining the even and the odd
# rows side by side gives one row per pair in a single vectorised call, without
# building a Python list of per-row concatenations first.
char_bigrams_pair = np.hstack((char_bigrams[0::2], char_bigrams[1::2]))

## Save features function

In [46]:
import pickle
def save_features(feature_dict, filename='features'):
    '''Pickle the feature dictionary to ``data/<filename>.dat``.

    An existing file of the same name is overwritten. A confirmation message
    is printed once the file has been written.

    example:
    >>> my_features = {'freqdist': [1,6,3,5]}
    >>> save_features(my_features)'''

    target_path = 'data/{}.dat'.format(filename)
    with open(target_path, 'wb') as out_file:
        pickle.dump(feature_dict, out_file)
    print("Features saved! :-)")

def load_features(filename='features'):
    '''Read a pickled feature dictionary from ``data/<filename>.dat``.

    The keys of the loaded dictionary are printed, one per line, and the
    dictionary is returned.

    example:
    >>> my_features = load_features()'''

    source_path = 'data/{}.dat'.format(filename)
    # NOTE: unpickling can execute arbitrary code - only load files you created yourself.
    with open(source_path, 'rb') as in_file:
        feats = pickle.load(in_file)

    print("Features available:")
    for feature_name in feats.keys():
        print(feature_name)
    return feats

## Skip-gram

In [47]:
#Stolen from https://stackoverflow.com/questions/31847682/how-to-compute-skipgrams-in-python
from itertools import chain, combinations
import copy
from nltk.util import ngrams

# Tokenise every text of the corpus with NLTK's word tokenizer.
corpus_tokens = list(map(nltk.word_tokenize, corpus))

def pad_sequence(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
    """Optionally surround ``sequence`` with ``n - 1`` copies of ``pad_symbol``.

    Padding is added in front when ``pad_left`` is true and at the end when
    ``pad_right`` is true. With neither flag set the input object is returned
    unchanged; otherwise a lazy ``itertools.chain`` is returned."""
    pieces = []
    if pad_left:
        pieces.append((pad_symbol,) * (n-1))
    pieces.append(sequence)
    if pad_right:
        pieces.append((pad_symbol,) * (n-1))

    if len(pieces) == 1:
        return sequence
    return chain(*pieces)

def skipgrams(sequence, n, k, pad_left=False, pad_right=False, pad_symbol=None):
    """Generate the n-token skip-grams of ``sequence`` that skip at most ``k`` tokens.

    Parameters
    ----------
    sequence : sized iterable
        The tokens; ``len(sequence)`` must work.
    n : int
        Number of tokens per skip-gram.
    k : int
        Maximum skip. For the tail of the sequence (and for sequences shorter
        than ``n + k``) the function recurses with ``k - 1``.
    pad_left, pad_right, pad_symbol
        Passed on to ``pad_sequence``.

    Yields
    ------
    tuple
        One tuple of ``n`` tokens per skip-gram, ordered by first token.

    Raises
    ------
    Exception
        If the sequence (plus padding flags) is shorter than ``k`` or if
        ``n < k``. As this is a generator, the error surfaces on first iteration.
    """
    sequence_length = len(sequence)
    sequence = iter(sequence)
    sequence = pad_sequence(sequence, n, pad_left, pad_right, pad_symbol)

    if sequence_length + pad_left + pad_right < k:
        raise Exception("The length of sentence + padding(s) < skip")

    if n < k:
        raise Exception("Degree of Ngrams (n) needs to be bigger than skip (k)")

    nk = n+k

    # Return point for recursion.
    if nk < 1:
        return
    # If n+k longer than sequence, reduce k by 1 and recur.
    if nk > sequence_length:
        yield from skipgrams(list(sequence), n, k-1)
        # The iterator is exhausted now. Without this return the code below
        # called next() on it, and the resulting StopIteration is turned into
        # a RuntimeError inside a generator (PEP 479).
        return

    # Collect the first n+k-1 tokens as the initial window.
    history = []
    while nk > 1:
        history.append(next(sequence))
        nk -= 1

    # Slide the window: take in the next token, drop the oldest one and pair
    # the dropped token with every (n-1)-combination of the remaining window.
    for item in sequence:
        history.append(item)
        current_token = history.pop(0)
        for idx in combinations(range(len(history)), n-1):
            ng = [current_token]
            for _id in idx:
                ng.append(history[_id])
            yield tuple(ng)

    # The remaining window is shorter than n+k: handle it with a smaller skip.
    yield from skipgrams(history, n, k-1)

In [48]:
def create_skip_gram(input_text, n=2, k=2):
    """TF-IDF matrix built from the skip-grams of every tokenised text.

    Parameters
    ----------
    input_text : list of token lists
        One list of tokens per text.
    n : int
        Tokens per skip-gram; also used as the n-gram size of the vectorizer.
    k : int
        Maximum skip, passed to ``skipgrams``.

    Returns
    -------
    numpy.ndarray
        Dense TF-IDF weights (not raw counts), one row per text and at most
        4000 columns. Rows are NOT paired here; that is done by the caller.

    ``n`` and ``k`` used to be overwritten with 2 inside the function, so the
    arguments had no effect. The defaults keep the old behaviour.
    """
    vectorizer = TfidfVectorizer(max_features=4000, ngram_range=(n, n))

    # Join the skip-grams of a text into one string so the vectorizer can
    # re-extract them as word n-grams.
    # NOTE(review): the vectorizer also sees n-grams that straddle two
    # neighbouring skip-grams in the joined string, and its default
    # tokenisation is applied to the tokens - confirm this is intended.
    skipgram = [
        ' '.join(' '.join(gram) for gram in skipgrams(tokens, n=n, k=k))
        for tokens in tqdm(input_text)
    ]

    fd_mat = vectorizer.fit_transform(skipgram).toarray()
    return fd_mat

# Word skip-gram features, one row per text.
skipgram_bigram = create_skip_gram(corpus_tokens)

# Rows 2i and 2i+1 belong to the same pair: join them into one row per pair.
skipgram_bigram_pair = np.array([
    np.concatenate((skipgram_bigram[first], skipgram_bigram[first + 1]))
    for first in tqdm(range(0, len(skipgram_bigram), 2))
])

100%|████████████████████████████████████████████████████████████████████████████| 31558/31558 [11:06<00:00, 47.35it/s]
100%|█████████████████████████████████████████████████████████████████████████| 15779/15779 [00:01<00:00, 11526.39it/s]


In [49]:
# POS-tag every tokenised text.
pos_tags = list(map(nltk.pos_tag, corpus_tokens))

# Keep only element 1 of every tagged entry (the tag), text by text.
single_pos_tags = [[entry[1] for entry in tagged_text] for tagged_text in pos_tags]

# Skip-gram features over the POS-tag sequences, one row per text.
POS_skipgram = create_skip_gram(input_text=single_pos_tags)

100%|████████████████████████████████████████████████████████████████████████████| 31558/31558 [48:23<00:00, 10.87it/s]


In [50]:
POS_skipgram_pair = np.array([np.concatenate((POS_skipgram[x], POS_skipgram[x+1])) for x in tqdm(range(0,len(POS_skipgram),2))])

# POS bigrams must be computed on the POS-tag sequences. Fitting the vectorizer
# on `corpus` (the raw texts), as before, produced plain word bigrams instead.
# lowercase=False and the whitespace token pattern keep every tag intact,
# including ones with punctuation such as "PRP$" or ".".
pos_tag_documents = [' '.join(tags) for tags in single_pos_tags]
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(2, 2),
                             lowercase=False, token_pattern=r'\S+')
POS_bigram = vectorizer.fit_transform(pos_tag_documents).toarray()
POS_bigram_pair = np.array([np.concatenate((POS_bigram[x], POS_bigram[x+1])) for x in tqdm(range(0,len(POS_bigram),2))])

100%|█████████████████████████████████████████████████████████████████████████| 15779/15779 [00:00<00:00, 19162.31it/s]
100%|█████████████████████████████████████████████████████████████████████████| 15779/15779 [00:01<00:00, 15502.32it/s]


## Cosine similarity 

In [51]:
def cosine_sim(a, b):
    """Cosine similarity of two 1-D vectors.

    Returns ``dot(a, b) / (|a| * |b|)``. If either vector has zero norm the
    similarity is undefined; 0.0 is returned in that case instead of dividing
    by zero and propagating ``nan`` into the feature matrix.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [52]:
def _paired_cosine(matrix):
    """Cosine similarity between rows 2i and 2i+1 of ``matrix``, one value per pair."""
    return np.array([cosine_sim(matrix[x], matrix[x+1]) for x in range(0, matrix.shape[0], 2)])

char_bigrams_cosine = _paired_cosine(char_bigrams)
skip_bigrams_cosine = _paired_cosine(skipgram_bigram)
pos_bigrams_cosine = _paired_cosine(POS_bigram)
pos_skipgram_cosine = _paired_cosine(POS_skipgram)

In [53]:
# Store the computed feature blocks in the registry.
features.update({
    # paired feature vectors
    'character_bigram': char_bigrams_pair,
    'skip_bigram': skipgram_bigram_pair,
    'pos_skipgram': POS_skipgram_pair,
    'pos_bigram': POS_bigram_pair,
    # one cosine similarity per pair
    'character_bigram_cossim': char_bigrams_cosine,
    'skip_bigram_cossim': skip_bigrams_cosine,
    'pos_skipgram_cossim': pos_skipgram_cosine,
    'pos_bigram_cossim': pos_bigrams_cosine,
})

In [60]:
# Persist the n-gram and cosine features computed so far (data/big_feats_0706.dat).
save_features(feature_dict=features, filename="big_feats_0706")

Features saved! :-)


In [56]:
# Sanity check: show the container type of every feature block.
for feature_name, feature_values in features.items():
    print(feature_name, type(feature_values))

character_bigram <class 'numpy.ndarray'>
skip_bigram <class 'numpy.ndarray'>
pos_skipgram <class 'numpy.ndarray'>
pos_bigram <class 'numpy.ndarray'>
character_bigram_cossim <class 'numpy.ndarray'>
skip_bigram_cossim <class 'numpy.ndarray'>
pos_skipgram_cossim <class 'numpy.ndarray'>
pos_bigram_cossim <class 'numpy.ndarray'>


In [None]:
#features = load_features(filename='feats_0706')

## Yasmin Features joining
Features joined in this section: function-word distribution, profanity distribution, character-bigram distribution, unique words, punctuation proportion, average word length, average sentence length, Yule's measure, and LIX.

In [61]:
# Load the precomputed feature matrix and drop the row of the excluded pair
# (same index that was removed from the texts and labels).
# NOTE: allow_pickle=True unpickles the file - only load files you trust.
feat_matrix = np.delete(np.load("feature_matrix.dat", allow_pickle=True), dumb_text, 0)

# Feature name -> column of feat_matrix. Column 2 is not copied.
yasmin_columns = {
    'function_word_dist': 0,
    'profanity_dist': 1,
    'unique_words': 3,
    'punctuation_proportion': 4,
    'avg_word_length': 5,
    'avg_sent_length': 6,
    'yules': 7,
    'lix': 8,
}
for feature_name, column in yasmin_columns.items():
    features[feature_name] = feat_matrix[:, column]

In [65]:
# Persist the full feature set, including the joined columns (data/big_features_yasmin.dat).
save_features(feature_dict=features, filename='big_features_yasmin')

Features saved! :-)


In [64]:
# Quick check of the size of one of the joined columns.
np.shape(features['lix'])

(15779,)