In [1]:
import numpy as np
import json
import re
import numpy as np
import random
from tqdm import tqdm
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [2]:
def rand_emot():
    """Return one emoticon picked at random from a fixed list.

    Purely decorative: the result is appended to progress messages. The pick
    uses the global ``random`` state, so seed ``random`` if reproducible
    output matters.
    """
    emoticons = [
        "(o_o)", ":-)", ":P", ":D", "x)", "ᓚᘏᗢ", "╯°□°）╯︵ ┻━┻", ":)",
        "*<:-)", "^_^", "(⌐■_■)",
        # The backslash is escaped explicitly: a bare "\_" is an invalid escape
        # sequence (SyntaxWarning on Python 3.12+). The resulting string is unchanged.
        "¯\\_(ツ)_/¯", "(T_T)", ":o", "OwO",
        "( ͡❛ ͜ʖ ͡❛)", "(̶◉͛‿◉̶)", "( ≖.≖)", "(ㆆ_ㆆ)", "ʕ•́ᴥ•̀ʔっ", "( ◡́.◡̀)", "(^◡^ )",
    ]
    return random.choice(emoticons)

In [3]:
# Read the raw function-word list; each line may carry digits or other
# non-letter characters alongside the word itself.
with open("../data/function_words.txt", "r") as f:
    fw = f.readlines()

func_words = []  # placeholder; the cleaned list is loaded from disk in the next cell

# The context manager guarantees the output file is flushed and closed even if
# the loop raises part-way through.
with open("../data/function_words_clean.txt", "w") as clean_fw:
    for line in fw:
        # Keep ASCII letters only: drops digits, whitespace, punctuation and the newline.
        result = re.sub(r'[^A-Za-z]', '', line)
        if result:  # a line without any letters would otherwise produce an empty line
            clean_fw.write(result + "\n")  # one word per line

In [4]:
# Path to the JSON-lines file with the training pairs.
data = '../data/modified/train_pair.jsonl'

# Load the cleaned function words (whitespace-separated) into a list.
# The file handle gets its own name so it does not overwrite `fw`, the list of
# raw lines read in the previous cell.
with open('../data/function_words_clean.txt', "r") as fw_file:
    func_words = fw_file.read().split()

def isolate_fw(data, f_words):
    """Extract the function words found in each record of a JSON-lines file.

    Parameters
    ----------
    data : str
        Path to a JSON-lines file. Every non-blank line must be a JSON object
        with a "pair" key; the text is taken from index 0 of that value.
    f_words : iterable of str
        Function words to keep. Matching is exact (case-sensitive) against the
        whitespace-separated tokens of the text.

    Returns
    -------
    list of list of str
        One single-element list per record. The element is the matched
        function words, in order of occurrence, joined by single spaces
        (an empty string when nothing matched).
    """
    # Set lookup is O(1) per token; membership in the original list was O(len(f_words)).
    fw_lookup = set(f_words)
    fw_in_data = []

    # `with` closes the file even when a line fails to parse.
    with open(data) as handle:
        for line in tqdm(handle):
            line = line.strip()
            if not line:
                # Blank lines carry no record; json.loads("") would raise.
                continue
            d = json.loads(line)
            text = d.get("pair")
            # NOTE(review): only text[0] is processed, so any further text stored
            # under "pair" is ignored - confirm this is intended.
            words = text[0].split()
            matched = [word for word in words if word in fw_lookup]

            # Stored as a one-element list so the result is a list of lists.
            fw_in_data.append([" ".join(matched)])

    return fw_in_data

In [12]:
# Display the loaded function-word list for a visual sanity check.
func_words

['i',
 'we',
 'you',
 'he',
 'she',
 'it',
 'they',
 'me',
 'us',
 'him',
 'her',
 'them',
 'myself',
 'ourselves',
 'yourself',
 'yourselves',
 'herself',
 'himself',
 'itself',
 'themselves',
 'someone',
 'anyone',
 'noone',
 'everyone',
 'nobody',
 'something',
 'anything',
 'nothing',
 'everything',
 'whoever',
 'whatever',
 'others',
 'mine',
 'ours',
 'yours',
 'hers',
 'theirs',
 'my',
 'our',
 'your',
 'his',
 'its',
 'their',
 'one',
 'first',
 'second',
 'third',
 'once',
 'this',
 'these',
 'that',
 'those',
 'a',
 'an',
 'the',
 'all',
 'alone',
 'another',
 'any',
 'both',
 'each',
 'either',
 'enough',
 'every',
 'few',
 'former',
 'latter',
 'last',
 'least',
 'less',
 'lot',
 'lots',
 'many',
 'more',
 'most',
 'much',
 'neither',
 'next',
 'none',
 'only',
 'other',
 'several',
 'same',
 'some',
 'such',
 'top',
 'whole',
 'and',
 'but',
 'or',
 'nor',
 'although',
 'as',
 'because',
 'if',
 'while',
 'however',
 'whenever',
 'wherever',
 'whether',
 'whyever',
 'there

In [5]:
# Extract the function words from every record of the training file
# (result: one single-element list per record).
fw_in_data = isolate_fw(data,func_words)

1578it [00:29, 53.71it/s]


In [6]:
def create_corpus(text_pairs):
    '''Flatten a list of text groups into a single flat list.

    text_pairs is a list of lists (one inner list per pair); the result holds
    every inner element as its own entry, in the original order.
    '''
    corpus = []
    for pair in text_pairs:
        for position in range(len(pair)):
            corpus.append(pair[position])
    return corpus

def fit_tfidf(corpus):
    '''Fit a TF-IDF vectorizer on corpus and return the weights of its FIRST document.

    corpus is an iterable of strings (one document each). The vectorizer is
    fitted on all documents, but the returned DataFrame only contains row 0 of
    the TF-IDF matrix: one row per vocabulary term, a single "TF-IDF" column,
    sorted from highest to lowest weight. The fitted vectorizer and the full
    matrix are not returned.

    NOTE(review): TfidfVectorizer() is used with its defaults, whose token
    pattern only keeps tokens of two or more characters, so single-letter
    function words such as "i" and "a" never reach the vocabulary - confirm
    this is intended.
    '''
    vectorizer = TfidfVectorizer()
    print("training vectorizer...",rand_emot())
    X = vectorizer.fit_transform(corpus)
    print("vectorizer fitted!", rand_emot())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # use the replacement when available and fall back on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # toarray() yields a plain ndarray (todense() returns the legacy np.matrix);
    # the transpose turns the 1 x n_terms row into an n_terms x 1 column.
    first_doc_weights = X[0].T.toarray()

    df = pd.DataFrame(first_doc_weights, index=terms, columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False)

    return df

In [7]:
# Flatten the list of lists into one flat list of function-word strings.
fw_corpus = create_corpus(fw_in_data)

In [8]:
# Fit TF-IDF on the whole corpus; the returned frame holds the term weights
# of the first document only, sorted in descending order.
fw_tfidf = fit_tfidf(fw_corpus)

training vectorizer... (^◡^ )
vectorizer fitted! ¯\_(ツ)_/¯


In [9]:
# Show the highest-weighted terms (the frame is already sorted by TF-IDF, descending).
fw_tfidf.head(15)

Unnamed: 0,TF-IDF
the,0.518703
to,0.456768
was,0.278706
and,0.247739
of,0.224513
in,0.205159
her,0.182784
that,0.166449
for,0.147095
had,0.135912


In [10]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, TruncatedSVD
import scipy

def show_me_pca(vector, labels, is_pairs=False):
    '''Scatter-plot the first two principal components, coloured by class label.

    vector: the document vectors - a list/array with one vector per row, or a
        scipy sparse matrix. With is_pairs=False (default) consecutive rows
        (0 and 1, 2 and 3, ...) are concatenated into one vector per pair, so
        the number of rows must be even. With is_pairs=True the rows are used
        as they are (e.g. one distance vector already describing both documents).
    labels: one label per pair (0 or 1), used to split the points into two
        colour groups. With is_pairs=False this is half as long as vector.
    is_pairs: set to True when vector already holds one row per pair.

    Dense input is reduced with PCA, sparse input with TruncatedSVD.
    Prints the shape of the projected data, draws on the current matplotlib
    figure and returns None.

    Raises ValueError if is_pairs is False and the number of rows is odd.
    '''
    # Imported here so the submodule is certainly loaded: a bare `import scipy`
    # does not guarantee that `scipy.sparse` is available on older SciPy releases.
    from scipy import sparse

    # Convert labels to np.array (might be a list) so boolean masks work below.
    labels = np.array(labels)

    # Join consecutive rows into one long vector per pair if necessary.
    if not is_pairs:
        if sparse.issparse(vector):
            # Sparse rows cannot go through np.hstack; pair them with sparse.hstack
            # and keep the result sparse so TruncatedSVD is selected below.
            vector = vector.tocsr()
            if vector.shape[0] % 2:
                raise ValueError("need an even number of rows to build pairs, got %d" % vector.shape[0])
            vector = sparse.hstack([vector[0::2], vector[1::2]]).tocsr()
        else:
            if len(vector) % 2:
                raise ValueError("need an even number of rows to build pairs, got %d" % len(vector))
            vector = [np.hstack([vector[x],vector[x+1]]) for x in range(0,len(vector),2)]

    # PCA needs dense input; use truncated SVD when the data is sparse.
    if sparse.issparse(vector):
        pca = TruncatedSVD(n_components=2)
    else:
        pca = PCA(n_components=2)
    pcs = pca.fit_transform(vector)

    # Print the projected shape - the row count is halved when pairing was applied.
    print(pcs.shape)

    # Split the projected points into two groups according to their label.
    group1 = pcs[labels==0]
    group2 = pcs[labels==1]

    # Plot both groups; the legend tells the two classes apart.
    plt.scatter(group1[:,0], group1[:,1], s=5, label="label 0")
    plt.scatter(group2[:,0], group2[:,1], s=5, label="label 1")
    plt.xlabel("component 1")
    plt.ylabel("component 2")
    plt.legend()

# Example input:
# show_me_pca(tfidf_arr[:400], labels[:200])
# Half as many labels as vectors: the rows are not yet concatenated into pairs.

# show_me_pca(cosine_sim[:200], labels[:200], is_pairs=True)
# Equal number of labels and vectors: the data is already a single vector per pair.