In [1]:
import json
import numpy as np
import random
from tqdm import tqdm
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.decomposition import PCA
import re
import matplotlib.pyplot as plt
import sklearn

In [2]:
def rand_emot():
    e = ["(o_o)",":-)",":P",":D","x)","ᓚᘏᗢ","╯°□°）╯︵ ┻━┻",":)",
         "*<:-)","^_^","(⌐■_■)","¯\_(ツ)_/¯", "(T_T)",":o","OwO",
        "( ͡❛ ͜ʖ ͡❛)","(̶◉͛‿◉̶)","( ≖.≖)","(ㆆ_ㆆ)","ʕ•́ᴥ•̀ʔっ","( ◡́.◡̀)","(^◡^ )"]
    return random.choice(e)

def load_files():
    text_pairs = [] #Would be nice to have as np.array
    labels = []
    fandom = []
    
    pair_id = []
    true_id = []
    
    #Load truth JSON
    for line in open('data/modified/train_truth.jsonl'):
        d = json.loads(line.strip())
        labels.append(int(d['same']))
        true_id.append(d['id'])

    #Load actual fanfic.
    print("loading fanfic...",rand_emot())
    for line in tqdm(open('data/modified/train_pair.jsonl')):
        d = json.loads(line.strip())
        text_pairs.append(d['pair'])
        fandom.append(d['fandoms'])
        pair_id.append(d['id'])

    print("done loading",rand_emot())
    
    return text_pairs, labels, fandom, pair_id, true_id

In [3]:
text_pairs, labels, fandom, pair_id, true_id = load_files()

173it [00:00, 1717.94it/s]

loading fanfic... :-)


1578it [00:00, 1669.11it/s]

done loading (̶◉͛‿◉̶)





In [4]:
labels[:20]

[0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1]

# Feature extraction

Word frequency and word frequency distribution

In [5]:
def frequency_distribution(text_pair): #expect untokenized input
    
    pair = []
    
    for text in text_pair: 
        tokens = nltk.word_tokenize(text) #tokenize
        
        freq_dist = nltk.FreqDist(tokens) #compute frequency distribution
        pair.append(freq_dist)
        
    return pair #return frequency distribution of each fanfic in the input pair

In [6]:
def word_freq(text_pair): #expects tokenized pairs
    fdist0 = nltk.FreqDist(text_pair[0])
    fdist1 = nltk.FreqDist(text_pair[1])
    
    return [fdist0, fdist1]

def word_freq_single(text):
    fdist = nltk.FreqDist(text)
    return fdist

def tokenize(text_pair):
    return [nltk.word_tokenize(text_pair[0]),nltk.word_tokenize(text_pair[1])]

def vector_freq_dist(freq_dists): #I don't think this works...
    return [list(freq_dists[0].values()), list(freq_dists[1].values())]

def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [7]:
def create_corpus(text_pairs):
    '''input all text pairs to create a corpus'''
    corpus = [x[i] for x in text_pairs for i in range(len(x))]
    return corpus

def fit_tfidf(corpus):
    vectorizer = TfidfVectorizer()
    print("training vectorizer...",rand_emot())
    X = vectorizer.fit_transform(corpus)
    print("vectorizer fit!", rand_emot())
    
    
    df = pd.DataFrame(X[0].T.todense(), index=vectorizer.get_feature_names(), columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False)
    
    return X, df

In [8]:
corpus = create_corpus(text_pairs)

In [9]:
#tf-idf on the raw text. Likely not useful, as you can see, it is sesnitive to the fandom.
raw_tfidf, tfidf_df = fit_tfidf(corpus)

training vectorizer... (^◡^ )
vectorizer fit! (̶◉͛‿◉̶)


In [10]:
tfidf_df.head(10)

Unnamed: 0,TF-IDF
the,0.389299
to,0.29567
kuroko,0.283424
judgement,0.247891
was,0.184794
and,0.182388
it,0.18233
that,0.172474
she,0.169618
her,0.146488


In [11]:
#Attempting to perform tf-idf on only symbols.
def isolate_symbols(corpus):
    #Add \d to omit digits too.
    sym_corpus = []
    for text in corpus:
        sym_corpus.append(' '.join(re.findall("[^a-zA-Z\s]+", text)))
    return sym_corpus

symbols = isolate_symbols(corpus)

#Okay, tf-idf doesn't work with symbols. I'll convert them to made-up words

POS-tagging and Ngrams

In [14]:
## POS Tagging and ngrams
tokens = nltk.word_tokenize(corpus[0])
pos_tags = nltk.pos_tag(tokens)
pos_bigrams = nltk.bigrams(pos_tags)

LIX calculation - LIX = readability index, a measure for the readability of a text

In [15]:
def compute_lix(text):
    tokens = nltk.word_tokenize(text)
    splt = text.split()
    o = len(splt)+1
    p = len([x for x in tokens if x=='.'])+1
    l = len([x for x in tokens if len(x)>6])+1
    
    return (o/p)+((l*100)/o)

In [16]:
for text in corpus[:10]:
    print(compute_lix(text))

29.910954831914932
30.709476503232075
34.64747315241397
31.16576505295295
35.736950980754315
27.826830313919487
23.657931484369428
30.435502148277408
29.67444070920108
27.62315961549868


Sentence and word length - compute sentence and word length distribution

In [17]:
def remove_symbols(text):
    sentences = re.split('[\.+|!|?]', text)
    sentences = [re.sub(r"[^\w]+", ' ', x) for x in sentences if len(x.strip()) != 0]
    return ' '.join(sentences)

def get_sent_word_length(text):
    #Function, which removes symbols and count words in sentence
    #Output: length of each sentence & length of each word
    sentences = re.split('[\.+|!|?]', text)
    sentences = [re.sub(r"[^\w]+", ' ', x) for x in sentences if len(x.strip()) != 0]
    word_sentences = [nltk.word_tokenize(x) for x in sentences]
    sentence_lengths = np.array([len(x) for x in word_sentences])
    word_lengths = np.array([len(s) for x in word_sentences for s in x])
    return sentence_lengths, word_lengths

get_sent_word_length(corpus[0])

(array([ 7, 10, 17,  7, 13,  3,  3, 10, 20,  2,  7,  4,  3, 13, 16, 17, 27,
         6, 13, 20,  1, 22, 20, 15, 14, 12,  7, 14, 15,  7,  1,  9, 15, 21,
         5,  2, 14,  3, 23,  8, 11,  1,  6, 12,  9,  8,  8,  3,  9, 14,  1,
         8,  0,  2,  3, 14,  6,  2,  3,  8,  2,  2, 12,  9, 15,  2, 17, 17,
         4, 14,  8,  7, 23, 11,  3, 18, 23,  6,  7, 23, 24, 34, 10,  2,  2,
        12,  2,  5, 11, 14,  4, 12, 16, 17, 12,  2,  3,  3,  2,  6, 24, 26,
         1,  3, 23,  2,  3,  3,  2, 26,  2,  5, 23,  3,  7, 15, 29, 22, 25,
        11,  4, 10,  8,  4, 33,  5,  7,  7,  8, 20, 27, 15, 18, 13, 10, 12,
         4, 13, 17, 11,  2, 13,  9,  2,  5, 11, 20, 29, 19,  0,  3,  2,  4,
        14,  1,  9,  3,  8,  0,  2,  8,  5, 19,  8, 17, 21, 17, 16, 21, 26,
         9,  5, 32,  6, 13, 15,  3, 17, 29,  0,  4, 10,  1,  7,  3,  2,  4,
        13,  5,  0,  1, 14,  3, 24, 10,  1,  3,  2, 18,  1,  4,  2, 18,  5,
        25,  2,  5,  4,  2, 25,  8,  1,  8,  5,  6, 10,  2,  1,  2, 10, 19,
         0, 

Isolating function words

In [18]:
data = 'data/modified/train_pair.jsonl'

In [19]:
with open('data/function_words_clean.txt', "r") as fw:
    func_words = fw.read().split()

def isolate_fw(data, f_words): #data must be json file - input must be path to file 
    fw_in_data = []

    for line in tqdm(open(data)):
        function_words = []
        d = json.loads(line.strip()) #load the json file
        text = d.get("pair") #get the actual fanfic
        words = text[0].split() #split fanfic into words in list
        for word in words: 
            if word in f_words: #if the word is a function word
                function_words.append(word)
                
        stringed_function_words = " ".join(function_words)
        
        #append all function words as one long string in a list
        fw_in_data.append([stringed_function_words]) #fw_in_data is a list with lists
        #each list contains a string of all function words for each pair
        #should it be a string for each pair?
        
    return fw_in_data


In [20]:
test = isolate_fw(data, func_words) #why is this so fast?


1578it [00:21, 75.10it/s]


In [21]:
#Trying to simplifying isolation of function words - make it take text_pairs as input so we only load the files once
# and use the same kind of input in our various functions

#this made it hella slow - fix it!

def isolate_fw_2(data, f_words): #data must be the text_pairs from load_files()
    fw_in_data = []
    
    fw_text_pairs = []
    for pair in tqdm(data):
        fw_text_pairs = []
        for text in pair: 
            function_words = []
            
            words = text.split() #split fanfic into words in list
            
            for word in words: 
                if word in f_words: #if the word is a function word
                    function_words.append(word)
                
            stringed_function_words = " ".join(function_words) #for each fanfic in a pair, makes FW a long string. 
            fw_text_pairs.append(stringed_function_words) 
            
        #append text pairs with only their function words
        fw_in_data.append(fw_text_pairs) 
        
    return fw_in_data


In [22]:
fw_inData_2 = isolate_fw_2(text_pairs, func_words)

100%|██████████████████████████████████████████████████████████████████████████████| 1578/1578 [00:33<00:00, 47.12it/s]


In [23]:
frequency_distribution(fw_inData_2[0])

[FreqDist({'the': 134, 'to': 118, 'a': 91, 'was': 72, 'and': 64, 'of': 58, 'in': 53, 'her': 45, 'that': 43, 'for': 38, ...}),
 FreqDist({'the': 111, 'and': 102, 'to': 98, 'a': 61, 'her': 56, 'his': 47, 'of': 45, 'she': 42, 'in': 41, 'he': 37, ...})]

Isolating profanity

In [24]:
data = 'data/modified/train_pair.jsonl'
with open('data/profanity_words_clean.txt', "r") as pr:
    prof_words = pr.read().split()
del prof_words[:4]
del prof_words[2]

In [25]:
data = 'data/modified/train_pair.jsonl'

In [26]:
def isolate_profanity(data, prof_words):
    profanity = []
    
    for pair in tqdm(data):
        profanity_pairs = []
        for text in pair:
            resultwords = []

            #d = json.loads(line.strip())
            #text = d.get("pair") 
            words = text.split() 

            resultwords  = [word for word in words if word.lower() in prof_words]

            result = " ".join(resultwords)
            profanity_pairs.append(result)
        
        profanity.append(profanity_pairs) 

        
    return profanity

In [27]:
profanity_inData = isolate_profanity(text_pairs, prof_words)

100%|██████████████████████████████████████████████████████████████████████████████| 1578/1578 [01:05<00:00, 24.27it/s]


In [28]:
frequency_distribution(profanity_inData[0])

[FreqDist({'balls': 5, 'bastard': 1, 'hell': 1}),
 FreqDist({'bloody': 2, 'shit': 2, 'hell': 2, 'fucking': 2})]

Yule's K computations - different implementations

(a) Our own implementation

In [36]:
def tokenize_no_symbols(text):
    return nltk.word_tokenize(re.sub(r'[^\w]', ' ', text))

def get_fdist_yule(text):
    text = tokenize_no_symbols(text)
    fdist = word_freq_single(text)
    return fdist
        
def get_num_unique_words(text):
    text = tokenize_no_symbols(text.lower())
    return len(set(text))

#Remove this function. It is incorrectly implemented.
def yules_k(text):
    C = 10000
    splt = text.split()
    N = len(splt)
    Vn = get_num_unique_words(text)
    fdist = get_fdist_yule(text)
    max_word = fdist.most_common()[0][1]
    
    var = 0
    
    for m in range(max_word):
        Vmn = len([x for x in fdist if fdist[x]==m])
        mNs = np.power(m/N,2)
        
        var += Vmn*mNs
    
    return C*(-1/N+var)

In [30]:
for i in corpus[:10]:
    k,lix = yules_k(i), compute_lix(i)
    print(k, lix, k-lix)

54.62083541967105 29.910954831914932 24.70988058775612
50.06509040692571 30.709476503232075 19.355613903693637
68.87825528810403 34.64747315241397 34.230782135690056
69.87291123035095 31.16576505295295 38.707146177398
57.89319891883996 35.736950980754315 22.15624793808564
67.16115024294662 27.826830313919487 39.33431992902713
63.76863861014663 23.657931484369428 40.1107071257772
85.34784357474064 30.435502148277408 54.91234142646323
52.83657559952693 29.67444070920108 23.162134890325852
59.121650604821426 27.62315961549868 31.498490989322747


In [37]:
for i in corpus[:10]:
    k = yules_k(i)
    print(k)

54.62083541967105
50.06509040692571
68.87825528810403
69.87291123035095
57.89319891883996
67.16115024294662
63.76863861014663
85.34784357474064
52.83657559952693
59.121650604821426


(b) Implementation below from:
https://gist.github.com/magnusnissel/d9521cb78b9ae0b2c7d6

In [38]:
import collections
import re

def tokenize(s):
    tokens = re.split(r"[^0-9A-Za-z\-'_]+", s)
    return tokens

def get_yules(s):
    """ 
    Returns a tuple with Yule's K and Yule's I.
    (cf. Oakes, M.P. 1998. Statistics for Corpus Linguistics.
    International Journal of Applied Linguistics, Vol 10 Issue 2)
    In production this needs exception handling.
    """
    tokens = tokenize(s)
    token_counter = collections.Counter(tok.upper() for tok in tokens)
    m1 = sum(token_counter.values())
    m2 = sum([freq ** 2 for freq in token_counter.values()])
    i = (m1*m1) / (m2-m1)
    k = 1/i * 10000
    return (k, i)

In [39]:
for text in corpus[:10]: 
    k_i = get_yules(text)
    print(k_i)

(76.24848330128657, 131.15014970837154)
(66.47285249322603, 150.4373533694685)
(93.3293041366282, 107.1474826959026)
(106.93616216764885, 93.51373564653021)
(109.14087031159232, 91.62470458088198)
(91.70043097696845, 109.05074156643394)
(83.6476796371318, 119.54904240476895)
(94.47986224640348, 105.8426606711174)
(83.97021309490067, 119.08984902418068)
(88.22340444450815, 113.34860701606596)


(c) Implementation below from: https://swizec.com/blog/measuring-vocabulary-richness-with-python/

In [40]:
from nltk.stem.porter import PorterStemmer
from itertools import groupby

def words(entry):
    return filter(lambda w: len(w) > 0,
                  [w.strip("0123456789!:,.?(){}[]") for w in entry.split()])

def yule(entry):
    # yule's I measure (the inverse of yule's K measure)
    # higher number is higher diversity - richer vocabulary
    d = {}
    stemmer = PorterStemmer()
    for w in words(entry):
        w = stemmer.stem(w).lower()
        try:
            d[w] += 1
        except KeyError:
            d[w] = 1

    M1 = float(len(d))
    M2 = sum([len(list(g))*(freq**2) for freq,g in groupby(sorted(d.values()))])

    try:
        return (M1*M1)/(M2-M1)
    except ZeroDivisionError:
        return 0

In [41]:
for text in corpus[:10]: 
    yules_i = yule(text) #yules_i = inverse of yules K  - why do you want the inverse instead? 
    print(yules_i)

10.645130247537017
17.728321988754068
6.742396307162275
8.927175336111278
7.337997328744253
7.3147799405225005
8.430640063377464
11.789875705444068
9.429946152509988
10.677045985641614


In [42]:
for text in corpus[:2]: 
    tokenized_words = words(text)  
    print(tokenized_words)

<filter object at 0x0000026DBF325608>
<filter object at 0x0000026DBF341B48>


Misspellings

In [43]:
from spellchecker import SpellChecker

def misspelled_words(text):
    #Library for spell checking
    spell = SpellChecker()
    text = remove_symbols(text)
    #Regex for finding digits
    _digits = re.compile('\d')

    #List of misspelled words
    misspelled = spell.unknown(text.split())
    #Remove words, that start with capital letter (Likely names)
    no_names = [x for x in misspelled if x.title() not in text]
    #Remove words that contain digits (7th)
    no_digits = [x for x in no_names if not bool(_digits.search(x))]
    
    #Find corrections for misspelled words - if word is more than a single character.
    corrections = [spell.correction(x) for x in no_digits if len(x)>1]
    #Remove corrections, if they have no correction (likely misclassified spelling mistake)
    remove_no_correction = [x for x in corrections if x not in misspelled]
    return remove_no_correction

misspelled_words(corpus[1])

['fastball',
 'cooked',
 'alive',
 'all',
 'voiceover',
 'he',
 'peace',
 'sis',
 'cancer',
 'tad',
 'threatened',
 'nonstop',
 'ooh',
 'rank',
 'capulets',
 'default',
 'la',
 'herface',
 'dizzy',
 'pandora',
 'ranks',
 'mortys',
 'encompass']

# Classification

Logistic Regression

SVM

Random Forest