In [1]:
import json
import numpy as np
import random
from tqdm import tqdm
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.decomposition import PCA
import re
import matplotlib.pyplot as plt
import sklearn

In [2]:
def rand_emot():
    e = ["(o_o)",":-)",":P",":D","x)","ᓚᘏᗢ","╯°□°）╯︵ ┻━┻",":)",
         "*<:-)","^_^","(⌐■_■)","¯\_(ツ)_/¯", "(T_T)",":o","OwO",
        "( ͡❛ ͜ʖ ͡❛)","(̶◉͛‿◉̶)","( ≖.≖)","(ㆆ_ㆆ)","ʕ•́ᴥ•̀ʔっ","( ◡́.◡̀)","(^◡^ )"]
    return random.choice(e)

def load_files():
    text_pairs = [] #Would be nice to have as np.array
    labels = []
    fandom = []
    
    pair_id = []
    true_id = []
    
    #Load truth JSON
    for line in open('data/modified/train_truth.jsonl'):
        d = json.loads(line.strip())
        labels.append(int(d['same']))
        true_id.append(d['id'])

    #Load actual fanfic.
    print("loading fanfic...",rand_emot())
    for line in tqdm(open('data/modified/train_pair.jsonl')):
        d = json.loads(line.strip())
        text_pairs.append(d['pair'])
        fandom.append(d['fandoms'])
        pair_id.append(d['id'])

    print("done loading",rand_emot())
    
    return text_pairs, labels, fandom, pair_id, true_id

In [3]:
text_pairs, labels, fandom, pair_id, true_id = load_files()

178it [00:00, 1763.39it/s]

loading fanfic... *<:-)


1578it [00:00, 1786.09it/s]

done loading ¯\_(ツ)_/¯





# Feature extraction

Word frequency and word frequency distribution

In [4]:
def frequency_distribution(text_pair): #expect untokenized input
    
    pair = []
    
    for text in text_pair: 
        tokens = nltk.word_tokenize(text) #tokenize
        
        freq_dist = nltk.FreqDist(tokens) #compute frequency distribution
        pair.append(freq_dist)
        
    return pair #return frequency distribution of each fanfic in the input pair

In [5]:
def word_freq(text_pair): #expects tokenized pairs
    fdist0 = nltk.FreqDist(text_pair[0])
    fdist1 = nltk.FreqDist(text_pair[1])
    
    return [fdist0, fdist1]

def word_freq_single(text):
    fdist = nltk.FreqDist(text)
    return fdist

def tokenize(text_pair):
    return [nltk.word_tokenize(text_pair[0]),nltk.word_tokenize(text_pair[1])]

def vector_freq_dist(freq_dists): #I don't think this works...
    return [list(freq_dists[0].values()), list(freq_dists[1].values())]

def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)+0.0000000001)

In [6]:
def create_corpus(text_pairs):
    '''input all text pairs to create a corpus'''
    corpus = [x[i] for x in text_pairs for i in range(len(x))]
    return corpus

def fit_tfidf(corpus):
    vectorizer = TfidfVectorizer()
    print("training vectorizer...",rand_emot())
    X = vectorizer.fit_transform(corpus)
    print("vectorizer fit!", rand_emot())
    
    
    df = pd.DataFrame(X[0].T.todense(), index=vectorizer.get_feature_names(), columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False)
    
    return X, df

In [7]:
corpus = create_corpus(text_pairs)

In [8]:
#tf-idf on the raw text. Likely not useful, as you can see, it is sesnitive to the fandom.
raw_tfidf, tfidf_df = fit_tfidf(corpus)

training vectorizer... ( ≖.≖)
vectorizer fit! ╯°□°）╯︵ ┻━┻


In [9]:
# raw_tfidf[:10]

In [10]:
# tfidf_df.head(10)

In [11]:
def unique_words(text_pairs):
    
    richness_all = []
    
    for pair in tqdm(text_pairs):
        richness_pair = []
        
        for text in pair: 
            tokens = nltk.word_tokenize(text.lower())
            unique_tokens = list(set(tokens))
            
            richness = len(unique_tokens) / len(tokens)
            richness_pair.append(richness)
        richness_all.append(richness_pair)
    
    feature = []
    
    for i in range(len(text_pairs)):
        rich_feat = richness_all[i][0] - richness_all[i][1]
        feature.append(np.abs(rich_feat))
    
    feature = np.vstack(feature)

    return feature

In [12]:
richness_feature = unique_words(text_pairs)

100%|██████████████████████████████████████████████████████████████████████████████| 1578/1578 [03:39<00:00,  7.19it/s]


In [13]:
#Attempting to perform tf-idf on only symbols.
def isolate_symbols(corpus):
    #Add \d to omit digits too.
    sym_corpus = []
    for text in corpus:
        sym_corpus.append(' '.join(re.findall("[^a-zA-Z\s]+", text)))
    return sym_corpus

symbols = isolate_symbols(corpus)

#Okay, tf-idf doesn't work with symbols. I'll convert them to made-up words

In [14]:
punct_matrix, punct_DF = fit_tfidf(symbols)

training vectorizer... x)
vectorizer fit! ʕ•́ᴥ•̀ʔっ


No. punctuation divided by total no. tokens

In [15]:
#Make this a little better - fewer lines

def punctuation_proportion(text_pairs, corpus, punctuation_corpus):
    feature = []
    
    punc_prop_all = [] #punctuation proportion for all pairs 
    
    for i in range(len(text_pairs)): 
        punc_prop_pair = [] #punctuation proportion for each pair
        
        punc_prop1 = len(symbols[2*i]) / len(corpus[2*i])
        punc_prop2 = len(symbols[2*i+1]) / len(corpus[2*i+1])
        punc_prop_pair.append(punc_prop1)
        punc_prop_pair.append(punc_prop2)
        punc_prop_all.append(punc_prop_pair)
    
    for i in range(len(text_pairs)):
        feature_pair = punc_prop_all[i][0] - punc_prop_all[i][1]
        feature.append(feature_pair)
        
    feature = np.vstack(np.abs(feature))
        
    return feature
    

In [16]:
punct_proportion_feature = punctuation_proportion(text_pairs, corpus, symbols)

POS-tagging and Ngrams

In [17]:
## POS Tagging and ngrams
tokens = nltk.word_tokenize(corpus[0])
pos_tags = nltk.pos_tag(tokens)
pos_bigrams = nltk.bigrams(pos_tags)

LIX calculation - LIX = readability index, a measure for the readability of a text

In [18]:
def compute_lix(text):
    tokens = nltk.word_tokenize(text)
    splt = text.split()
    o = len(splt)+1
    p = len([x for x in tokens if x=='.'])+1
    l = len([x for x in tokens if len(x)>6])+1
    
    return (o/p)+((l*100)/o)

In [19]:
# for text in corpus[:10]:
#     print(compute_lix(text))

In [70]:
lix_feature = []

for i in tqdm(range(len(text_pairs))):
    lix = compute_lix(corpus[2*1]) - compute_lix(corpus[2*i+1])
    lix_feature.append(np.abs(lix))
    
lix_feature = np.vstack(lix_feature)

100%|██████████████████████████████████████████████████████████████████████████████| 1578/1578 [03:11<00:00,  8.24it/s]


In [21]:
lix_feature

[]

Sentence and word length - compute sentence and word length distribution

In [22]:
def remove_symbols(text):
    sentences = re.split('[\.+|!|?]', text)
    sentences = [re.sub(r"[^\w]+", ' ', x) for x in sentences if len(x.strip()) != 0]
    return ' '.join(sentences)

def get_sent_word_length(text):
    #Function, which removes symbols and count words in sentence
    #Output: length of each sentence & length of each word
    sentences = re.split('[\.+|!|?]', text)
    sentences = [re.sub(r"[^\w]+", ' ', x) for x in sentences if len(x.strip()) != 0]
    word_sentences = [nltk.word_tokenize(x) for x in sentences]
    sentence_lengths = np.array([len(x) for x in word_sentences])
    word_lengths = np.array([len(s) for x in word_sentences for s in x])
    return sentence_lengths, word_lengths


In [23]:

def avg_sent_word_length(text_pairs):
    
    avg_sent_len = []
    avg_word_len = []

    for i in tqdm(range(len(text_pairs))):
        sent_length1, word_lengths1 = get_sent_word_length(corpus[i*2])
        sent_length2, word_lengths2 = get_sent_word_length(corpus[i*2+1])

        avg_sent = np.average(sent_length1) - np.average(sent_length2)
        avg_word = np.average(word_lengths1) - np.average(word_lengths2)

        avg_sent_len.append(np.abs(avg_sent))
        avg_word_len.append(np.abs(avg_word))
        
    avg_sent_len = np.vstack(avg_sent_len)
    avg_word_len = np.vstack(avg_word_len)
        
        
    return avg_word_len, avg_sent_len #returns feature vector of average word and sentence length

In [24]:
word_len_feature, sent_len_feature = avg_sent_word_length(text_pairs)

100%|██████████████████████████████████████████████████████████████████████████████| 1578/1578 [03:34<00:00,  7.36it/s]


Isolating function words

In [25]:
with open('data/function_words_clean.txt', "r") as fw:
    func_words = fw.read().split()

In [26]:
#This function isolates function words and return the feature (cosine similarity between texts in pair)
# along with the isolated functions words (for the function word proportion feature

def isolate_function_words(text_pairs, f_words): #data must be the text_pairs from load_files()
    fw_in_data = []
    
    fw_text_pairs = []
    for pair in tqdm(text_pairs):
        fw_text_pairs = []
        for text in pair: 
            function_words = []
            
            words = text.split() #split fanfic into words in list
            
            for word in words: 
                if word in f_words: #if the word is a function word
                    function_words.append(word)
                
            stringed_function_words = " ".join(function_words) #for each fanfic in a pair, makes FW a long string. 
            fw_text_pairs.append(stringed_function_words) 
            
        #append text pairs with only their function words
        fw_in_data.append(fw_text_pairs) 
        
    fw_corpus = create_corpus(fw_in_data)  
    FW_matrix, fw_dateframe = fit_tfidf(fw_corpus) #vectorize
    fw_matrix = FW_matrix.toarray()
    
    feature = []

    for i in range(len(text_pairs)):
        cos_sim = cosine_sim(fw_matrix[2*i], fw_matrix[2*i+1]) #compute similarity 
        feature.append(cos_sim)
        
    feature = np.vstack(feature) #final feature stack
    
        
    return feature, fw_in_data #returns feature vector and each text pair with only their function words


In [27]:
function_words_feature, fw_in_data = isolate_function_words(text_pairs, func_words)

100%|██████████████████████████████████████████████████████████████████████████████| 1578/1578 [00:56<00:00, 27.91it/s]


training vectorizer... (^◡^ )
vectorizer fit! OwO


Function words divided by total no. tokens

In [28]:
def function_words_proportion(text_pair_original, text_pair_fw): 
    fw_by_total = []
    
    for i, pair in enumerate(text_pair_original): 
        fw_by_total_pair = []
        for ind, text in enumerate(pair): 
            fw_length = len(text_pair_fw[i][ind]) #length of text counting only function words
            orig_length = len(text_pair_original[i][ind]) #length of text counting all tokens
            proportion = fw_length/orig_length #divide occurance of function words by all tokens
            fw_by_total_pair.append(proportion)
        fw_by_total.append(fw_by_total_pair)
        
        
    feature = []

    for pair in fw_by_total: 
        feat = pair[0] - pair[1]
        feature.append(np.abs(feat))

    feature = np.vstack(feature)
    
    
    return feature


In [29]:
fw_proportion_feature = function_words_proportion(text_pairs, fw_in_data)

Isolating profanity

In [30]:
data = 'data/modified/train_pair.jsonl'
with open('data/profanity_words_clean.txt', "r") as pr:
    prof_words = pr.read().split()
del prof_words[:4]
del prof_words[2]

In [31]:
data = 'data/modified/train_pair.jsonl'

In [34]:
def isolate_profanity(data, prof_words):
    profanity = []
    
    for pair in tqdm(data):
        profanity_pairs = []
        for text in pair:
            resultwords = []

            #d = json.loads(line.strip())
            #text = d.get("pair") 
            words = text.split() 

            resultwords  = [word for word in words if word.lower() in prof_words]

            result = " ".join(resultwords)
            profanity_pairs.append(result)
        
        profanity.append(profanity_pairs) 
        
    profanity_corpus = create_corpus(profanity)
    profanity_matrix, profanity_dataframe = fit_tfidf(profanity_corpus)
    profanity_matrix = profanity_matrix.toarray()
    
    feature = []

    for i in range(len(text_pairs)):
        cos_sim = cosine_sim(profanity_matrix[2*i], profanity_matrix[2*i+1])
        feature.append(cos_sim)
        
    feature = np.vstack(feature)

        
    return feature

In [35]:
profanity_feature = isolate_profanity(text_pairs, prof_words)

100%|██████████████████████████████████████████████████████████████████████████████| 1578/1578 [02:11<00:00, 11.98it/s]

training vectorizer... :-)
vectorizer fit! ( ͡❛ ͜ʖ ͡❛)





Yule's K computations - different implementations

(a) Our own implementation - delete

In [None]:
def tokenize_no_symbols(text):
    return nltk.word_tokenize(re.sub(r'[^\w]', ' ', text))

def get_fdist_yule(text):
    text = tokenize_no_symbols(text)
    fdist = word_freq_single(text)
    return fdist
        
def get_num_unique_words(text):
    text = tokenize_no_symbols(text.lower())
    return len(set(text))

(c) Implementation below from: https://swizec.com/blog/measuring-vocabulary-richness-with-python/

In [37]:
from nltk.stem.porter import PorterStemmer
from itertools import groupby

def words(entry):
    return filter(lambda w: len(w) > 0,
                  [w.strip("0123456789!:,.?(){}[]") for w in entry.split()])

def yule(entry):
    # yule's I measure (the inverse of yule's K measure)
    # higher number is higher diversity - richer vocabulary
    d = {}
    stemmer = PorterStemmer()
    for w in words(entry):
        w = stemmer.stem(w).lower()
        try:
            d[w] += 1
        except KeyError:
            d[w] = 1

    M1 = float(len(d))
    M2 = sum([len(list(g))*(freq**2) for freq,g in groupby(sorted(d.values()))])

    try:
        return (M1*M1)/(M2-M1)
    except ZeroDivisionError:
        return 0

In [38]:
yules_i_feature = []

for i in tqdm(range(len(text_pairs))):
    yules_i = yule(corpus[2*i]) - yule(corpus[2*i+1]) #just changed 2*1 to 2*i - rerun for better feature. 
    yules_i_feature.append(np.abs(yules_i))
    
yules_i_feature = np.vstack(yules_i_feature)

100%|██████████████████████████████████████████████████████████████████████████████| 1578/1578 [06:39<00:00,  3.95it/s]


Misspellings

In [39]:
# from spellchecker import SpellChecker

# def misspelled_words(text):
#     #Library for spell checking
#     spell = SpellChecker()
#     text = remove_symbols(text)
#     #Regex for finding digits
#     _digits = re.compile('\d')

#     #List of misspelled words
#     misspelled = spell.unknown(text.split())
#     #Remove words, that start with capital letter (Likely names)
#     no_names = [x for x in misspelled if x.title() not in text]
#     #Remove words that contain digits (7th)
#     no_digits = [x for x in no_names if not bool(_digits.search(x))]
    
#     #Find corrections for misspelled words - if word is more than a single character.
#     corrections = [spell.correction(x) for x in no_digits if len(x)>1]
#     #Remove corrections, if they have no correction (likely misclassified spelling mistake)
#     remove_no_correction = [x for x in corrections if x not in misspelled]
#     return remove_no_correction

# misspelled_words(corpus[0])

In [40]:
# misspellings_feature = []

# for i in tqdm(range(len(text_pairs))):
#     num_of_misspellings = len(misspelled_words(corpus[2*1])) - len(misspelled_words(corpus[2*i+1]))
#     misspellings_feature.append(np.abs(num_of_misspellings))
    
# misspellings_feature = np.vstack(misspellings_feature)

Character n-grams

In [51]:
def create_character_ngram(n, corpus, text_pairs):
    #Will return matrix where each row is a pair of texts
    vectorizer = TfidfVectorizer(max_features=3000, analyzer='char',ngram_range=(n, n))
    print("training vectorizer...",rand_emot())
    char_ngram = vectorizer.fit_transform(corpus).toarray()
    print("vectorizer fit!", rand_emot())
    
    feature = []
    
    for i in tqdm(range(len(text_pairs))):
        cos_sim = cosine_sim(char_ngram[2*i], char_ngram[2*i+1])
        feature.append(cos_sim)
    
    feature = np.vstack(feature)
    
    
    return char_ngram, feature

In [52]:
character_ngram_matrix, char_bigram_feature = create_character_ngram(2, corpus, text_pairs)

training vectorizer... ( ◡́.◡̀)


100%|███████████████████████████████████████████████████████████████████████████| 1578/1578 [00:00<00:00, 26314.25it/s]

vectorizer fit! x)





In [53]:
char_bigram_feature

array([[0.97487565],
       [0.92723132],
       [0.94037656],
       ...,
       [0.96440691],
       [0.9299618 ],
       [0.91763796]])

Skip-gram

In [None]:
#Stolen from https://stackoverflow.com/questions/31847682/how-to-compute-skipgrams-in-python
from itertools import chain, combinations
import copy
from nltk.util import ngrams

corpus_tokens = [nltk.word_tokenize(x) for x in corpus]

def pad_sequence(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
    if pad_left:
        sequence = chain((pad_symbol,) * (n-1), sequence)
    if pad_right:
        sequence = chain(sequence, (pad_symbol,) * (n-1))
    return sequence

def skipgrams(sequence, n, k, pad_left=False, pad_right=False, pad_symbol=None):
    sequence_length = len(sequence)
    sequence = iter(sequence)
    sequence = pad_sequence(sequence, n, pad_left, pad_right, pad_symbol)

    if sequence_length + pad_left + pad_right < k:
        raise Exception("The length of sentence + padding(s) < skip")

    if n < k:
        raise Exception("Degree of Ngrams (n) needs to be bigger than skip (k)")    

    history = []
    nk = n+k

    # Return point for recursion.
    if nk < 1: 
        return
    # If n+k longer than sequence, reduce k by 1 and recur
    elif nk > sequence_length: 
        for ng in skipgrams(list(sequence), n, k-1):
            yield ng

    while nk > 1: # Collects the first instance of n+k length history
        history.append(next(sequence))
        nk -= 1

    # Iterative drop first item in history and picks up the next
    # while yielding skipgrams for each iteration.
    for item in sequence:
        history.append(item)
        current_token = history.pop(0)      
        # Iterates through the rest of the history and 
        # pick out all combinations the n-1grams
        for idx in list(combinations(range(len(history)), n-1)):
            ng = [current_token]
            for _id in idx:
                ng.append(history[_id])
            yield tuple(ng)

    # Recursively yield the skigrams for the rest of seqeunce where
    # len(sequence) < n+k
    for ng in list(skipgrams(history, n, k-1)):
        yield ng

Save features

In [54]:
import pickle

def save_features(feature_dict):
    '''Save the updated feature dictionary. Takes dictionary as input and saves as binary file
    
    example: 
    >>> my_featues = {'freqdist': [1,6,3,5]}
    >>> save_features(my_features)'''
    
    with open('data/features.dat', 'wb') as file:
        pickle.dump(feature_dict, file)
    print("Features saved! :-)")

def load_features():
    '''Load feature dictionary. Returns the saved feature as a dictionary.
    Will then print all the available features.
    
    example: 
    >>> my_features = load_features()'''
    
    with open('data/features.dat', 'rb') as file:
        feats = pickle.load(file)
    print("Features available:")
    for i in feats.keys():
        print(i)
    
    return feats


In [71]:
# feats = {
#     "Unique words / total no. tokens":richness_feature, 
#     "Punctuation / total no. tokens": punct_proportion_feature, 
#     "Function words freq. dist": function_words_feature, 
#     "Profanity freq. dist.": profanity_feature,
#     "Function words / total no. tokens": fw_proportion_feature,
#     "Average word length": word_len_feature, 
#     "Average sentence length": sent_len_feature, 
#     "Character bigrams": char_bigram_feature, 
#     "Yules I": yules_i_feature, 
#     "LIX": lix_feature
    
# }

In [93]:
#Order: fw distribution, profanity distribution, character bigram dist., unique words, punctuation proportion, word len, sent len, yules, lix
features = function_words_feature
features = np.hstack((features,profanity_feature))
features = np.hstack((features,char_bigram_feature))
features = np.hstack((features,richness_feature))
features = np.hstack((features,punct_proportion_feature))
features = np.hstack((features,fw_proportion_feature))
features = np.hstack((features,word_len_feature))
features = np.hstack((features,sent_len_feature))
features = np.hstack((features,yules_i_feature))
features = np.hstack((features,lix_feature))

len(features[:,0])


1578

In [94]:
features.dump("feature_matrix.dat")

# Classification

In [96]:
from sklearn import preprocessing

In [95]:
#Order: fw distribution, profanity distribution, character bigram dist., unique words, punctuation proportion, word len, sent len, yules, lix
feat_matrix = np.load("feature_matrix.dat", allow_pickle=True) 

In [100]:
feat_matrix

array([[0.92239211, 0.03173604, 0.97487565, ..., 0.57456802, 7.08319174,
        3.93799665],
       [0.75270986, 0.19542507, 0.92723132, ..., 2.14250999, 2.18477903,
        3.4817081 ],
       [0.92438127, 0.        , 0.94037656, ..., 0.68910175, 0.02321739,
        6.82064284],
       ...,
       [0.92556229, 0.        , 0.96440691, ..., 2.91769646, 5.55211268,
        2.76283631],
       [0.88020293, 0.        , 0.9299618 , ..., 3.09249311, 0.4328907 ,
        6.5335426 ],
       [0.7391699 , 0.        , 0.91763796, ..., 1.54155921, 2.08026352,
        2.29202822]])

In [101]:
normalized_scalars = preprocessing.normalize(feat_matrix[:,3:])

In [102]:
normalized_scalars

array([[5.48243305e-03, 1.39945661e-03, 5.66049004e-03, ...,
        7.07165896e-02, 8.71783918e-01, 4.84680109e-01],
       [1.58161882e-02, 3.43059535e-04, 3.25301484e-03, ...,
        4.61011045e-01, 4.70106216e-01, 7.49170784e-01],
       [5.20178098e-03, 6.84528522e-04, 2.08568491e-03, ...,
        1.00447669e-01, 3.38430795e-03, 9.94218452e-01],
       ...,
       [2.33274659e-03, 1.67510033e-03, 7.12070565e-03, ...,
        4.25660551e-01, 8.09993560e-01, 4.03068120e-01],
       [5.08806851e-03, 4.59425409e-03, 1.24115246e-02, ...,
        4.26917583e-01, 5.97604087e-02, 9.01953250e-01],
       [4.10044393e-04, 1.37274598e-03, 9.48787532e-04, ...,
        4.44507406e-01, 5.99842377e-01, 6.60904566e-01]])

In [105]:
features = np.hstack((feat_matrix[:,:2],normalized_scalars))

In [106]:
from sklearn.model_selection import train_test_split

In [107]:
X_train, X_test, y_train, y_test = train_test_split(feat_matrix, labels, test_size=0.2, random_state=42)

SVM

In [108]:
from sklearn import svm
from sklearn.metrics import accuracy_score

In [109]:
svm_clf = svm.SVC()

In [110]:
svm_clf.fit(X_train, y_train)

SVC()

In [111]:
prediction = svm_clf.predict(X_test)

In [112]:
accuracy_score(y_test, prediction)

0.7056962025316456

Random Forest

In [114]:
from sklearn.ensemble import RandomForestClassifier

In [115]:
random_forest_clf = RandomForestClassifier(random_state=42)

In [116]:
random_forest_clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [117]:
prediction = random_forest_clf.predict(X_test)

In [118]:
random_forest_clf.feature_importances_

array([0.22433032, 0.02607301, 0.14780122, 0.12592579, 0.08369665,
       0.091504  , 0.08243689, 0.0804171 , 0.07452721, 0.0632878 ])

In [119]:
accuracy_score(y_test, prediction) #random_state decreased the accuracy

0.7341772151898734

# Notes on feature combinations: (not current)

When train/test split is 80/20

Features = function words, profanity words, avg sentence length, avg word length, lix, yules i, number of misspellings difference


**All:** 
    SVM acc = **0.617**0886075949367,
    RF acc = **0.724**6835443037974 

**Without number of misspellings difference:** 
    SVM acc = **0.648**7341772151899,
    RF acc = **0.721**5189873417721
    
**Without lix:** 
    SVM acc = **0.655**0632911392406,
    RF acc = **0.727**8481012658228
    
**Without yules I:** 
    SVM acc = **0.645**5696202531646,
    RF acc = **0.712**0253164556962

**Without average sentence length:**
    SVM acc = **0.563**2911392405063,
    RF acc = **0.718**3544303797469

**Without average word length:**
    SVM acc = **0.642**4050632911392,
    RF acc = **0.674**0506329113924
    
**Without function words:** 
    SVM acc = **0.645**5696202531646,
    RF acc = **0.648**7341772151899
    
**Without profanity words:** 
    SVM acc = **0.645**5696202531646,
    RF acc = **0.699**3670886075949 

In [None]:
corpus[1426]