# Classification notebook
### No feature extraction here – precomputed features are loaded from pickle files (`data/*.dat`)

In [19]:
import pickle
import numpy as np
import pandas as pd
import json
import random
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [20]:
def rand_emot():
    """Return one randomly chosen emoticon string.

    Used only to decorate the progress messages printed while loading data.
    """
    # The backslash in the shrug emoticon is written as "\\" because "\_" is
    # not a valid escape sequence; the resulting string is unchanged.
    emoticons = [
        "(o_o)", ":-)", ":P", ":D", "x)", "ᓚᘏᗢ", "╯°□°）╯︵ ┻━┻", ":)",
        "*<:-)", "^_^", "(⌐■_■)", "¯\\_(ツ)_/¯", "(T_T)", ":o", "OwO",
        "( ͡❛ ͜ʖ ͡❛)", "(̶◉͛‿◉̶)", "( ≖.≖)", "(ㆆ_ㆆ)", "ʕ•́ᴥ•̀ʔっ", "( ◡́.◡̀)", "(^◡^ )",
    ]
    return random.choice(emoticons)

def load_files(truth_path='data/modified/train_truth.jsonl',
               pair_path='data/modified/train_pair.jsonl'):
    """Load the training pairs and their ground-truth labels from two JSONL files.

    Parameters
    ----------
    truth_path : str
        JSONL file whose records carry the keys ``'same'`` and ``'id'``.
    pair_path : str
        JSONL file whose records carry the keys ``'pair'``, ``'fandoms'`` and ``'id'``.

    Returns
    -------
    tuple of five lists
        ``(text_pairs, labels, fandom, pair_id, true_id)``. ``labels`` holds
        ``int(record['same'])`` for every truth record. ``pair_id`` and
        ``true_id`` are the ids read from the pair file and the truth file
        respectively; this function does not check that they line up.
    """
    text_pairs = []  # Would be nice to have as np.array
    labels = []
    fandom = []

    pair_id = []
    true_id = []

    # Load truth JSON. The file is opened in a `with` block so the handle is
    # closed again, and decoded as UTF-8 instead of the platform default.
    with open(truth_path, encoding='utf-8') as truth_file:
        for line in truth_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            labels.append(int(record['same']))
            true_id.append(record['id'])

    # Load actual fanfic.
    print("loading fanfic...", rand_emot())
    with open(pair_path, encoding='utf-8') as pair_file:
        for line in tqdm(pair_file):
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            text_pairs.append(record['pair'])
            fandom.append(record['fandoms'])
            pair_id.append(record['id'])

    print("done loading", rand_emot())

    return text_pairs, labels, fandom, pair_id, true_id

In [21]:
# Only the labels are used in this notebook; the texts, fandoms and ids that
# load_files() also returns are discarded.
_, labels, _, _, _ = load_files()
# Remove the label at position 713, in place.
# NOTE(review): presumably this pair was excluded when the features were
# extracted (1578 pairs are loaded, the feature matrices have 1577 rows) —
# confirm the index against the feature-extraction notebook.
del labels[713]
# `y` is a second name for the same list object, not a copy.
y = labels

133it [00:00, 1326.49it/s]

loading fanfic... :P


1578it [00:00, 1648.45it/s]

done loading OwO





In [26]:
def save_features(feature_dict, filename='features'):
    '''Pickle a feature dictionary to the binary file data/<filename>.dat.

    The file is opened in 'wb' mode, so an existing file of the same name is
    overwritten. A confirmation message is printed afterwards.

    example:
    >>> my_features = {'freqdist': [1,6,3,5]}
    >>> save_features(my_features)'''

    target = f'data/{filename}.dat'
    with open(target, 'wb') as out:
        pickle.dump(feature_dict, out)
    print("Features saved! :-)")

def load_features(filename='features'):
    '''Unpickle the feature dictionary stored in data/<filename>.dat and return it.

    The names of the loaded features (the dictionary keys) are printed, one
    per line, under a "Features available:" heading.

    example:
    >>> my_features = load_features()'''

    source = f'data/{filename}.dat'
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # were written by this project (e.g. by save_features).
    with open(source, 'rb') as handle:
        loaded = pickle.load(handle)
    print("Features available:")
    for name in loaded.keys():
        print(name)
    return loaded

In [33]:
# `feats` comes from data/features.dat (the default file name).
feats = load_features()
# `feats2` comes from data/feats_0706.dat; it is the source of the *_cossim
# features that are combined into `final_features` further down.
feats2 = load_features(filename="feats_0706")

Features available:
function_words_freq_dist
character_bigram
skip_bigram
pos_skipgram
function_words
profanity_words
avg_sent_length
avg_word_length
lix
yule_i
num_misspellings
Features available:
character_bigram
skip_bigram
pos_skipgram
pos_bigram
character_bigram_cossim
skip_bigram_cossim
pos_skipgram_cossim
pos_bigram_cossim


In [34]:
def ensure_column_vectors(feature_dict):
    """Turn every 1-D feature in ``feature_dict`` into a single-column 2-D array.

    The dictionary is modified in place: a value of shape ``(n,)`` is replaced
    by the same data with shape ``(n, 1)`` so that all features can later be
    stacked column-wise. Values that already have more than one dimension are
    left untouched, which makes the call safe to repeat. The name and final
    shape of every feature are printed.
    """
    for name in feature_dict:
        if feature_dict[name].ndim == 1:
            feature_dict[name] = feature_dict[name][:, None]
        print(name, "\t", feature_dict[name].shape)


# Same treatment for both feature dictionaries (previously two copy-pasted loops).
for feature_dict in (feats, feats2):
    ensure_column_vectors(feature_dict)

function_words_freq_dist 	 (1577, 259)
character_bigram 	 (1577, 6000)
skip_bigram 	 (1577, 8000)
pos_skipgram 	 (1577, 8000)
function_words 	 (1577, 1)
profanity_words 	 (1577, 1)
avg_sent_length 	 (1577, 1)
avg_word_length 	 (1577, 1)
lix 	 (1577, 1)
yule_i 	 (1577, 1)
num_misspellings 	 (1577, 1)
character_bigram 	 (1577, 6000)
skip_bigram 	 (1577, 8000)
pos_skipgram 	 (1577, 2208)
pos_bigram 	 (1577, 6000)
character_bigram_cossim 	 (1577, 1)
skip_bigram_cossim 	 (1577, 1)
pos_skipgram_cossim 	 (1577, 1)
pos_bigram_cossim 	 (1577, 1)


## Stratified K-Fold

In [7]:
def strat_kfold(model, X, k, y=None, random_state=69):
    """Return the mean accuracy of ``model`` over a shuffled stratified k-fold split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in
        place on every fold (it is not cloned), so after the call it holds
        the fit from the last fold.
    X : array
        Feature matrix; must support indexing with an integer index array
        along its first axis.
    k : int
        Number of folds.
    y : sequence, optional
        Class labels, one per row of ``X``. When omitted, the notebook-level
        ``labels`` list is used, which is what the function always did before
        this parameter existed.
    random_state : int, optional
        Seed for the fold shuffling; the default keeps the previously
        hard-coded value.

    Returns
    -------
    float
        Mean of the ``k`` per-fold values returned by ``model.score``.
    """
    y = np.array(labels if y is None else y)
    accuracies = []
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        accuracies.append(model.score(X_test, y_test))

    return np.mean(accuracies)

def combine_features(features):
    """Join a list of feature arrays into one array with ``np.hstack``.

    For 2-D inputs with the same number of rows this places the arrays side
    by side (column-wise concatenation).
    """
    stacked = np.hstack(features)
    return stacked

In [37]:
# Build the random forest and the feature matrices evaluated in the cells below.
rfc = RandomForestClassifier(n_estimators=500, random_state=42)

# Every feature in `feats`, concatenated column-wise in dictionary order.
all_features = combine_features(list(feats.values()))

# Only the features that consist of a single column.
scalar_features = combine_features(
    [block for block in feats.values() if block.shape[1] == 1]
)

# One column per feature: multi-column features are replaced by their
# row-wise mean, single-column features are kept as they are.
forced_scalars = combine_features([
    np.mean(block, axis=1)[:, None] if block.shape[1] > 1 else block
    for block in feats.values()
])

# The four multi-column features from `feats`.
vector_names = (
    'function_words_freq_dist',
    'character_bigram',
    'skip_bigram',
    'pos_skipgram',
)
vector_features = combine_features([feats[name] for name in vector_names])

# Single-column features from `feats` followed by the cosine-similarity
# features from `feats2`.
single_column_names = (
    'function_words',
    'profanity_words',
    'avg_sent_length',
    'avg_word_length',
    'lix',
    'yule_i',
    'num_misspellings',
)
cossim_names = (
    'character_bigram_cossim',
    'skip_bigram_cossim',
    'pos_skipgram_cossim',
    'pos_bigram_cossim',
)
final_features = combine_features(
    [feats[name] for name in single_column_names]
    + [feats2[name] for name in cossim_names]
)

# Report the shape of every matrix that was built.
for caption, matrix in (
    ("All features:", all_features),
    ("Scalar features:", scalar_features),
    ("Forced scalar features:", forced_scalars),
    ("Vector features:", vector_features),
    ("Final features:", final_features),
):
    print(caption, matrix.shape)

All features: (1577, 22266)
Scalar features: (1577, 7)
Forced scalar features: (1577, 11)
Vector features: (1577, 22259)
Final features: (1577, 11)


In [None]:
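# Shapes of the cosine-similarity features in feats2 (copied from the output above;
# kept as comments so that this cell is valid Python under "Run All"):
# character_bigram_cossim 	 (1577, 1)
# skip_bigram_cossim 	 (1577, 1)
# pos_skipgram_cossim 	 (1577, 1)
# pos_bigram_cossim 	 (1577, 1)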

## Random Forest

In [9]:
# Baseline: mean accuracy of the random forest over a stratified 2-fold split
# on all concatenated features. strat_kfold fits `rfc` in place, so afterwards
# `rfc` holds the fit from the last fold.
strat_kfold(rfc, all_features, 2)

0.6246075479467037

In [10]:
#Stratified 5-Fold on the 8k best Random Forest Features
rfc_8k_feats = all_features[:,rfc.feature_importances_.argsort()[-8000:][::-1]]
strat_kfold(rfc, rfc_8k_feats, 5)

0.6740787623066105

In [11]:
# Stratified 5-fold accuracy of the random forest on the multi-column
# (vector) features only.
strat_kfold(rfc, vector_features, 5)

0.6081173397629094

In [12]:
# Stratified 5-fold accuracy with every feature reduced to a single column
# (multi-column features replaced by their row-wise mean).
strat_kfold(rfc, forced_scalars, 5)

0.7323950170785614

In [38]:
# Stratified 5-fold accuracy on the single-column features from `feats` plus
# the four cosine-similarity features from `feats2`.
strat_kfold(rfc, final_features, 5)

0.776158328310227

In [13]:
# Randomised hyper-parameter search for the random forest on the
# forced-scalar features, scored on a held-out split.
# The split is seeded so the held-out score can be reproduced, and stratified
# so both parts keep the class balance (consistent with the stratified k-fold
# evaluation used elsewhere in this notebook).
X_train, X_test, y_train, y_test = train_test_split(
    forced_scalars, labels, stratify=labels, random_state=42
)

# Candidate values sampled by the search.
dist = {
    'n_estimators' : [100,200,300,500,1000],
    'min_samples_split': [2,4,6,8],
    'min_samples_leaf': [1,2,3,4,5],
}
random_search = RandomizedSearchCV(rfc, dist, n_iter=20, random_state=42)
# fit() returns the search object itself, so `search` and `random_search`
# refer to the same fitted object.
search = random_search.fit(X_train, y_train)
params = search.best_params_
# Accuracy of the refitted best estimator on the held-out split.
search.score(X_test, y_test)

0.7164556962025317

In [14]:
# `search` is the RandomizedSearchCV object, not a plain classifier: strat_kfold
# calls search.fit on every fold, so the whole randomised search is repeated
# inside each of the 5 folds rather than re-using the parameters found above.
strat_kfold(search, forced_scalars, 5)

0.7374643359453487

## MLP Classifier

In [15]:
# Stratified 2-fold accuracy of a default MLP on each feature set.
# The MLP is seeded (like the random forest above) so that the printed
# accuracies can be reproduced from run to run.
mlp = MLPClassifier(max_iter=2000, random_state=42)
for feature_matrix in (all_features, scalar_features, forced_scalars, vector_features):
    print(strat_kfold(mlp, feature_matrix, 2))

0.6467971730584883
0.6785053688727618
0.681050677783997
0.5941659750503432


In [16]:
# Randomised hyper-parameter search for the MLP on the forced-scalar
# features, scored on a held-out split.
# The split is seeded so the held-out score can be reproduced, and stratified
# so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    forced_scalars, labels, stratify=labels, random_state=42
)

# Candidate values sampled by the search.
dist = {
    'hidden_layer_sizes' : [[200,200], [500], [256,128,64,32], [1000], [500,500]],
    'activation': ['relu', 'tanh', 'logistic', 'identity'],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'learning_rate': ['adaptive', 'constant', 'invscaling']
}
random_search = RandomizedSearchCV(mlp, dist, n_iter=20, random_state=42)
# fit() returns the search object itself; note that `search` and `params`
# replace the values assigned in the random-forest search cell.
search = random_search.fit(X_train, y_train)
params = search.best_params_
# Accuracy of the refitted best estimator on the held-out split.
search.score(X_test, y_test)

0.7037974683544304

In [18]:
# `search` now refers to the MLP RandomizedSearchCV object; strat_kfold calls
# search.fit on every fold, so the randomised search is repeated inside each
# of the 3 folds.
strat_kfold(search, forced_scalars, 3)

0.6734480053111233