Jeremy's features

In [12]:
import pandas as pd
import numpy as np
import spacy
from textblob import TextBlob
import gensim
import re
from collections import Counter
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score
import warnings
import copy
from topic_modeling import get_topics
import pronouncing
import textstat

# Suppress every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries imported above).
warnings.filterwarnings("ignore")
from tqdm import tqdm
# Register tqdm's pandas integration so progress_apply / progress_map are
# available on pandas objects.
tqdm.pandas()

# Load the training data and discard the 'id' column.
df = pd.read_csv('train.csv').drop(columns='id')

# Columns filled in row by row by add_features(); all start at zero.
new_cols = ['raw_text_length', 'num_words', 'avg_word_len', 'vector_avg', 'polarity', 'subjectivity', 'dale_chall', 'rhyme_frequency', 'FleischReadingEase', 'lexicon', 'word_diversity']
# Flag / ratio columns that add_features() sets or increments.
false_cols = ['starts_conj', 'ends_prep', 'has_colon', 'has_semicolon', 'has_dash', 'whom', 'has_had', 'num_ings']
# Columns that add_features() overwrites with a list of entity strings and a
# space-joined lemma string.
empty_cols = ['entities', 'lemmas']
# One counter column per coarse part-of-speech tag; add_features() increments
# the column named by each token's pos_.
parts_of_speech = ['ADJ', 'ADV', 'ADP', 'AUX', 'CONJ', 'CCONJ', 'DET', 'EOL', 'NO_TAG', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE']

# Column creation order is kept as new -> empty -> false -> POS so the frame
# layout (and any CSV written from it) does not change.
for col in new_cols:
    df[col] = np.zeros(len(df))
for col in empty_cols:
    # Explicit object placeholder: np.empty() would expose uninitialised
    # float memory in columns that are meant to hold lists / strings.
    df[col] = None
for col in false_cols:
    df[col] = np.zeros(len(df))
for pos in parts_of_speech:
    df[pos] = np.zeros(len(df), dtype='uint8')

# Replace the author names with integer class labels.
encoder = LabelEncoder()
df['author'] = encoder.fit_transform(df.author.values)

# spaCy pipeline used by add_features().
nlp = spacy.load("en_core_web_sm")

def rhyme_frequency(text):
    """Return the rhyme score of ``text`` normalised by its distinct words.

    The text is split on single spaces, lower-cased and de-duplicated.  For
    every distinct word, each entry of ``pronouncing.rhymes(word)`` that is
    also one of the *other* distinct words contributes 0.5 to the score.  The
    accumulated score is divided by the number of distinct words.
    """
    vocabulary = {piece.lower() for piece in text.split(' ')}
    score = 0
    for word in vocabulary:
        # Rhymes of this word that occur in the text, excluding the word itself.
        partners = vocabulary.intersection(pronouncing.rhymes(word))
        partners.discard(word)
        score += len(partners) / 2
    return score / len(vocabulary)

def add_features(row):
    """Fill in the engineered feature columns for one row and return it.

    Intended for ``df.apply(add_features, axis=1)``.  ``row`` must expose a
    ``text`` field plus the feature columns created during setup (flags,
    part-of-speech counters, readability scores, ...), all starting at zero.

    Parameters
    ----------
    row : pandas.Series
        One row of the feature frame; it is modified in place.

    Returns
    -------
    pandas.Series
        The same ``row`` object with its feature columns populated.
    """
    text = row.text
    doc = nlp(text)
    num_words = len(doc)

    lemmas = []
    for token in doc:
        surface = token.text
        lowered = surface.lower()
        if surface == ':':
            row['has_colon'] = 1
        if surface == ';':
            row['has_semicolon'] = 1
        if surface == '-':
            row['has_dash'] = 1
        if lowered == 'whom':
            row['whom'] = 1
        if surface[-3:] == 'ing':
            row['num_ings'] += 1
        if lowered == 'had':
            row['has_had'] = 1
        # Count the token under its coarse part-of-speech column.
        row[token.pos_] += 1
        # Only alphabetic, non-stop-word tokens contribute a lemma.
        if token.is_stop or not token.is_alpha:
            continue
        lemma = token.lemma_.strip().lower()
        if lemma:
            lemmas.append(lemma)
    entities = [ent.text for ent in doc.ents]
    lemmas = ' '.join(lemmas)

    sentiment = TextBlob(text).sentiment
    row['subjectivity'] = sentiment.subjectivity
    row['polarity'] = sentiment.polarity

    # spaCy's coarse tags for conjunctions are CCONJ / SCONJ ('CONJ' is the
    # legacy spelling), and adpositions are tagged 'ADP' -- there is no 'PREP'.
    first_pos = doc[0].pos_ if num_words else ''
    row['starts_conj'] = int(first_pos in ('CONJ', 'CCONJ', 'SCONJ'))
    # "Ends with a preposition" looks at the last real word: trailing
    # punctuation / whitespace tokens are skipped.
    content_tokens = [tok for tok in doc if not (tok.is_punct or tok.is_space)]
    row['ends_prep'] = int(bool(content_tokens) and content_tokens[-1].pos_ == 'ADP')

    row['entities'] = entities
    row['lemmas'] = lemmas
    row['raw_text_length'] = len(text)
    row['num_words'] = num_words
    # Guard the per-token ratios so an empty text yields 0 instead of raising.
    row['avg_word_len'] = len(text) / num_words if num_words else 0.0
    lemma_vector = nlp(lemmas).vector
    # An empty vector would make np.mean return NaN.
    row['vector_avg'] = np.mean(lemma_vector) if lemma_vector.size else 0.0
    if num_words:
        row['num_ings'] /= num_words
    row['rhyme_frequency'] = rhyme_frequency(text)
    row['dale_chall'] = textstat.dale_chall_readability_score(text)
    row['FleischReadingEase'] = textstat.flesch_reading_ease(text)
    row['lexicon'] = textstat.lexicon_count(text)
    row['word_diversity'] = row['lexicon'] / num_words if num_words else 0.0
    return row


In [13]:
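# Feature extraction is left commented out: its output appears to be cached in
# checkpoint.csv (written and re-read in the cells below). Uncomment these two
# lines, and the to_csv cell, to regenerate the checkpoint.
# df = df.apply(lambda x: add_features(x), axis=1)
# df['FleischReadingEase'] = df['FleischReadingEase'] - df['FleischReadingEase'].min()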

In [21]:
# df.to_csv('checkpoint.csv')

In [30]:
# Reload the cached feature frame. The checkpoint is written with
# df.to_csv('checkpoint.csv') (default index=True), so its first column is the
# saved row index. Read it back as the index; otherwise it becomes an
# 'Unnamed: 0' column that leaks the row number into the feature matrix.
df = pd.read_csv('checkpoint.csv', index_col=0)

In [33]:
# Topic features from the project-local get_topics helper.
# NOTE(review): presumably this yields a 'topic' assignment per sentence (a
# 'topic' column is dropped below) -- confirm against topic_modeling.get_topics.
topics = get_topics(num_topics = 4, sentences = df.text.values.tolist())
dummies = pd.get_dummies(pd.DataFrame(topics))
df = pd.concat([df, dummies], axis=1)

# Hold out 10% for validation; the label and the non-numeric helper columns
# are removed from the feature side.
X_train, X_val, y_train, y_val = train_test_split(
    df.drop(['author', 'lemmas', 'entities', 'topic'], axis=1),
    df.author.values,
    test_size=0.1,
    random_state=0,
)

# Bag-of-words counts. The vocabulary is learned from the training split only
# so nothing from the validation text influences the features.
cv = CountVectorizer()
cv.fit(X_train.text)
X_train_cv = cv.transform(X_train.text)
X_val_cv = cv.transform(X_val.text)

# train_test_split keeps the original (shuffled) row labels on X_train / X_val.
# The count frames must carry the same index, otherwise pd.concat(axis=1)
# aligns on labels and pairs counts with the wrong rows (and pads with NaN).
x_train_array = pd.DataFrame(X_train_cv.toarray(), index=X_train.index)
x_val_array = pd.DataFrame(X_val_cv.toarray(), index=X_val.index)
X_train_full = pd.concat([x_train_array, X_train], axis=1)
X_val_full = pd.concat([x_val_array, X_val], axis=1)

# The raw text has been vectorised above and is no longer needed as a column.
X_train_final = X_train_full.drop('text', axis=1)
X_val_final = X_val_full.drop('text', axis=1)


Remy's features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import pandas as pd
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.metrics import log_loss
from keras import models
from keras import layers
from keras import optimizers
import random
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
from AutoCluster import AutoKMeans

In [None]:
# Read the training and test CSV files.
df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Hold out 10% of the training sentences for validation (fixed random_state).
X_train, X_val, y_train, y_val = train_test_split(
    df.text,
    df.author,
    test_size=.1,
    random_state=0,
)
# The test set only contributes its text column.
X_test = test_df.text

In [None]:
# Shared estimator / preprocessing objects used by the cells below.
encoder = LabelEncoder()
tk = Tokenizer(num_words=5000, lower=True)
tfidf = TfidfVectorizer(max_features=12000, stop_words='english')
nb = MultinomialNB()
svm = SVC(kernel='linear')

In [None]:
#Tf-Idf with svm
# Integer-encode the author labels in this cell so it runs on a fresh kernel:
# y_train_le is otherwise only assigned in a later cell.
y_train_le = encoder.fit_transform(y_train)

# Fit the vectoriser on the training text only, then project every split.
X_tfidf_train = tfidf.fit_transform(X_train)
X_tfidf_val = tfidf.transform(X_val)
X_tfidf_test = tfidf.transform(X_test)

svm.fit(X_tfidf_train, y_train_le)
tfidf_train_preds = svm.predict(X_tfidf_train)
tfidf_val_preds = svm.predict(X_tfidf_val)
tfidf_test_preds = svm.predict(X_tfidf_test)

In [None]:
#Tokenize text (sequences of 255 tokens, padded / truncated)
# Fit the vocabulary once, on the training split only. fit_on_texts accumulates
# word counts across calls, so fitting a second time (or on the full frame)
# would double-count words and let validation text shape the vocabulary.
tk.fit_on_texts(X_train)

# truncating='post': sequences longer than 255 tokens keep their first 255.
train_post = tk.texts_to_sequences(X_train)
X_train_post = pad_sequences(train_post, maxlen=255, truncating='post')
val_post = tk.texts_to_sequences(X_val)
X_val_post = pad_sequences(val_post, maxlen=255, truncating='post')
test_post = tk.texts_to_sequences(X_test)
X_test_post = pad_sequences(test_post, maxlen=255, truncating='post')

# truncating='pre': sequences longer than 255 tokens keep their last 255.
train_pre = tk.texts_to_sequences(X_train)
X_train_pre = pad_sequences(train_pre, maxlen=255, truncating='pre')
val_pre = tk.texts_to_sequences(X_val)
X_val_pre = pad_sequences(val_pre, maxlen=255, truncating='pre')
test_pre = tk.texts_to_sequences(X_test)
X_test_pre = pad_sequences(test_pre, maxlen=255, truncating='pre')

#reformat y
y_train_le = encoder.fit_transform(y_train)
# Reuse the mapping learned on the training labels; refitting on the
# validation labels could assign different integers to the same author.
y_val_le = encoder.transform(y_val)
y_train_dmy = pd.get_dummies(y_train)
# Give the validation one-hot frame exactly the training columns, in order.
y_val_dmy = pd.get_dummies(y_val).reindex(columns=y_train_dmy.columns, fill_value=0)

In [None]:
#KMeans clustering
# AutoKMeans is a project-local helper; it is called once per padded
# representation with 30 clusters and its three results are unpacked in
# train / val / test order.
post_splits = (X_train_post, X_val_post, X_test_post)
pre_splits = (X_train_pre, X_val_pre, X_test_pre)
train_clusters_post, val_clusters_post, test_clusters_post = AutoKMeans(*post_splits, n_clusters=30)
train_clusters_pre, val_clusters_pre, test_clusters_pre = AutoKMeans(*pre_splits, n_clusters=30)

In [None]:
#Multinomial naive bayes
# The single `nb` estimator is fit on one representation, queried for the
# train / test / val splits, and only then refit on the other representation.
nb.fit(X_train_post, y_train_le)
nb_post_train_preds, nb_post_test_preds, nb_post_val_preds = [
    nb.predict(split) for split in (X_train_post, X_test_post, X_val_post)
]

nb.fit(X_train_pre, y_train_le)
nb_pre_train_preds, nb_pre_test_preds, nb_pre_val_preds = [
    nb.predict(split) for split in (X_train_pre, X_test_pre, X_val_pre)
]

In [None]:
#Combine outputs of models
def _stack_outputs(nb_pre, nb_post, tfidf_preds, cluster_post, cluster_pre):
    """Collect one split's per-model label outputs into a single frame."""
    return pd.DataFrame({'NB_pre': nb_pre, 'NB_post': nb_post, 'TFIDF': tfidf_preds,
                         'Cluster_post': cluster_post, 'Cluster_pre': cluster_pre})


def _one_hot(frame, categories):
    """One-hot encode every column of ``frame`` against fixed category lists.

    ``categories`` maps each column name to the ordered list of string values
    to encode. The first category of each column is dropped as the baseline.
    Values absent from the list are encoded as all zeros.
    """
    pieces = []
    for col in frame.columns:
        dtype = pd.CategoricalDtype(categories=categories[col])
        values = frame[col].astype(str).astype(dtype)
        pieces.append(pd.get_dummies(values, drop_first=True, prefix=col))
    return pd.concat(pieces, axis=1)


train_le = _stack_outputs(nb_pre_train_preds, nb_post_train_preds, tfidf_train_preds,
                          train_clusters_post, train_clusters_pre)
val_le = _stack_outputs(nb_pre_val_preds, nb_post_val_preds, tfidf_val_preds,
                        val_clusters_post, val_clusters_pre)
test_le = _stack_outputs(nb_pre_test_preds, nb_post_test_preds, tfidf_test_preds,
                         test_clusters_post, test_clusters_pre)

# The category lists come from the training split and are reused for val /
# test. Encoding each split independently lets the column set (and the
# category removed by drop_first) differ whenever a label or cluster id is
# missing from a split.
dummy_categories = {col: sorted(train_le[col].astype(str).unique()) for col in train_le.columns}
train_dmy = _one_hot(train_le, dummy_categories)
val_dmy = _one_hot(val_le, dummy_categories)
test_dmy = _one_hot(test_le, dummy_categories)

In [None]:
#bagged lstm for post tokens
from tqdm import tqdm  # progress bar; bound here so the bare call below resolves

num_words = 5000
embed_vec_len = 32
max_sequence_len = 255


def build_post_lstm():
    """Return a new, compiled Embedding -> SpatialDropout -> LSTM -> softmax(3) model."""
    model = models.Sequential()
    model.add(layers.Embedding(num_words, embed_vec_len, input_length=max_sequence_len))
    model.add(layers.SpatialDropout1D(0.2))
    model.add(layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(layers.Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model


# NOTE(review): one model instance is trained across all 10 rounds, so each
# round starts from the previous round's weights rather than from scratch.
# If independent bags are intended, build a new model inside the loop.
lstm_nn = build_post_lstm()
filepaths = []
for counter in range(10):
    print(counter+1)
    # Random subset of training rows for this round (at most 5000, without replacement).
    sub = random.sample(range(len(X_train_post)), k=min(5000, len(X_train_post)))
    filepath = '255post'+str(counter)+'.best.hdf5'
    filepaths.append(filepath)
    # Each round writes its best-epoch weights to its own file.
    checkpoint = ModelCheckpoint(filepath,  verbose=1, save_best_only=True, mode='auto')
    lstm_nn.fit(X_train_post[sub],y_train_dmy.iloc[sub,:],epochs=5,batch_size=256, validation_data=(X_val_post,y_val_dmy), callbacks=[checkpoint])

# Reload every saved checkpoint into a fresh model and predict all splits.
nn_val_preds = []
nn_train_preds = []
nn_test_preds = []
for filepath in tqdm(filepaths):
    lstm_nn = build_post_lstm()
    lstm_nn.load_weights(filepath)
    nn_train_preds.append(lstm_nn.predict(X_train_post))
    nn_val_preds.append(lstm_nn.predict(X_val_post))
    nn_test_preds.append(lstm_nn.predict(X_test_post))

# Average the class probabilities over the checkpoints and drop the first
# class column.
nn_train_post_mean = np.asarray(nn_train_preds).mean(axis=0)[:,1:]
nn_test_post_mean = np.asarray(nn_test_preds).mean(axis=0)[:,1:]
nn_val_post_mean = np.asarray(nn_val_preds).mean(axis=0)[:,1:]

In [None]:
#bagged lstm for pre tokens
from tqdm import tqdm  # progress bar; bound here so the bare call below resolves

num_words = 5000
embed_vec_len = 32
max_sequence_len = 255


def build_pre_lstm():
    """Return a new, compiled Embedding -> SpatialDropout -> LSTM -> softmax(3) model."""
    model = models.Sequential()
    model.add(layers.Embedding(num_words, embed_vec_len, input_length=max_sequence_len))
    model.add(layers.SpatialDropout1D(0.2))
    model.add(layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(layers.Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model


# NOTE(review): one model instance is trained across all 10 rounds, so each
# round starts from the previous round's weights rather than from scratch.
# If independent bags are intended, build a new model inside the loop.
lstm_nn = build_pre_lstm()
filepaths = []
for counter in range(10):
    print(counter+1)
    # Random subset of training rows for this round (at most 5000, without replacement).
    sub = random.sample(range(len(X_train_pre)), k=min(5000, len(X_train_pre)))
    filepath = '255pre'+str(counter)+'.best.hdf5'
    filepaths.append(filepath)
    # Each round writes its best-epoch weights to its own file.
    checkpoint = ModelCheckpoint(filepath,  verbose=1, save_best_only=True, mode='auto')
    lstm_nn.fit(X_train_pre[sub],y_train_dmy.iloc[sub,:],epochs=3,batch_size=256, validation_data=(X_val_pre,y_val_dmy), callbacks=[checkpoint])

# Reload every saved checkpoint into a fresh model and predict all splits.
nn_val_preds = []
nn_train_preds = []
nn_test_preds = []
for filepath in tqdm(filepaths):
    lstm_nn = build_pre_lstm()
    lstm_nn.load_weights(filepath)
    nn_train_preds.append(lstm_nn.predict(X_train_pre))
    nn_val_preds.append(lstm_nn.predict(X_val_pre))
    nn_test_preds.append(lstm_nn.predict(X_test_pre))

# Average the class probabilities over the checkpoints and drop the first
# class column.
nn_train_pre_mean = np.asarray(nn_train_preds).mean(axis=0)[:,1:]
nn_test_pre_mean = np.asarray(nn_test_preds).mean(axis=0)[:,1:]
nn_val_pre_mean = np.asarray(nn_val_preds).mean(axis=0)[:,1:]

In [None]:
# Append the averaged 'pre' LSTM probabilities (two columns, after dropping
# the first class) to the one-hot encoded model outputs of each split.
# The result names match the export cell: train_nlp_df / val_nlp_df / test_nlp_df.
# NOTE(review): the averaged 'post' LSTM outputs (nn_*_post_mean) are computed
# earlier but not joined here -- confirm whether that is intentional.
lstm_pre_cols = ('LSTM_pre_1','LSTM_pre_2')
train_nlp_df = train_dmy.join(pd.DataFrame(nn_train_pre_mean,columns=lstm_pre_cols))
val_nlp_df = val_dmy.join(pd.DataFrame(nn_val_pre_mean,columns=lstm_pre_cols))
test_nlp_df = test_dmy.join(pd.DataFrame(nn_test_pre_mean,columns=lstm_pre_cols))

In [None]:
# Write the combined feature frame of each split to its own CSV file.
for frame, csv_path in (
    (train_nlp_df, 'nlp_df_train.csv'),
    (val_nlp_df, 'nlp_df_val.csv'),
    (test_nlp_df, 'nlp_df_test.csv'),
):
    frame.to_csv(csv_path)