In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.naive_bayes
import sklearn.ensemble
import sklearn.tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
tf.config.experimental_run_functions_eagerly(True)
from sklearn.metrics import accuracy_score

Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


In [2]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
n_sentences_train = len(df)
n_sentences_test = len(df_test)
print("Number of sentences in training set: {:d}".format(n_sentences_train))
print("Number of sentences in test set: {:d}".format(n_sentences_test))

Number of sentences in training set: 19579
Number of sentences in test set: 8392


In [5]:
df['author_id'] = df['author'].factorize()[0]

In [6]:
def make_xy(var_col, tar_col, vectorizer=None, train=True): 
    if vectorizer is None:
        vectorizer = CountVectorizer()
    if train == True:
        X = vectorizer.fit_transform(var_col)
    else:
        X = vectorizer.transform(var_col)
    X = X.tocsc()
    y = tar_col
    return X, y, vectorizer

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['author_id'], test_size = 0.3, random_state=42)
X_train, y_train, vec = make_xy(X_train, y_train)
X_test, y_test, vec = make_xy(X_test, y_test, vec, train=False)

In [8]:
X_train.shape
X_test.shape

(5874, 21920)

In [9]:
model = sklearn.naive_bayes.MultinomialNB()
model.fit(X_train, y_train)
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
print("Training, accuracy score: {}".format(sklearn.metrics.accuracy_score(y_train, train_predictions)))
print("Testing, accuracy score: {}".format(sklearn.metrics.accuracy_score(y_test, test_predictions)))

Training, accuracy score: 0.916818679314119
Testing, accuracy score: 0.8329928498467825


In [10]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model.fit(X_train, y_train)
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
print("Training, accuracy score: {}".format(sklearn.metrics.accuracy_score(y_train, train_predictions)))
print("Testing, accuracy score: {}".format(sklearn.metrics.accuracy_score(y_test, test_predictions)))

Training, accuracy score: 0.9846771251368114
Testing, accuracy score: 0.8122233571671774




In [11]:
model = sklearn.ensemble.RandomForestClassifier()
model.fit(X_train, y_train)
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
print("Training, accuracy score: {}".format(sklearn.metrics.accuracy_score(y_train, train_predictions)))
print("Testing, accuracy score: {}".format(sklearn.metrics.accuracy_score(y_test, test_predictions)))



Training, accuracy score: 0.9862094126231302
Testing, accuracy score: 0.6251276813074566


In [12]:
model = sklearn.tree.DecisionTreeClassifier()
model.fit(X_train, y_train)
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
print("Training, accuracy score: {}".format(sklearn.metrics.accuracy_score(y_train, train_predictions)))
print("Testing, accuracy score: {}".format(sklearn.metrics.accuracy_score(y_test, test_predictions)))

Training, accuracy score: 1.0
Testing, accuracy score: 0.5478379298604018


In [13]:
model = sklearn.linear_model.RidgeClassifier()
model.fit(X_train, y_train)
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
print("Training, accuracy score: {}".format(sklearn.metrics.accuracy_score(y_train, train_predictions)))
print("Testing, accuracy score: {}".format(sklearn.metrics.accuracy_score(y_test, test_predictions)))

Training, accuracy score: 0.9334549434512951
Testing, accuracy score: 0.8074565883554647


In [None]:
sentences = df['text'].values
author = to_categorical(df['author_id'])
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, author, test_size = 0.3, random_state=42)

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

print(sentences_train[1])
print(X_train[1])

In [None]:
maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

print(X_train[0, :])

In [None]:
embedding_dim = 8
num_filters = 16
kernel_size = 20

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
#model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
#model.add(layers.GlobalMaxPooling1D())
model.add(layers.LSTM(4, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
history = model.fit(X_train, y_train, epochs=3, verbose=True, validation_data=(X_test, y_test), batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
print(y_train)

In [None]:
model.fit(X_train, y_train, epochs=5)

In [None]:
results = model.predict(X_test)

In [None]:
print(results)

In [None]:
result = np.argmax(results, axis=1)

In [None]:
y_test = np.argmax(y_test, axis=1)

In [None]:
print(result)
print(y_test)

In [None]:
accuracy = accuracy_score(result, y_test)

In [None]:
print(accuracy)