In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import Dropout
import nltk

import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('dataset/Gungor_2018_VictorianAuthorAttribution_data-train.csv', encoding='latin-1')
train_data = df
df

Unnamed: 0,text,author
0,ou have time to listen i will give you the ent...,1
1,wish for solitude he was twenty years of age a...,1
2,and the skirt blew in perfect freedom about th...,1
3,of san and the rows of shops opposite impresse...,1
4,an hour s walk was as tiresome as three in a s...,1
...,...,...
53673,after surrounding and searching the whole plac...,50
53674,giant who could make a young earthquake or a w...,50
53675,waters of the lake at the bottom of the hill c...,50
53676,fingers and thumb in it exactly as it came out...,50


In [3]:
lemmatiser = WordNetLemmatizer()
stop_words = stopwords.words('english')
cleaned_text = []
for text in train_data['text']:
    lemmatised_words =''
    split_text = text.split()
    for word in split_text:
        if word not in stop_words:
            lemmatised_word=lemmatiser.lemmatize(word, pos="v")
            lemmatised_words += lemmatised_word + ' '
    cleaned_text.append(lemmatised_words)
train_data['text no stopwords'] = cleaned_text
train_data

Unnamed: 0,text,author,text no stopwords
0,ou have time to listen i will give you the ent...,1,ou time listen give entire story say may form ...
1,wish for solitude he was twenty years of age a...,1,wish solitude twenty years age possession perf...
2,and the skirt blew in perfect freedom about th...,1,skirt blow perfect freedom upper part wear hat...
3,of san and the rows of shops opposite impresse...,1,san row shop opposite impress upon vision last...
4,an hour s walk was as tiresome as three in a s...,1,hour walk tiresome three sensible english town...
...,...,...,...
53673,after surrounding and searching the whole plac...,50,surround search whole place could discover tra...
53674,giant who could make a young earthquake or a w...,50,giant could make young earthquake angry accord...
53675,waters of the lake at the bottom of the hill c...,50,water lake bottom hill curl kindness sympathy ...
53676,fingers and thumb in it exactly as it came out...,50,finger thumb exactly come hand never mind say ...


In [4]:
#WITH STOPWORDS

In [5]:
X = df['text']
y = df['author'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.65, random_state=10)

In [6]:

vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test  = vectorizer.transform(X_test)


In [7]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train.toarray(), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [8]:
y_pred = model.predict(X_test.toarray())

In [9]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.41512137800578947

In [10]:
y_predtrain = model.predict(X_train.toarray())

In [11]:
accuracyTrain = accuracy_score(y_train, y_predtrain)
accuracyTrain

0.986905839144089

In [12]:
#WITH STEMMING/Lemmatation and the removal of stopwords

In [13]:
X = df['text no stopwords']
y = df['author'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.65, random_state=10)

In [14]:

vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test  = vectorizer.transform(X_test)


In [15]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train.toarray(), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [16]:
y_pred = model.predict(X_test.toarray())

In [17]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.43925367573299706

In [18]:
y_predtrain = model.predict(X_train.toarray())

In [19]:
accuracyTrain = accuracy_score(y_train, y_predtrain)
accuracyTrain

0.9597594081013466

In [20]:
#
#VECTORIZE ON ONLY TOP 30 WORDS WITHOUT STOPWORDS

In [22]:
vals = []
for i in range(len(df)):
    tokens = nltk.word_tokenize(df.iloc[0]['text no stopwords'])
    frequencies = nltk.FreqDist(tokens)
    most_common = ''
    for frequency in frequencies.most_common()[:30]:
        most_common += frequency[0] + ' '
    vals.append(most_common)
df['top text no stopwords'] = vals
df

Unnamed: 0,text,author,text no stopwords,top text no stopwords
0,ou have time to listen i will give you the ent...,1,ou time listen give entire story say may form ...,one time row give may city boat effect interes...
1,wish for solitude he was twenty years of age a...,1,wish solitude twenty years age possession perf...,one time row give may city boat effect interes...
2,and the skirt blew in perfect freedom about th...,1,skirt blow perfect freedom upper part wear hat...,one time row give may city boat effect interes...
3,of san and the rows of shops opposite impresse...,1,san row shop opposite impress upon vision last...,one time row give may city boat effect interes...
4,an hour s walk was as tiresome as three in a s...,1,hour walk tiresome three sensible english town...,one time row give may city boat effect interes...
...,...,...,...,...
53673,after surrounding and searching the whole plac...,50,surround search whole place could discover tra...,one time row give may city boat effect interes...
53674,giant who could make a young earthquake or a w...,50,giant could make young earthquake angry accord...,one time row give may city boat effect interes...
53675,waters of the lake at the bottom of the hill c...,50,water lake bottom hill curl kindness sympathy ...,one time row give may city boat effect interes...
53676,fingers and thumb in it exactly as it came out...,50,finger thumb exactly come hand never mind say ...,one time row give may city boat effect interes...


In [24]:
X = df['top text no stopwords']
y = df['author'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.65, random_state=10)

In [25]:

vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test  = vectorizer.transform(X_test)


In [26]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train.toarray(), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [27]:
y_pred = model.predict(X_test.toarray())

  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  (self.sigma_[i, :]), 1)


In [28]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.017769625404832193

In [29]:
y_predtrain = model.predict(X_train.toarray())

  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  (self.sigma_[i, :]), 1)


In [30]:
accuracyTrain = accuracy_score(y_train, y_predtrain)
accuracyTrain

0.015542662479374036

In [31]:
df.to_csv('datafinal.csv')

KERAS : DEEP LEARNING

In [None]:
#CHANGE
input_dim = X_train.shape[1]
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
#CHANGE
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
#CHANGE
histby = model.fit(X_train, y_train,epochs=10, verbose=False,validation_data=(X_test, y_test),batch_size=10)


In [None]:
clear_session()

In [None]:
#CHANGE
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
#CHANGE
port matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()