In [3]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
import gensim
import re



In [2]:
#Our first approach to using Deep Learning to predict the author of a text will be to use a shallow neural network 
#with non-linear activation functions

In [4]:
# Load the corpus and label its two columns explicitly.
# NOTE(review): with `names=` and no `header=`, pandas reads the first CSV row
# as data — confirm the file has no header row.
df = pd.read_csv(
    r'Data/dataset.csv',
    names=['Author', 'Text'],
)

In [5]:
# Two regex passes over every article:
#   1. drop carriage returns, newlines and apostrophes;
#   2. drop "--ddd-ddd-dddd" digit groups.
df['Text'] = (
    df['Text']
    .map(lambda text: re.sub('\r|\n|\'', '', text))
    .map(lambda text: re.sub(r'--\d\d\d-\d\d\d-\d\d\d\d', '', text))
)

In [6]:
from nltk.corpus import stopwords
# NLTK's English stopword list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Only the stopwords that are 3 to 5 characters long (inclusive).
stop_words_subset = {word for word in stop_words if 3 <= len(word) <= 5}

In [7]:
# Independent snapshot of the text column, taken after the regex cleanup and
# before stopwords are removed.
text_copy = df['Text'].copy(deep=True)

In [8]:
def remove_stopwords(x, stopword_set=None): #x is the string input
    """Return `x` with every stopword token removed.

    The string is split on single spaces, tokens found in the stopword set are
    dropped, and the remaining tokens are rejoined with single spaces.
    Matching is case-sensitive, so capitalised forms such as "The" are kept
    when the set only holds lowercase words.

    Parameters
    ----------
    x : str
        Text to filter.
    stopword_set : collection of str, optional
        Words to remove. Defaults to the notebook-level `stop_words`.

    Returns
    -------
    str
        The filtered text.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Build a new list instead of calling .remove() on the list being
    # iterated: removing during iteration skips the token that follows each
    # removed word, so consecutive stopwords used to survive.
    return ' '.join(word for word in x.split(' ') if word not in stopword_set)
            


In [9]:
# Strip stopwords from every article (the function is passed directly; no
# wrapping lambda is needed).
df['Text'] = df['Text'].map(remove_stopwords)

In [10]:
# Display the Text column to inspect the result of stopword removal.
df['Text']

0       U.S. Senators Tuesday sharply criticized new S...
1       Two members Congress criticised Federal Reserv...
2       Commuters stuck traffic Leesburg Pike Northern...
3       A broad coalition corporations went Capitol Hi...
4       On Internet, new products come go blink eye, t...
5       Legislators continued debate Wednesday one dif...
6       A top federal regulator Thursday urged banks b...
7       Congress revives debate encryption export poli...
8       Congress revives debate encryption export poli...
9       Federal bank regulators begun prodding U.S. fi...
10      Privacy advocates warned Wednesday Clinton adm...
11      The number banks charging non-customers using ...
12      The number banks charging non-customers using ...
13      The rapidly evolving market stored-value cards...
14      As new class financial derivative based credit...
15      The merger Bankers Trust New York Corp. Alex. ...
16      The merger Bankers Trust New York Corp. Alex. ...
17      The U.

In [11]:
#Stratified train-test split 

# Features are the article texts; labels are the author names.
X, y = df['Text'], df['Author']

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the articles. Stratifying on the author label keeps each
# author's share the same in both splits, and the fixed random_state makes the
# split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

In [39]:
#Part 1: Obtain word embedding representation of the articles. 

In [13]:
#Training gensim's Word2Vec model on the article text to obtain word embeddings for the text data:
from gensim.models import Word2Vec


In [14]:
# gensim's Word2Vec expects an iterable of sentences, each one a list of
# tokens. A single joined string is iterated character by character, so the
# vocabulary ends up being individual letters. Tokenize each article on
# whitespace instead. Text from both splits is used, as before.
sentences = [article.split() for article in X_train]
more_sentences = [article.split() for article in X_test]
sentences = sentences + more_sentences

In [25]:
# Fit Word2Vec on `sentences`; gensim expects an iterable of token lists.
# `size` is the gensim < 4 name for the embedding dimension (gensim >= 4 calls
# it `vector_size`). Author's note: this was taking too long to train.
model = Word2Vec(
    sentences,
    size=100,
    window=3,
    min_count=10,
)

In [131]:
#model = gensim.models.KeyedVectors.load_word2vec_format("glove.6B.300d.txt", binary=False)
#Maybe use a pre-trained model

In [16]:
import sqlite3

# Read the whole `articles` table into a DataFrame, then release the database
# handle. The try/finally closes the connection even if the query fails.
connectionState = sqlite3.connect('gutenberg/gutenberg.sqlite3')
try:
    data_df = pd.read_sql_query("SELECT * FROM articles", connectionState)
finally:
    connectionState.close()
# dropping the id column
data_df = data_df.drop('id',axis=1)
# rename columns to author and text
# NOTE(review): the rename is positional — it assumes exactly two columns
# remain, author first and text second; confirm against the table schema.
data_df.columns = ['Author','Text']

In [17]:
# Drop carriage returns, newlines and apostrophes from each text.
data_df['Text'] = data_df['Text'].map(lambda x: re.sub('\r|\n|\'','',x))
# Drop "--ddd-ddd-dddd" digit groups.
data_df['Text'] = data_df['Text'].map(lambda x: re.sub(r'--\d\d\d-\d\d\d-\d\d\d\d','',x))
# Collapse each whitespace run to one space. The pattern is a raw string:
# '\s' in a normal string literal is an invalid escape sequence, which newer
# Python versions warn about.
data_df['Text'] = data_df['Text'].map(lambda x: re.sub(r'\s+',' ',x))

In [18]:
from nltk.corpus import stopwords
# English stopwords from NLTK, stored as a set for quick lookups.
stop_words = set(stopwords.words('english'))
# Stopwords with a length of 3, 4 or 5 characters.
stop_words_subset = {word for word in stop_words if 3 <= len(word) <= 5}

def remove_stopwords(x, stopword_set=None): #x is the string
    """Return `x` with every stopword token removed.

    The string is split on single spaces, tokens found in the stopword set are
    dropped, and the remaining tokens are rejoined with single spaces.
    Matching is case-sensitive.

    Parameters
    ----------
    x : str
        Text to filter.
    stopword_set : collection of str, optional
        Words to remove. Defaults to the notebook-level `stop_words`.

    Returns
    -------
    str
        The filtered text.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Filter into a new sequence: calling .remove() on the list while looping
    # over it skips the token after each removal, leaving stopwords behind.
    kept_words = [word for word in x.split(' ') if word not in stopword_set]
    return ' '.join(kept_words)

In [19]:
# Strip stopwords from every text in the SQLite-loaded frame.
data_df['Text'] = data_df['Text'].map(remove_stopwords)

In [20]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

# One shared stemmer instance, reused for every call.
porter_stemmer = PorterStemmer()
def stem_sentences(sentence):
    """Return `sentence` with each whitespace-separated token Porter-stemmed.

    The stemmed tokens are rejoined with single spaces.
    """
    return ' '.join(map(porter_stemmer.stem, sentence.split()))

In [21]:
# Replace each text with its Porter-stemmed form.
data_df['Text'] = data_df['Text'].map(stem_sentences)

In [27]:
# Build features and labels from the cleaned, stopword-filtered and stemmed
# Gutenberg frame. The cell previously read from `df`, so all the `data_df`
# preprocessing was ignored even though the model is saved as the Gutenberg
# Word2Vec model.
X = data_df['Text']
y = data_df['Author']

In [28]:
from sklearn.model_selection import train_test_split
# Stratified 75/25 split on the author label. The fixed random_state keeps the
# same rows in each split on every run, so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

In [29]:
from gensim.models import Word2Vec

In [30]:
# Word2Vec needs an iterable of token lists. One joined string would be
# consumed character by character, producing letter embeddings, not word
# embeddings. Tokenize each training text on whitespace.
sentences = [text.split() for text in X_train]

In [31]:
# Train 100-dimensional embeddings with a context window of 3, ignoring words
# seen fewer than 10 times. `size` is the gensim < 4 spelling of `vector_size`.
model = Word2Vec(
    sentences,
    size=100,
    window=3,
    min_count=10,
)

In [32]:
import pickle

# Serialize the trained model. The context manager closes the file even if
# pickling raises, so the handle is never leaked and the data is flushed.
with open("gutenberg_word2vec_model.pickle", "wb") as f:
    pickle.dump(model, f)