In [1]:
import pandas as pd
import numpy as np
import nltk
import pickle

In [2]:
# Directory that holds the Spooky Author Identification CSV files.
# Kept in a single constant so the notebook can be pointed at another
# location by editing one line instead of every read_csv call.
DATA_DIR = 'D:/Program/dataset/Spooky_Author_Identification'

# Load the train and test CSV files into DataFrames.
train_df = pd.read_csv(DATA_DIR + '/train.csv')
test_df = pd.read_csv(DATA_DIR + '/test.csv')

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [None]:
"""
Tokenization - Segregation of the text into its individual constituent words.

Stopwords - Throw away any words that occur too frequently, as their frequency of occurrence will not be 
            useful in helping to detect relevant texts. (As an aside, also consider throwing away words 
            that occur very infrequently.)
            
Stemming - combine variants of words into a single parent word that still conveys the same meaning

Vectorization - Converting text into vector format. One of the simplest is the famous bag-of-words approach, 
                where you create a matrix (for each document or text in the corpus). In the simplest form, 
                this matrix stores word frequencies (word counts) and is oft referred to as vectorization of the raw text.
"""

In [3]:
# Snowball stemmer for English: strips morphological affixes so that only
# the word stem remains.
stemmer = nltk.stem.SnowballStemmer(language='english')

# For every training text: split it into its constituent word tokens,
# lower-case and stem each token, then join the stems back into one
# space-separated string.
sentences = [
    ' '.join(stemmer.stem(word.lower()) for word in nltk.word_tokenize(doc))
    for doc in train_df['text']
]

In [31]:
# Spot-check the first preprocessed training sentence.
sentences[0]

'this process , howev , afford me no mean of ascertain the dimens of my dungeon ; as i might make it circuit , and return to the point whenc i set out , without be awar of the fact ; so perfect uniform seem the wall .'

In [4]:
# Run the test texts through the same pipeline as the training texts:
# tokenize, lower-case and stem each token with the existing `stemmer`,
# then re-join the stems with single spaces.
test_sentences = [
    ' '.join(stemmer.stem(word.lower()) for word in nltk.word_tokenize(doc))
    for doc in test_df['text']
]

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Embedding / vocabulary settings
max_features = 20000  # handed to Tokenizer as num_words
maxlen = 100          # sequence-length setting; not used within this cell

# Fit the tokenizer on the train and test sentences together so that its
# word index is built from both sets of texts.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*sentences, *test_sentences])

Using TensorFlow backend.


In [33]:
# Turn each training sentence into a sequence of integer word indices and
# bring every sequence to exactly `maxlen` entries: shorter sequences are
# padded with 0 at the end, longer ones are truncated at the end.
sequences = pad_sequences(
    tokenizer.texts_to_sequences(sentences),
    maxlen=maxlen,
    padding='post',
    truncating='post',
)

# Same encoding and padding for the test sentences.
test_sequences = pad_sequences(
    tokenizer.texts_to_sequences(test_sentences),
    maxlen=maxlen,
    padding='post',
    truncating='post',
)


# Serialize both padded arrays to disk with pickle, one file per split.
for pkl_path, padded in (('keras_input_train.pkl', sequences),
                         ('keras_input_test.pkl', test_sequences)):
    with open(pkl_path, 'wb') as f:
        pickle.dump(padded, f)

In [40]:
# tokenizer.word_index is a dictionary mapping words (str) to their rank/index (int);
# .items() exposes it as (word, index) pairs rather than as the dictionary itself.
vocab = tokenizer.word_index.items()