Import the required libraries

In [17]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Open the data and make a dataframe with the texts and labels

In [14]:
# Read the entire corpus file into memory as one string. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open for the lifetime of the kernel.
with open('NLPLearning/Vidhya/corpus', encoding='utf8') as corpus_file:
    data = corpus_file.read()

In [15]:
# Parse the corpus line by line: the first whitespace-separated token of a
# line is taken as the label, everything after it as the document text.
labels, texts = [], []
for line in data.split("\n"):
    content = line.split()
    if not content:
        # Blank lines (e.g. the empty string produced by a trailing newline)
        # have no label; indexing content[0] on them would raise IndexError.
        continue
    labels.append(content[0])
    # Keep every token after the label, not just the first word of the text.
    texts.append(" ".join(content[1:]))

# Dataframe with one row per document: columns 'text' and 'label'
trainDF = pandas.DataFrame({'text': texts, 'label': labels})

Split the dataset into training and validation sets using scikit-learn's built-in `train_test_split` function

In [16]:
# Hold out part of the data for validation. A fixed random_state makes the
# split (and everything derived from it) reproducible across kernel restarts.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['text'], trainDF['label'], random_state=42)

# Label-encode the target variable. The encoder is fitted exactly once, on the
# full label column, and then only *applied* to each split: re-fitting it on
# the validation labels could assign different integers to the same class
# whenever the two splits do not contain the same set of labels.
encoder = preprocessing.LabelEncoder()
encoder.fit(trainDF['label'])
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)

Count vectors are a matrix representation of the dataset: every row represents a document in the corpus, every column represents a term from the corpus, and each cell holds the count of that term in that document.

In [18]:
# Build the word-level count vectorizer and learn its vocabulary from every
# document in trainDF (fit() returns the fitted vectorizer itself).
count_vect = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
).fit(trainDF['text'])

# Map the training and validation texts onto that vocabulary
xtrain_count, xvalid_count = (
    count_vect.transform(split) for split in (train_x, valid_x)
)

The TF-IDF score represents the relative importance of a term in a document and in the entire corpus. It is composed of two terms: the first is the normalized Term Frequency (TF), and the second is the Inverse Document Frequency (IDF), computed as the logarithm of the number of documents in the corpus divided by the number of documents in which the specific term appears.

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

TF-IDF Vectors can be generated at different levels of input tokens (words, characters, n-grams)

a. Word Level TF-IDF : Matrix representing tf-idf scores of every term in different documents <br>
b. N-gram Level TF-IDF : N-grams are combinations of N terms together. This matrix represents the tf-idf scores of N-grams<br>
c. Character Level TF-IDF : Matrix representing tf-idf scores of character level n-grams in the corpus

In [20]:
# Word-level TF-IDF: fit on every document in trainDF with max_features=5000,
# then transform both splits with the fitted vectorizer.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    max_features=5000,
).fit(trainDF['text'])
xtrain_tfidf, xvalid_tfidf = (
    tfidf_vect.transform(split) for split in (train_x, valid_x)
)

In [21]:
# N-gram-level TF-IDF: same word tokenisation, but the features are word
# n-grams with ngram_range=(2,3) rather than single words.
tfidf_vect_ngram = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(2,3),
    max_features=5000,
).fit(trainDF['text'])
xtrain_tfidf_ngram, xvalid_tfidf_ngram = (
    tfidf_vect_ngram.transform(split) for split in (train_x, valid_x)
)

In [23]:
# Character-level TF-IDF: features are character n-grams (ngram_range=(2,3)).
# token_pattern is deliberately not passed here: scikit-learn only uses it
# when analyzer='word', and supplying it with analyzer='char' has no effect
# other than triggering a "will not be used" warning.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)

A word embedding is a way of representing words and documents as dense vectors. The position of a word in the vector space is learned from text and is based on the words that surround it when it is used. <br> Word embeddings can be trained on the input corpus or taken from pre-trained embeddings such as <b> GloVe, FastText, Word2Vec</b>. Any of these can be downloaded and used for transfer learning.

4 Stages
1. Load the pretrained word embeddings <br>
2. Create the tokenizer object <br>
3. Transform text docs to sequences of tokens and pad them <br>
4. Create a mapping of token and their respective embeddings 

In [29]:
# Load the pretrained vectors into a {word: float32 vector} dictionary.
embeddings_index = {}
# The context manager guarantees the (very large) file is closed after loading.
with open('NLPLearning/Vidhya/wiki-news-300d-1M.vec', encoding='utf8') as vec_file:
    for line_no, line in enumerate(vec_file):
        # Fields are separated by single spaces; splitting on ' ' (rather than
        # on arbitrary whitespace) keeps words that contain other Unicode
        # whitespace characters in one piece.
        values = line.rstrip().split(' ')
        if line_no == 0 and len(values) == 2:
            # fastText .vec files start with a "<word count> <dimension>"
            # header; it is metadata, not a word vector, so skip it.
            continue
        embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')
    

In [31]:
# Fit a Keras tokenizer on every document in trainDF and keep its
# word -> integer index mapping.
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

# Encode each split as integer sequences, then pad them to a common length of
# 70 so every row has the same shape.
train_sequences = token.texts_to_sequences(train_x)
train_seq_x = sequence.pad_sequences(train_sequences, maxlen=70)
valid_sequences = token.texts_to_sequences(valid_x)
valid_seq_x = sequence.pad_sequences(valid_sequences, maxlen=70)

# Token-embedding mapping: one 300-wide row per index in word_index, plus one
# extra row (presumably for the padding index 0 -- confirm against Keras).
# Words with no pretrained vector keep their all-zero row.
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, row in word_index.items():
    vector = embeddings_index.get(word)
    if vector is None:
        continue
    embedding_matrix[row] = vector

A number of extra text based features can also be created which sometimes are helpful for improving text classification models. Some examples are:

* Word Count of the documents – total number of words in the documents
* Character Count of the documents – total number of characters in the documents
* Average Word Density of the documents – average length of the words used in the documents
* Punctuation Count in the Complete Essay – total number of punctuation marks in the documents
* Upper Case Count in the Complete Essay – total number of upper case words in the documents
* Title Word Count in the Complete Essay – total number of proper case (title) words in the documents
* Frequency distribution of Part of Speech Tags:
-       Noun Count
-       Verb Count
-       Adjective Count
-       Adverb Count
-       Pronoun Count
These features are highly experimental ones and should be used according to the problem statement only.