# Source :
https://github.com/williamscott701/Information-Retrieval/blob/master/2.%20TF-IDF%20Ranking%20-%20Cosine%20Similarity%2C%20Matching%20Score/TF-IDF.ipynb

https://github.com/mayank408/TFIDF/blob/master/TFIDF.ipynb

In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words
from sklearn.model_selection import *

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

## Reading Random Data with labels

In [2]:
# load the dataset
data = open('data/corpus').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# create a dataframe using texts and lables
trainDF = pd.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = train_test_split(trainDF, trainDF['label'])


In [3]:
trainDF['text'][0:99]

0     Stuning even for the non-gamer: This sound tra...
1     The best soundtrack ever to anything.: I'm rea...
2     Amazing!: This soundtrack is my favorite music...
3     Excellent Soundtrack: I truly like this soundt...
4     Remember, Pull Your Jaw Off The Floor After He...
                            ...                        
94    Thank you for Releasing it!!!!!: I loved this ...
95    Very Not Worth Your Time: The book was wriiten...
96    Very fun and educational: Trains, shapes and p...
97    Ludicrous and silly: I remember getting this b...
98    Artistry: I think that the Deodato concerts ar...
Name: text, Length: 99, dtype: object

In [4]:
train_x.reset_index(inplace = True)
valid_x.reset_index(inplace = True)

train_x.drop(columns = ['index'],inplace= True)
valid_x.drop(columns = ['index'],inplace= True)
train_x

Unnamed: 0,text,label
0,a great cooooooooooooool album.: don't blive t...,__label__2
1,"The autobiography of ""EVERY"" Caribbean mother....",__label__2
2,"Really usable: With 120 knots/bends/etc, this ...",__label__2
3,"Spine Chilling, Awesome, and Grand!: I can not...",__label__2
4,"Buy something else: The dvd is a documentory, ...",__label__1
...,...,...
7495,heat therapy: I used this product a couple of ...,__label__1
7496,"Not her best...........: Before I begin, let m...",__label__1
7497,Great movie!: Great movie from my childhood! L...,__label__2
7498,This book changed the way I look at all books....,__label__2


In [5]:
## small data for testing
# docA = "The cat sat on my face"
# docB = "The dog sat on my bed"
# docC = "The cat sat on my face"
# docD = "The dog sat on my bed"
# data = pd.DataFrame()
# data['text'] = [docA,docB,docC,docD]
# data['label'] = ['a','b','a','b']
# data

## Class to generate TF-IDF-CF 

In [6]:
class get_tftdfcf():
    import math
    def __init__(self,original_data):
        self.wordSet = {}
        for doc_2 in original_data.text:
            doc_2 = doc_2.split(" ")
            self.wordSet = set(self.wordSet).union(set(doc_2))


    def convert_lower_case(self,data):
        return np.char.lower(data)

    def remove_stop_words(self,data):
        stop_words = stopwords.words('english')
        words = word_tokenize(str(data))
        new_text = ""
        for w in words:
            if w not in stop_words and len(w) > 1:
                new_text = new_text + " " + w
        return new_text
    def remove_punctuation(self,data):
        symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
        for i in range(len(symbols)):
            data = np.char.replace(data, symbols[i], ' ')
            data = np.char.replace(data, "  ", " ")
        data = np.char.replace(data, ',', '')
        return data
    def remove_apostrophe(self,data):
        return np.char.replace(data, "'", "")
    def stemming(self,data):
        stemmer= PorterStemmer()
        
        tokens = word_tokenize(str(data))
        new_text = ""
        for w in tokens:
            new_text = new_text + " " + stemmer.stem(w)
        return new_text
    def convert_numbers(self,data):
        tokens = word_tokenize(str(data))
        new_text = ""
        for w in tokens:
            try:
                w = num2words(int(w))
            except:
                a = 0
            new_text = new_text + " " + w
        new_text = np.char.replace(new_text, "-", " ")
        return new_text

    def preprocess(self,data):
        data = self.convert_lower_case(data)
        data = self.remove_punctuation(data) #remove comma seperately
        data = self.remove_apostrophe(data)
        data = self.remove_stop_words(data)
        data = self.convert_numbers(data)
        data = self.stemming(data)
        data = self.remove_punctuation(data)
        data = self.convert_numbers(data)
        data = self.stemming(data) #needed again as we need to stem the words
        data = self.remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
        data = self.remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
        return data

    def get_valid_data(self,data_frame):
        self.data_frame = data_frame
        self.word_count = []
        self.word_count_label = []
        self.tf_array = []
        self.idf_array = []
        self.tfidf_array = []

        return self.run_all()

    def get_train_data(self,data_frame):
        # if train_or_test == 'train':
        
        self.word_count = []
        self.word_count_label = []
        self.tf_array = []
        self.idf_array = []
        self.tfidf_array = []
        self.data_frame = data_frame
        
        return self.run_all()
        
    def run_all(self,):
        self.data_frame.reset_index(inplace= True)
        for i in range(len(self.data_frame)):
            self.data_frame.text[i] = self.preprocess(str(self.data_frame.text[i]))
        
        for label in self.data_frame.label.unique():
            data_to_process = self.data_frame[self.data_frame.label == label]
            data_to_process.drop(columns = ['label'])
            wordDict_label = dict.fromkeys(self.wordSet, 0) 
            for doc_3 in data_to_process.text:
                doc_3 = doc_3.split(" ")
                for word in doc_3:
                    try:
                        if isinstance(wordDict_label[word],int) and (word in self.wordSet):
                            wordDict_label[word]+=1
                    except :
                        pass

                wordDict_label['label'] = label
                wordDict_label['records'] = len(data_to_process)
            self.word_count_label.append(wordDict_label)
            # print(label , wordDict_label)
            
        for doc,label in zip(self.data_frame.text,self.data_frame.label):
            doc = doc.split(" ")
            wordDict = dict.fromkeys(self.wordSet, 0)
        
            for word in doc :
                try :
                    if word in self.wordSet:
                        wordDict[word]+=1
                except:
                    pass
            wordDict['label'] = label
            self.word_count.append(wordDict)

            tfDict = {}
            bowCount = len(doc)
            for word, count in wordDict.items():
                if  word != 'label' and (word in self.wordSet):
                    try :
                        tfDict[word] = math.log10(count/float(bowCount)+1.0)
                    except:
                        pass
                tfDict['label'] = label
            self.tf_array.append(tfDict)
            # print(wordDict)

        
        idfDict = {}
        N = len(self.data_frame)
        
        idfDict = dict.fromkeys(self.wordSet, 0)
        for doc in self.word_count:
            for word, val in doc.items():
                if word != 'label':
                    if val > 0 and (word in self.wordSet):
                        try:
                            idfDict[word] += 1
                        except:
                            pass

        for word, val in idfDict.items():
            if word  != 'label' and (word in self.wordSet):
                try : 
                    idfDict[word] = math.log10(N / float(val)) if val > 0 else 0
                except :
                    pass
        self.idf_array.append(idfDict)
        
        tf_df = pd.DataFrame(self.tf_array)
        cf_data = pd.DataFrame(self.word_count_label)

        for label_index in tf_df.label.unique():
            data_to_process = tf_df[tf_df['label'] == label_index]
            cf_data_to_process = cf_data[cf_data['label'] == label_index]
            cf_data_to_process = cf_data_to_process.drop(columns = ['label'])
            n_len = len(cf_data_to_process)
            if n_len == 1 :
                no_records_per_label = list(cf_data_to_process.records)[0]
                cf_data_to_process  = dict(cf_data_to_process.iloc[0,:])
            # print(no_records_per_label)
            for index,row in data_to_process.iterrows():
                # print(row)
                tf_dict = dict(row)
                tfidf = {}
                for word, val in tf_dict.items():
                    if word  != 'label' and (word in self.wordSet):
                        tfidf[word] = val*idfDict[word]*(cf_data_to_process[word]/no_records_per_label)
                        # print((cf_data_to_process[word]))
                self.tfidf_array.append(tfidf)
            # return tfidf

        return (self.word_count,self.tf_array,self.idf_array,self.tfidf_array,self.word_count_label)

    # return [row_list,wordDict]


## Create TF-IDF-CF from data

In [7]:
tfidfcf = get_tftdfcf(trainDF[0:99])
result = tfidfcf.get_train_data(train_x[0:99])
tfidfcf_data = pd.DataFrame(result[3])

result_valid = tfidfcf.get_valid_data(valid_x[0:99])
tfidfcf_data_valid = pd.DataFrame(result_valid[3])

tfidfcf_data.shape,tfidfcf_data_valid.shape

((99, 2909), (99, 2909))

In [8]:
train_x[0:99]

Unnamed: 0,text,label
0,great cooooooooooooool album dont blive rando...,__label__2
1,autobiographi everi caribbean mother jamaica ...,__label__2
2,realli usabl one hundr twenti knot bend etc p...,__label__2
3,spine chill awesom grand say enough nightwish...,__label__2
4,buy someth el dvd documentori alot talk littl...,__label__1
...,...,...
94,album star song listen hardli enjoy listen al...,__label__1
95,didnt work start indoor cat contract flea mon...,__label__1
96,excel book guid women divorc process thought ...,__label__2
97,watch receiv new rhomba excit open tri open g...,__label__1


In [9]:
tfidfcf_data

Unnamed: 0,ages.,track,book.I'm,complicated,Wicked,I,"school,",last,kept,them,...,"Utterly,",stay,All,"DVD,CD",one,like,became,unless,"thing,",countries
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.001246,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.003298,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.003930,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.003020,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001673,0.000000,0.0,0.0,0.0,0.0
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.005021,0.000000,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002175,0.000000,0.0,0.0,0.0,0.0


## Other vectorizations for comparison:

In [10]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

# import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
# from keras import layers, models, optimizers
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'][0:99])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x.text[0:99])
xvalid_count =  count_vect.transform(valid_x.text[0:99])

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
tfidf_vect.fit(trainDF['text'][0:99])
xtrain_tfidf =  tfidf_vect.transform(train_x.text[0:99])
xvalid_tfidf =  tfidf_vect.transform(valid_x.text[0:99])

Using TensorFlow backend.


In [11]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, y_valid_data, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, y_valid_data)

In [12]:
# Naive Bayes on tfidfcf
accuracy = train_model(naive_bayes.MultinomialNB(), np.array(tfidfcf_data), train_y[0:99], np.array(tfidfcf_data_valid),valid_y[0:99])
print("NB, WordLevel TF-IDF-CF: ", accuracy)

# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y[0:99], xvalid_count,valid_y[0:99])
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y[0:99], xvalid_tfidf,valid_y[0:99])
print("NB, WordLevel TF-IDF: ", accuracy)

NB, WordLevel TF-IDF-CF:  0.5353535353535354
NB, Count Vectors:  0.5959595959595959
NB, WordLevel TF-IDF:  0.5858585858585859


## Remarks :

custom TF-IDF-CF seems to be working poorly when compared to existing vectorizations but not actually bad .   
it might increase in performace with increase in data