# **Import All the Required Libraries here**

In [26]:
import os
import re
import math
import numpy as np
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 1.**Making class for Preprocessing Data**


In [27]:
class Preprocessing:
  """Loading and text-cleaning steps for the article corpus.

  Every cleaning method takes the list produced by the previous step,
  rewrites its elements in place and returns the same list object, except
  perform_lemmatization, which builds and returns a new list.
  """

  def load_data(self, data_dir="business"):
    """Read the body text of every usable document found in ``data_dir``.

    Side effect: rebinds the module-level ``doc`` list so that ``doc[i]`` is
    the result of ``soup.find("docno")`` for the document whose text is
    ``data[i]``.

    Parameters:
      data_dir: directory holding the documents (defaults to "business").

    Returns:
      list of str, one raw text per accepted document, in sorted file-name
      order.
    """
    global doc
    doc = list()
    data = list()
    for name in sorted(os.listdir(data_dir)):
      # index-type files are not documents
      if "index" in str(name):
        continue
      # entries that show up when listing files in colab; not part of the corpus
      if name == '.config' or name == 'sample_data':
        continue
      path = os.path.join(data_dir, name)
      # a sub-directory cannot be opened as a document
      if not os.path.isfile(path):
        continue
      # parse inside `with` so the file handle is always released
      with open(path, 'r') as f:
        soup = BeautifulSoup(f, "html.parser")
      required_data = soup.find("text")
      if required_data is None:
        # no <text> element: nothing to index for this file
        continue
      temp = required_data.text
      # skip documents that contain no letters or digits at all
      if re.search("[A-Za-z0-9]", temp) is None:
        continue
      # append to both lists together so data[i] and doc[i] stay aligned
      data.append(temp)
      doc.append(soup.find("docno"))
    return data

  def remove_num_punc(self, data):
    """Strip punctuation and digits from every document string (in place).

    Parameters:
      data: list of str.

    Returns:
      the same list, with each string cleaned.
    """
    for i, doc_text in enumerate(data):
      # drop everything that is neither a word character nor whitespace
      doc_text = re.sub(r'[^\w\s]', '', doc_text)
      # raw string: '\d' in a plain literal is an invalid escape sequence
      data[i] = re.sub(r'\d', '', doc_text)
    return data

  def process_token(self, data):
    """Replace every document string by its nltk word tokens (in place).

    Parameters:
      data: list of str.

    Returns:
      the same list, now holding one token list per document.
    """
    for i, doc_text in enumerate(data):
      data[i] = nltk.word_tokenize(doc_text)
    return data

  def convert_to_lower(self, data):
    """Lower-case every token of every document (in place).

    Parameters:
      data: list of token lists.

    Returns:
      the same list, with each token list replaced by its lower-cased copy.
    """
    for i, tokens in enumerate(data):
      data[i] = [word.lower() for word in tokens]
    return data

  def remove_stop_words(self, data):
    """Drop the nltk English stop words from every document (in place).

    Parameters:
      data: list of token lists.

    Returns:
      the same list, with stop words removed from each token list.
    """
    # a set gives constant-time membership tests; the word list itself is unchanged
    stop_words = set(stopwords.words('english'))
    for i, tokens in enumerate(data):
      data[i] = [word for word in tokens if word not in stop_words]
    return data

  def perform_lemmatization(self, data_without_stopwords):
    """Lemmatize every token as a verb (pos='v') with WordNetLemmatizer.

    Parameters:
      data_without_stopwords: list of token lists.

    Returns:
      a new list of token lists; the input is left untouched.
    """
    lemmatizer = WordNetLemmatizer()
    return [
      [lemmatizer.lemmatize(word, pos='v') for word in word_list]
      for word_list in data_without_stopwords
    ]
  

 

# 2.Making use of Preprocessing class

In [28]:
# `doc` is rebound by load_data() with one docno entry per loaded document
doc = list()

# one Preprocessing instance drives every step below
obj = Preprocessing()

# raw text -> punctuation and digits stripped
data = obj.remove_num_punc(obj.load_data())

# nltk tokenisation of every document
nltk_tokenize = obj.process_token(data)

# lower-case the tokens, then drop the English stop words
data = obj.remove_stop_words(obj.convert_to_lower(nltk_tokenize))

# verb lemmatisation; returns a fresh list of token lists
lemmatized_data = obj.perform_lemmatization(data)

print(lemmatized_data)


[['telegraph', 'calcutta', 'business', 'corporate', 'brief', 'kanoria', 'chemicals', 'industries', 'invest', 'rs', 'crore', 'set', 'mw', 'power', 'plant', 'chemical', 'unit', 'target', 'rs', 'crore', 'turnover', 'thermal', 'power', 'plant', 'would', 'set', 'outlay', 'rs', 'crore', 'chloralkali', 'plant', 'would', 'set', 'cost', 'rs', 'crore', 'chairman', 'manage', 'director', 'r', 'v', 'kanoria', 'say', 'hummingbird', 'ltd', 'lead', 'global', 'provider', 'integrate', 'enterprise', 'content', 'management', 'ecm', 'solutions', 'launch', 'comprehensive', 'content', 'library', 'consolidation', 'solution', 'law', 'firm', 'part', 'enterprise', 'content', 'integration', 'solution', 'solution', 'design', 'help', 'law', 'firm', 'library', 'consolidation', 'efforts', 'minimise', 'time', 'require', 'consolidation', 'ensure', 'complete', 'data', 'integrity', 'availability', 'throughout', 'process', 'canon', 'image', 'technology', 'company', 'draw', 'plan', 'capture', 'per', 'cent', 'rapidly', 'gro

After preprocessing is complete, we can move on to fitting and transforming the text into vectors. The preprocessed data is stored in the `lemmatized_data` variable, whose structure is as follows:

lemmatized_data = [ [document1] , [document2] , ....                  [documentn] ]
where each document within the list contains the preprocessed list of words contained in that particular document



# 3.**Making custom class for TF-IDF**

The following is the implementation of the `fit` and `transform` functions. `fit` returns the term-document (TF-IDF) matrix for the corpus, while `transform` takes a single document as its argument and returns the term-document (TF-IDF) matrix of that particular document.

In [29]:
class TFIDF:
  """Hand-written TF-IDF vectorizer over pre-tokenised documents.

  tf(word, document)  = occurrences of word in document / tokens in document
  idf(word)           = log10(number of documents / documents containing word)

  The vocabulary and the idf values are taken from the corpus given to the
  constructor (``self.data``); no module-level variable is consulted.
  """

  def __init__(self, pre_processeddata):
    """
    Parameters:
      pre_processeddata: list of documents, each a list of tokens.
    """
    self.data = pre_processeddata
    # unique words over all the documents
    self.unique_list = self.extractUnique(pre_processeddata)
    # sorted vocabulary; column j of every matrix refers to feature_list[j]
    self.feature_list = sorted(self.unique_list)
    # word -> column index
    self.unique_dict_words = {word: idx for idx, word in enumerate(self.feature_list)}
    # one row per corpus document; filled in by fit()
    self.term_document_matrix = np.zeros((len(self.data), len(self.unique_dict_words)))
    # placeholder until fit() computes the real idf values
    self.idf_matrix = np.zeros((1, len(self.unique_dict_words)))

  def fit(self, documents):
    """Compute idf over the constructor corpus and return tf-idf for ``documents``.

    Parameters:
      documents: list of token lists.

    Returns:
      numpy array with one row per document and one column per vocabulary
      word; also stored in ``self.term_document_matrix``.
    """
    term_frequency_matrix = np.array(self.find_term_matrix(documents), dtype=float)
    self.idf_matrix = self.find_idf_matrix()
    # numpy broadcasts the idf vector across every document row
    self.term_document_matrix = np.multiply(term_frequency_matrix, self.idf_matrix)
    return self.term_document_matrix

  def transform(self, document):
    """Return the tf-idf weights of ``document`` using the current idf values.

    Parameters:
      document: one token list (a list of token lists is also accepted).
        The document does not have to belong to the fitted corpus.

    Returns:
      numpy array with one row per document passed in.
    """
    tf_matrix = np.array(self.find_term_matrix(document), dtype=float)
    return np.multiply(tf_matrix, self.idf_matrix)

  def find_term_matrix(self, documents):
    """Term frequencies for one document or for a list of documents.

    Parameters:
      documents: a single token list, or a list of token lists. Empty input
        is treated as a single empty document.

    Returns:
      list of rows (one per document); each row lists the term frequency of
      every word of ``self.feature_list``.
    """
    # a single document is recognised by its first element not being a list
    if len(documents) == 0 or not isinstance(documents[0], list):
      documents = [documents]
    return [self._term_frequencies(document) for document in documents]

  def _term_frequencies(self, document):
    """Term-frequency row of a single token list."""
    total_words = len(document)
    if total_words == 0:
      # an empty document has no terms; avoid dividing by zero
      return [0.0] * len(self.feature_list)
    # count every token once instead of scanning the document per vocabulary word
    counts = {}
    for word in document:
      counts[word] = counts.get(word, 0) + 1
    return [counts.get(word, 0) / total_words for word in self.feature_list]

  def find_idf_matrix(self):
    """idf value of every vocabulary word, in ``self.feature_list`` order.

    Returns:
      list of floats computed as math.log10(N / document_frequency), where N
      is the number of documents in ``self.data``.
    """
    total_documents = len(self.data)
    # number of documents containing each word (each document counted once per word)
    document_frequency = {}
    for document in self.data:
      for word in set(document):
        document_frequency[word] = document_frequency.get(word, 0) + 1
    return [
      math.log10(total_documents / float(document_frequency[word]))
      for word in self.feature_list
    ]

  def extractUnique(self, data):
    """Return the distinct words of all documents as a list (unordered)."""
    # a set keeps one copy of each word; callers expect a list back
    return list({word for doc_value in data for word in doc_value})




# **Creating an object of the TFIDF class and then calling the fit and transform functions using that object**

In [30]:
# Build the TF-IDF model on the preprocessed corpus; the constructor derives the sorted vocabulary (feature_list).
obj_tfidf = TFIDF(lemmatized_data) #created object of TFIDF class


In [31]:
# fit() computes the idf values and returns the TF-IDF matrix, which is also kept on obj_tfidf.term_document_matrix.
term_doc_matrix = obj_tfidf.fit(lemmatized_data) #calling fit function
 
# rows = documents, columns = vocabulary words
print(term_doc_matrix.shape)

(16, 1578)


Passing a single document to the `transform` function, which must return the term-document matrix of that particular document.

In [32]:
# transform() takes one tokenised document and returns its TF-IDF weights as a single-row array.
# Note: this rebinds `data`, which previously held the stop-word-filtered corpus.
data = obj_tfidf.transform(lemmatized_data[0]) #calling transform function.
print(data.shape)

(1, 1578)


# **4. Using the sklearn TfidfVectorizer**

In [33]:

# Reload the raw (un-preprocessed) text; load_data() also rebuilds the global `doc`
# list, so doc[i] is the docno entry of data[i].
data = obj.load_data()

# sklearn does its own tokenising, so it is fed the raw document strings
corpus = list(data)
# string form of each docno entry, used as the row labels of the DataFrame
document_array = [str(doc[i]) for i in range(len(data))]

# word-level analyzer with sklearn's built-in English stop-word list
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
# sparse term-document matrix: one row per document, one column per token
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
  tfidf_tokens = vectorizer.get_feature_names_out()
else:
  tfidf_tokens = vectorizer.get_feature_names()

# dense table: index = document ids, columns = tokens, values = tf-idf scores
df_tfidfvect = pd.DataFrame(data=X.toarray(), index=document_array, columns=tfidf_tokens)
print(X.shape)


(16, 1932)


Here we can see that the sklearn TF-IDF vectorizer generates more unique words (1932) than our custom approach (1578). One reason is that the built-in vectorizer does not lemmatize the data, so for a single root word there may be more than one word belonging to the same word family. For example, if the documents contain both **"manage"** and **"managing"**, our method lemmatizes **"managing"** (as a verb) to **"manage"**, producing only the one root word **"manage"**, whereas the sklearn TF-IDF vectorizer performs no lemmatization and treats the two words as different. In addition, our preprocessing removes digits, while sklearn keeps numeric tokens such as "11" and "12". Therefore we see more words in the sklearn TF-IDF vectorizer than in the custom-made TF-IDF vectorizer.

# **5. Showing the top 5 words representing each of the first five documents using the sklearn TfidfVectorizer**

In [34]:
# string ids of the first five documents; these are the row labels of df_tfidfvect
first_five_doc = [str(doc_tag) for doc_tag in doc[:5]]

# rows of the sklearn tf-idf table for those five documents
required_data_frame = df_tfidfvect.head(5)


for i in range(5):
  # one-row frame holding only the i-th document
  one_row = required_data_frame.iloc[i:i + 1]
  # order the word columns by this document's score (highest first) and keep the first five
  dfObj = one_row.sort_values(by=first_five_doc[i], ascending=False, axis=1).iloc[:, 0:5]
  dataframe = pd.DataFrame(dfObj)
  display(dataframe)


Unnamed: 0,consolidation,content,idbi,enterprise,crore
<docno>1040901_business_story_3700171.utf8</docno>,0.181529,0.181529,0.181529,0.158089,0.157629


Unnamed: 0,policy,trade,foreign,export,exports
<docno>1040901_business_story_3700827.utf8</docno>,0.298825,0.244493,0.195369,0.162808,0.162808


Unnamed: 0,centre,patni,professionals,financial,chennai
<docno>1040901_business_story_3701515.utf8</docno>,0.375359,0.331264,0.198759,0.198759,0.173094


Unnamed: 0,bharat,petro,corporation,refineries,kochi
<docno>1040901_business_story_3701518.utf8</docno>,0.4327,0.361351,0.211189,0.180675,0.180675


Unnamed: 0,11,12,01,10,16
<docno>1040901_business_story_3701887.utf8</docno>,0.304561,0.279866,0.207941,0.202783,0.198489


Above, the top 5 words for each of the first 5 documents are shown. For each document, the word in the first column has rank 1, the word in the second column has rank 2, and so on up to the word in the fifth column. The value in each cell is the corresponding TF-IDF score.

# **5.1 Showing the top 5 words representing each of the first 5 documents using the custom approach**

In [35]:
# TF-IDF matrix and vocabulary produced by the custom vectorizer (read once, outside the loop)
custom_tfidf_matrix = obj_tfidf.term_document_matrix
custom_features = obj_tfidf.feature_list

# show at most the first five documents; a smaller corpus no longer raises IndexError
for doc_index in range(min(5, len(custom_tfidf_matrix))):
  doc_scores = custom_tfidf_matrix[doc_index]
  # pair every word with its score and order by score, highest first;
  # sorted() is stable, so tied words keep their vocabulary (alphabetical) order
  ranked_words = sorted(zip(custom_features, doc_scores), key=lambda pair: pair[1], reverse=True)
  first_five = ranked_words[0:5]
  print("\n\ndocument=",doc[doc_index])
  # rows are collected under their own name so the sklearn matrix `X` is not overwritten
  top_rows = [[rank, word, tf_idf] for rank, (word, tf_idf) in enumerate(first_five, start=1)]
  dframe = pd.DataFrame(data = top_rows,columns = ["rank","word","tf-idf-score"])
  print("\n")
  display(dframe)
  




document= <docno>1040901_business_story_3700171.utf8</docno>




Unnamed: 0,rank,word,tf-idf-score
0,1,consolidation,0.014566
1,2,content,0.014566
2,3,idbi,0.014566
3,4,award,0.010924
4,5,enterprise,0.010924




document= <docno>1040901_business_story_3700827.utf8</docno>




Unnamed: 0,rank,word,tf-idf-score
0,1,policy,0.013553
1,2,export,0.013216
2,3,foreign,0.010639
3,4,competitiveness,0.008811
4,5,earn,0.008811




document= <docno>1040901_business_story_3701515.utf8</docno>




Unnamed: 0,rank,word,tf-idf-score
0,1,patni,0.033448
1,2,centre,0.026758
2,3,financial,0.020069
3,4,professionals,0.020069
4,5,chennai,0.015051




document= <docno>1040901_business_story_3701518.utf8</docno>




Unnamed: 0,rank,word,tf-idf-score
0,1,bharat,0.034137
1,2,petro,0.028965
2,3,kochi,0.016551
3,4,refineries,0.016551
4,5,corporation,0.01499




document= <docno>1040901_business_story_3701887.utf8</docno>




Unnamed: 0,rank,word,tf-idf-score
0,1,uti,0.061606
1,2,equity,0.033603
2,3,hdfc,0.02987
3,4,income,0.02987
4,5,mip,0.02987


# Observation
As we can see, the results of the sklearn library and of the custom method vary slightly, but not to a great extent. The difference is mainly due to the different preprocessing approach and the different formula used to calculate the inverse document frequency (sklearn also L2-normalizes each document row by default, which is why the absolute scores differ).