In [0]:
from __future__ import unicode_literals

import glob
from collections import Counter

from hazm import *
import numpy as np
import pandas as pd

In [0]:
def dataTokenize(txtFile):
  """Read a text file and return its contents as a list of word tokens.

  The text is stripped of a fixed set of Persian/Latin punctuation marks,
  passed through hazm's ``Normalizer`` and then split with hazm's
  ``word_tokenize``.

  Args:
    txtFile: Path of the text file to read.

  Returns:
    The value returned by ``word_tokenize`` for the cleaned text
    (a list of token strings).

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
  # The context manager closes the handle even if reading fails; this function
  # is called once per corpus file, so leaked handles would pile up quickly.
  # The encoding is pinned so decoding does not depend on the machine's locale.
  # NOTE(review): UTF-8 is assumed from the "_utf.txt" suffix of the corpus
  # files seen in the saved output - confirm for other data sets.
  with open(txtFile, "rt", encoding="utf-8") as file:
    data = file.read()

  # Drop punctuation in a single pass; joining once avoids rebuilding the
  # string for every character.
  punctuations = set(':؛)([]!،.؟')
  noPunctData = "".join(char for char in data if char not in punctuations)

  normalizer = Normalizer()
  data = normalizer.normalize(noPunctData)
  dataTokens = word_tokenize(data)

  return dataTokens

In [0]:
def queryTokenize(query, stopWords = 'Yes'):
  """Normalize a query string and split it into word tokens.

  Args:
    query: The raw query text.
    stopWords: ``'No'`` removes a small fixed set of Persian stop words from
      the query before tokenizing ("no stop words in the result"). Any other
      value, including the default ``'Yes'``, keeps the query as it is.

  Returns:
    The value returned by hazm's ``word_tokenize`` for the normalized query
    (a list of token strings).
  """
  if stopWords == 'No':
    droppedWords = {'در', 'از', 'به', 'را', 'و', 'با'}
    # Filter whole space-separated words instead of replacing ' word '
    # substrings: the substring approach missed a stop word at the very start
    # or end of the query and the second of two identical stop words in a row.
    # Splitting on a single space keeps all other spacing exactly as it was.
    query = ' '.join(word for word in query.split(' ') if word not in droppedWords)

  normalizer = Normalizer()
  query = normalizer.normalize(query)
  queryTokens = word_tokenize(query)

  return queryTokens

In [6]:
# Rank every corpus document against each query with two scores and report the
# best document per score:
#   tf     = sum over query words present in the document of 1 + log10(count)
#   tf.idf = sum over query words of (1 + log10(count)) * log10(N / df)
# where N is the number of files in the document's folder and df is the number
# of files in that folder that contain the word.
queries = ["سهمیه کشورهای آسیایی در فیفا", "سرمربی تیم ملی ایران افشین قطبی", "دانشمند مطرح در زمینه سلولهای بنیادی", "دین و علم در کلام شهردار تهران"]
# NOTE(review): the saved output shows documents under
# "/content/gdrive/My Drive/Colab Notebooks/NLP/Data" - confirm this pattern
# points at the corpus in the environment where the notebook is run.
allFolders = glob.glob('/NLP/Data/*')

# Read and tokenize every file exactly once. Keeping only per-file term counts
# is enough for both scores, and avoids re-tokenizing the whole folder for
# every (document, query word) pair.
# The name "adresses" (folder -> list of file paths) is kept as spelled in
# case other cells refer to it.
adresses = {}
termCounts = {}
for folder in allFolders:
  adresses[folder] = glob.glob(folder + '/*')
  for txtFile in adresses[folder]:
    termCounts[txtFile] = Counter(dataTokenize(txtFile))

for query in queries:
  allScoresT = []
  allScoresS_tf = []
  allScoresS_tfidf = []
  queryTokens = queryTokenize(query, stopWords = 'No')
  # A repeated query word is scored once; first-seen order is kept so the
  # floating point sums are accumulated in a fixed order.
  queryWords = list(dict.fromkeys(queryTokens))

  for folder in allFolders:
    N = len(adresses[folder])
    if N == 0:
      # Nothing to score here (empty folder, or the entry is not a folder).
      continue

    # idf depends only on the folder and the word, so compute it once per
    # folder instead of once per document.
    idf = {}
    for word in queryWords:
      df = sum(1 for txtFile in adresses[folder] if word in termCounts[txtFile])
      # A word found in no file has TF == 0 everywhere; clamping df only
      # prevents a division by zero.
      idf[word] = np.log10(N / max(df, 1))

    for txtFile in adresses[folder]:
      count = termCounts[txtFile]  # Counter: a missing word counts as 0
      tf = 0
      tfidf = 0
      for word in queryWords:
        TF = 0
        if count[word] > 0:
          TF = 1 + np.log10(count[word])
          tf += TF
        tfidf += TF * idf[word]

      allScoresT.append(txtFile)
      allScoresS_tf.append(tf)
      allScoresS_tfidf.append(tfidf)

  print("Query: ", query)
  if not allScoresT:
    # np.argmax raises ValueError on an empty sequence; say what happened
    # instead of failing with an unrelated-looking error.
    print("No documents found in", len(allFolders), "folder(s); nothing to rank")
    print('----------------------------')
    continue

  allScoresT = np.asarray(allScoresT)
  allScoresS_tf = np.asarray(allScoresS_tf)
  allScoresS_tfidf = np.asarray(allScoresS_tfidf)
  # argmax returns the first maximum, so ties go to the document scored first.
  tfIndex = np.argmax(allScoresS_tf)
  tfidfIndex = np.argmax(allScoresS_tfidf)
  print("by tf: ", allScoresT[tfIndex])
  print("by tf.idf: ", allScoresT[tfidfIndex])
  print('tf = ' , allScoresS_tf[tfIndex])
  print('tfidf = ', allScoresS_tfidf[tfidfIndex])
  print('----------------------------')

Query:  سهمیه کشورهای آسیایی در فیفا
by tf:  /content/gdrive/My Drive/Colab Notebooks/NLP/Data/ورزشی/13840320-txt-0539122_utf.txt
by tf.idf:  /content/gdrive/My Drive/Colab Notebooks/NLP/Data/ورزشی/13840320-txt-0539122_utf.txt
tf =  5.146128035678238
tfidf =  2.579404448847027
----------------------------
Query:  سرمربی تیم ملی ایران افشین قطبی
by tf:  /content/gdrive/My Drive/Colab Notebooks/NLP/Data/ورزشی/13880319-txt-1353520_utf.txt
by tf.idf:  /content/gdrive/My Drive/Colab Notebooks/NLP/Data/ورزشی/13880319-txt-1353520_utf.txt
tf =  11.36078268987328
tfidf =  2.9809100171666443
----------------------------
Query:  دانشمند مطرح در زمینه سلولهای بنیادی
by tf:  /content/gdrive/My Drive/Colab Notebooks/NLP/Data/فناوري/13850903-txt-0830601_utf.txt
by tf.idf:  /content/gdrive/My Drive/Colab Notebooks/NLP/Data/فناوري/13850903-txt-0830601_utf.txt
tf =  4.556302500767288
tfidf =  2.404047171865991
----------------------------
Query:  دین و علم در کلام شهردار تهران
by tf:  /content/gdrive/My