In [1]:
import math
import pandas as pd
import random
from collections import Counter

In [2]:
class DataProcessing:
    @staticmethod
    def shuffle(x):
        for i in range(len(x)-1):
            j = random.randint(0,i)
            x.iloc[i], x.iloc[j] = x.iloc[j], x.iloc[i]
        return x
    
    @staticmethod
    def split(x):
        size = len(x)
        first_part = math.ceil(size*0.3) # 30% : 70%
        a = x[:first_part]
        b = x[first_part:]
        return a, b

class SoftSet:
    @staticmethod
    def classify(Y, sample):
        
        # sum normalized values
        match = {}
        for genre in Y: # for every genre in 0/1 set
            if genre not in match:
                match[genre] = 0
            for word in sample:        
                if word in Y[genre]:
                    match[genre] += sample[word] + Y[genre][word]
        
        # sorting
        best = {k: v for k, v in sorted(match.items(), key=lambda item: item[1], reverse=True)}
        
        # return 4 best tags
        return dict(list(best.items())[:4])
    
class Tools:
    
    @staticmethod
    def readAndCount(book_id):
        # read file
        words = ''
        try:
            f = open('out_processed/libgen/' + str(book_id) + '.txt')
            words = f.read()
            words = words.split()
            
        except FileNotFoundError: # if the file doesn't exist for some reason
            global db
            db = db.drop(db.loc[db['id'] == book_id].index)
        
        # count unique words
        count = {}
        for word in words:
            if word in count:
                count[word] += 1
            else:
                count[word] = 1
                
        # sorting
        count = {k: v for k, v in sorted(count.items(), key=lambda item: item[1], reverse=True)}
        
        return count
    
    @staticmethod
    def countWords(book_id): # for sample
        
        count = Tools.readAndCount(book_id)        
        count = dict(list(count.items())[0: 400]) # number of top words in a sample

        # normalize
        normalized = {}
        count_sum = sum(count.values())
        for word in count:
            normalized[word] = count[word]/count_sum
            
        return normalized
    
    @staticmethod
    def getSet(old_dict, book_id): # for training set
        
        count = Tools.readAndCount(book_id)        
        count = dict(list(count.items())[0: 400]) # number of top words for a given tag
        
        soft_normalized = {}
        count_sum = sum(count.values())
        for word in count:
            soft_normalized[word] = count[word]/count_sum
        
        # sum of the old dict and new one
        combined = dict(Counter(old_dict)+Counter(soft_normalized))
        combined = {k: v for k, v in sorted(combined.items(), key=lambda item: item[1], reverse=True)}
        combined = dict(list(combined.items())[0: 400]) # limit to 400

        return combined

In [3]:
db = pd.read_csv('db.csv', usecols=['id', 'general_tags'])
db = DataProcessing.shuffle(db)

# split multi-tags
db = db.assign(general_tags=db['general_tags'].str.split(';')).explode('general_tags')

validation, train = DataProcessing.split(db)

In [4]:
# top words, training set
soft_db = {}
for i in range(len(train)):
    try:
        if train.iloc[i][1] in soft_db:
            soft_db[train.iloc[i][1]] = Tools.getSet(soft_db[train.iloc[i][1]], train.iloc[i][0])
        else:
            soft_db[train.iloc[i][1]] = Tools.getSet({}, train.iloc[i][0])
    except:
        pass

In [5]:
# validation samples
samples = []
for i in range(len(validation)):
    try:
        samples.append(Tools.countWords(validation.iloc[i][0]))
    except:
        pass

In [6]:
correct = 0
for i in range(len(validation)):
    if validation.iloc[i].general_tags in SoftSet.classify(soft_db, samples[i]):
        correct += 1
        
accuracy = correct / len(validation) * 100
print("Accuracy:", round(accuracy, 2),"%")

Accuracy: 77.07 %
