In [62]:
import math
import random
from collections import Counter

import numpy as np
import pandas as pd

In [296]:
class DataProcessing:
    """Helpers for preparing the book table before classification."""

    @staticmethod
    def shuffle(x):
        """Randomly permute the rows of ``x`` in place and return it.

        Rows are exchanged positionally through ``.iloc``, so the index
        labels stay where they are while the row contents move. Randomness
        comes from the standard ``random`` module, so calling
        ``random.seed(...)`` beforehand makes the result reproducible.

        Args:
            x: Object supporting ``len()`` and positional ``.iloc`` get/set
                (e.g. a pandas DataFrame or Series).

        Returns:
            The same object ``x``, with its rows shuffled.
        """
        # Forward Fisher-Yates: position i trades places with a uniformly
        # chosen position in [0, i]. The loop must reach the final position,
        # otherwise the last row could never move.
        for i in range(1, len(x)):
            j = random.randint(0, i)
            if j == i:
                continue
            # Keep an explicit copy of row i: .iloc may hand back a view of
            # the underlying data, in which case a plain tuple swap would
            # overwrite the row before it is written to position j.
            held = x.iloc[i]
            if hasattr(held, "copy"):
                held = held.copy()
            x.iloc[i] = x.iloc[j]
            x.iloc[j] = held
        return x

class SoftSet:
    """Scores a word-weight sample against per-genre word collections."""

    @staticmethod
    def classify(Y, sample):
        """Score every genre in ``Y`` against ``sample``, best match first.

        A genre's score is the sum of ``sample[word]`` over every word of
        ``sample`` that is contained in ``Y[genre]``.

        Args:
            Y: Mapping of genre -> container of words (anything supporting
                the ``in`` operator).
            sample: Mapping of word -> numeric weight.

        Returns:
            dict of genre -> score, ordered by descending score. Genres with
            equal scores keep the order in which ``Y`` yields them.
        """
        # sum normalized values
        match = {}
        for genre in Y:  # for every genre in 0/1 set
            vocabulary = Y[genre]
            # Membership tests on a list/tuple are linear; build a hash set
            # once per genre so each lookup is constant time. Other container
            # types (str, dict, set, ...) are left alone so their own ``in``
            # semantics are preserved.
            if isinstance(vocabulary, (list, tuple)):
                try:
                    vocabulary = frozenset(vocabulary)
                except TypeError:
                    # Unhashable members: fall back to the original container.
                    pass

            # Explicit += (rather than sum()) keeps the floating-point
            # accumulation order, and therefore the exact scores, unchanged.
            score = match.get(genre, 0)
            for word in sample:
                if word in vocabulary:
                    score += sample[word]
            match[genre] = score

        # sorted() is stable, so ties stay in first-seen order.
        ranked = sorted(match.items(), key=lambda item: item[1], reverse=True)

        # TODO return highest values (the full ranking is returned for now)
        return dict(ranked)


In [289]:
# Load only the 'id' and 'general_tags' columns of the book table, then
# randomise the row order.
db = DataProcessing.shuffle(
    pd.read_csv('db.csv', usecols=['id', 'general_tags'])
)

In [266]:
class Tools:
    """Reads pre-processed book texts and derives word statistics from them."""

    @staticmethod
    def readAndCount(book_id):
        """Count how often each word occurs in one book's processed text.

        Reads ``out_processed/libgen/<book_id>.txt`` and splits it on
        whitespace.

        Side effect: when that file does not exist, every row of the
        module-level ``db`` whose ``'id'`` equals ``book_id`` is dropped
        (``db`` is rebound to the reduced frame) and an empty dict is
        returned. This is marked as a temporary workaround.

        Args:
            book_id: Identifier used to build the file name.

        Returns:
            dict of word -> occurrence count, ordered by descending count;
            words with equal counts keep their order of first appearance.
        """
        global db

        path = 'out_processed/libgen/' + str(book_id) + '.txt'
        try:
            # The context manager closes the handle even if read() fails.
            with open(path) as f:
                words = f.read().split()
        except FileNotFoundError: # temporary
            db = db.drop(db.loc[db['id'] == book_id].index)
            words = []

        # most_common() is a stable sort by count, descending; the same
        # ordering as sorting the items with reverse=True.
        return dict(Counter(words).most_common())

    @staticmethod
    def countWords(book_id, top_n=1000):
        """Return the relative frequencies of a book's most frequent words.

        Args:
            book_id: Identifier passed on to ``readAndCount``.
            top_n: How many of the most frequent words to keep
                (``None`` keeps all of them).

        Returns:
            dict of word -> count divided by the total count of the kept
            words, in descending-count order. Empty when the book has no
            words or its file is missing.
        """
        count = Tools.readAndCount(book_id)
        top = dict(list(count.items())[0:top_n])

        # normalize
        total = sum(top.values())
        return {word: occurrences / total for word, occurrences in top.items()}

    @staticmethod
    def getSet(book_id, top_n=1000):
        """Return a book's most frequent words as a list, most frequent first.

        Args:
            book_id: Identifier passed on to ``readAndCount``.
            top_n: How many of the most frequent words to keep
                (``None`` keeps all of them).

        Returns:
            list of words; empty when the book has no words or its file is
            missing.
        """
        count = Tools.readAndCount(book_id)
        return list(count.keys())[0:top_n]

In [297]:
# list of all genres, in order of first appearance
genres = list(db['general_tags'].unique())

n = 400 # temp limit

# Snapshot (id, tags) for the first n rows *before* any text is read.
# Tools.readAndCount drops rows from the global `db` when a book's file is
# missing; indexing `db` by position while that happens shifts the remaining
# rows, skipping books and misaligning the results. Slicing also copes with
# `db` holding fewer than n rows. Columns are addressed by label because
# integer positional lookup on a label-indexed row Series is deprecated.
books = list(zip(db['id'].iloc[:n], db['general_tags'].iloc[:n]))

# normalized top words of all books aka samples
samples = [Tools.countWords(book_id) for book_id, _ in books]

# 0/1 top words with tags aka soft db
# NOTE(review): books sharing the same tag string overwrite each other here
# (the last one wins) -- confirm whether the word lists should be merged.
soft_db = {}
for book_id, tags in books:
    words = Tools.getSet(book_id)
    if words:  # an unreadable/empty book must not wipe out a genre's words
        soft_db[tags] = words

In [294]:
# Peek at the first row of db.
db.head(1)

Unnamed: 0,id,general_tags
0,603,Mathematics


In [295]:
# Score the first sample against every key of soft_db; the result maps each
# key to its score, ordered from highest to lowest.
SoftSet.classify(soft_db, samples[0])

{'Mathematics': 0.7751923387313716,
 'Technology': 0.7533760154947015,
 'Education;Education': 0.7424275030935611,
 'Medicine': 0.6916527680637009,
 'Education;Psychology;Psychology;Psychology;Psychology;Psychology;Psychology;Psychology;Psychology;Psychology': 0.6754048528541459,
 'Technology;Technology;Technology;Technology': 0.6662048743745633,
 'Business;Business': 0.6642411362780446,
 'Psychology': 0.6327271749071938,
 'Art': 0.6246435680852215,
 'Medicine;Medicine;Medicine;Medicine': 0.6119868725453277,
 'Computers': 0.6101172862753537,
 'Chemistry': 0.6078845429601338,
 'Economy': 0.6019798784096417,
 'Business': 0.6019126271049663,
 'Physical Educ. and Sport;Physical Educ. and Sport': 0.580768816915049,
 'Geology': 0.5618308495184814,
 'Biology': 0.5581723785441438,
 'Business;Education': 0.5551057190509502,
 'Housekeeping': 0.5504653790283538,
 'Education': 0.5334642492064349,
 'Physics': 0.5278420401355782,
 'Technology;Technology;Technology;Technology;Technology;Technology;Te