In [1]:
import requests
from utils.dbobjects import *
from utils.utils import *
from urllib.request import urlretrieve
import patoolib
from comsear import ComicVineClient
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError
import pprint
import functools
import json
import shutil
import re
import os
import sys
from pathlib import Path
import datetime
import numpy as np
# ComicVine credentials and library root.
# The key is read from the COMICVINE_API_KEY environment variable when it is set,
# so it does not have to live in the notebook.
# NOTE(review): the literal fallback below is a secret committed to source; it is
# kept only so existing runs keep working and should be revoked and removed once
# the environment variable is configured.
api_key = os.environ.get('COMICVINE_API_KEY') or 'be9301c9c1770a0c729635a06a4513ad9d95410c'
cv = ComicVineClient(api_key)
root = './data/'
from tqdm.auto import tqdm
from IPython.display import clear_output

In [2]:
class LibraryParser:
    """Scans a comic library folder and derives the volume names to look up."""

    def __init__(self, library_path):
        # Root folder of the library, stored as a Path so it can be globbed.
        self.library_path = Path(library_path)
        # Counters; no method of this class populates them yet.
        self.n_comics = None
        self.n_volumes = None

    def scan_root_folder(self):
        """Return the volume names implied by the folders that hold comic archives.

        Every folder under ``self.library_path`` that directly contains a
        ``.cbr`` or ``.cbz`` file is considered. When the folder's own name
        contains 'Issue' or 'TPB' it is treated as a container and the parent
        folder's name is used instead.

        Returns:
            list[str]: one name per comic-holding folder, in sorted folder
            order. Duplicates are kept (e.g. a volume with both an 'Issues'
            and a 'TPB' sub-folder appears twice).
        """
        # Sorted so the result is deterministic; a bare set has arbitrary order.
        folders = sorted({
            comic_file.parent
            for pattern in ('*.cbr', '*.cbz')
            for comic_file in self.library_path.rglob(pattern)
        })
        to_search = []
        for folder in folders:
            if 'Issue' in folder.name or 'TPB' in folder.name:
                to_search.append(folder.parent.name)
            else:
                to_search.append(folder.name)
        return to_search
        
        

In [3]:
class ComicParser():
    """Extracts name, year and issue number from a single comic file name."""

    def __init__(self, path):
        self.comic_name = None      # file name without directory or extension
        self.comic_year = None      # int year taken from "(YYYY", or None if absent
        self.issue_number = None    # list of issue-number strings found in the name
        self.comic_path = path
        self.comic_metadata = {}
        self.comic_initiated = False

    def parseComic(self):
        """Populate ``comic_name``, ``comic_year`` and ``issue_number`` from ``comic_path``.

        ``comic_year`` is the first four-digit year (starting with 1 or 2) that
        directly follows an opening bracket; it stays ``None`` when the name
        has no such year. ``issue_number`` is the list returned by
        ``re.findall`` (possibly empty), kept as a list for compatibility.
        """
        self.comic_name = os.path.splitext(os.path.basename(self.comic_path))[0]
        # The pattern captures exactly four digits, so no length check is needed.
        years = re.findall(r'\(([12]\d{3})', self.comic_name)
        if years:
            self.comic_year = int(years[0])
        # Get the digits or digits with point number (19.1) or digits followed by
        # letters (19.INH) either before "(of" or before a year.
        # Matched against the file name only: running it over the full path lets
        # parent folders such as "Thor 2 (1998)" contribute bogus issue numbers.
        self.issue_number = re.findall(
            r"(\d+\.?\d?[a-zA-Z]{0,3}?)(?:\s*\(of|\s*\([12]\d{3}\))",
            self.comic_name,
        )

In [4]:
class VolumeParser:
    """Matches one local volume folder against ComicVine and builds its DB records.

    Call order used in this notebook: parseVolume -> searchQuery ->
    findBestSearch -> confirmResult -> fetchVolumeMetadata ->
    fetchComicsMetadata -> commit_to_db.
    """

    def __init__(self, path):
        self.volume_name = None
        self.type = None #TPB or Single Issues
        self.volume_path = path
        self.volume_initiated = False
        # 'comics_in_folder' maps comic name -> ComicParser; further keys
        # ('from_file_volume_name', 'best_search', 'detailed_meta') are added
        # by the methods below.
        self.volumeDict = {'comics_in_folder' : {} }
        self.volumeFetch = True
        self.volumeTable = None
        self.searchReturned = None
        self.HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:7.0) '
                     'Gecko/20130825 Firefox/36.0'}
        self.params = {'api_key':api_key, 'format': 'json'}

    # ------------------------------------------------------------------ helpers

    def _comic_years(self):
        """Return the non-empty ``comic_year`` values of the parsed comics."""
        return [comic.comic_year
                for comic in self.volumeDict['comics_in_folder'].values()
                if comic.comic_year]

    @staticmethod
    def _to_int(value):
        """Convert ``value`` to int, or return None when it is missing or non-numeric."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _strip_html(text):
        """Remove everything within angle brackets; None/empty text is passed through."""
        if not text:
            return text
        return re.sub(r'<[^>]*>', '', text)

    @staticmethod
    def _download_image(url, directory, overwrite=False):
        """Download ``url`` into ``directory`` (created if missing) and return the local path."""
        os.makedirs(directory, exist_ok=True)
        img_path = os.path.join(directory, os.path.basename(url))
        if overwrite or not os.path.isfile(img_path):
            urlretrieve(url, img_path)
        return img_path

    def _fetch_results(self, url):
        """GET a ComicVine detail URL and return the 'results' part of its JSON body."""
        response = requests.get(url, headers=self.HEADERS, params=self.params)
        # Fail with the HTTP error rather than a confusing KeyError/JSON error later.
        response.raise_for_status()
        return response.json()['results']

    # --------------------------------------------------------------- public API

    def parseVolume(self):
        """Derive the volume name from the folder name and parse every comic file in it.

        Comics are read from an 'Issues' sub-folder when one exists, otherwise
        from the volume folder itself. The volume name is the folder name up to
        the first ' (' (e.g. 'Thor Vol. 2 (1998-2004)' -> 'Thor Vol. 2').
        """
        if 'Issues' in os.listdir(self.volume_path):
            comic_dir = os.path.join(self.volume_path, 'Issues')
        else:
            comic_dir = self.volume_path
        # normpath drops a trailing slash, which would otherwise make basename ''.
        # NOTE(review): the original comment mentioned an unresolved issue with
        # forward slashes in comic paths; only the trailing-slash case is handled here.
        folder_name = os.path.basename(os.path.normpath(self.volume_path))
        self.volume_name = folder_name.split(' (')[0]

        self.volumeDict['from_file_volume_name'] = self.volume_name
        for entry in os.listdir(comic_dir):
            if not entry.startswith(".") and check_comic(entry):
                comic = ComicParser(os.path.join(comic_dir, entry))
                comic.parseComic()
                self.volumeDict['comics_in_folder'].update({comic.comic_name : comic})

        print(str(len(self.volumeDict['comics_in_folder']))+' comics added in volume ' + self.volume_name +'.')

    def searchQuery(self,api_key):
        """
        Search Comic Vine Servers

        The query is the volume name followed by the earliest year found in the
        comic file names (when any). Results are stored in ``self.searchReturned``.
        ``api_key`` is accepted for compatibility but not used here: the search
        goes through the module-level ``cv`` client.
        """
        print("Fetching from Comic Vine servers")
        years = self._comic_years()
        searchtext = self.volumeDict['from_file_volume_name']
        if years:
            # Earliest year instead of years[0]: directory listing order is
            # arbitrary, so the first entry made the query non-deterministic.
            searchtext += (' '+str(min(years)))
        print('Querying for '+searchtext)
        firstresponse = cv.search(searchtext, resources=['volume']) #Search for volume name with year
        self.searchReturned = firstresponse.results

    def findBestSearch(self):
        """
        Pick the best candidate among the returned volumes.

        Each result gets a 'match_score': 30 for being the first result, 10 when
        its issue count equals the number of comic files in the folder (assumed
        to be the full volume), and 40 when its start year falls inside the year
        range seen in the file names. The winner is stored in
        ``self.volumeDict['best_search']``.

        Raises:
            ValueError: when there are no search results to rank.
        """
        print("Finding best match")
        if not self.searchReturned:
            raise ValueError('No search results to rank; run searchQuery() first.')
        expected_num_comics = len(self.volumeDict['comics_in_folder'])
        existing_comics = list(self.volumeDict['comics_in_folder'].keys())
        years = self._comic_years()

        # Disabled per-issue title comparison for poorly scoring results.
        # NOTE(review): `fuzz` is not imported in the visible cells (presumably
        # it comes from the utils star import) - confirm before enabling.
        deep_check = False
        best_index = 0
        for indx, each_result in enumerate(self.searchReturned):
            # First result returned is the best result.
            each_result['match_score'] = 30.0 if indx == 0 else 0.0
            if expected_num_comics == each_result.get('count_of_issues'):
                each_result['match_score'] += 10.0
            # start_year may be missing/None in the API response.
            start_year = self._to_int(each_result.get('start_year'))
            if years and start_year is not None and min(years) <= start_year <= max(years):
                each_result['match_score'] += 40.0

            if deep_check and each_result['match_score'] <= 40:
                issues = self._fetch_results(each_result['api_detail_url'])['issues']
                titles_in_vol = [each['name'] for each in issues]
                each_result['match_score'] += fuzz.token_set_ratio(titles_in_vol, existing_comics)

            # NOTE(review): `>=` lets a later result win a tie even though the
            # results are assumed to arrive sorted by relevance; kept as-is.
            if each_result['match_score'] >= self.searchReturned[best_index]['match_score']:
                best_index = indx
        self.volumeDict['best_search'] = self.searchReturned[best_index]
        print('Match Score : '+ str(self.volumeDict['best_search']['match_score']))

    def confirmResult(self):
        """Show the best match and let the user confirm it or pick another result.

        When the user rejects the proposal, all results are listed and a
        1-based choice is requested until one is confirmed; the confirmed choice
        replaces ``self.volumeDict['best_search']``.
        """
        print_com_meta(self.volumeDict['best_search'], 'Best Result:')
        confirmation = query_yes_no("Is the information correct?")
        choice = None
        # Upper bound follows the real number of results; a fixed 10 allowed
        # out-of-range picks when fewer results came back.
        n_results = len(self.searchReturned)
        while True:
            clear_output()
            if not confirmation:
                for idx, each in enumerate(self.searchReturned):
                    print_com_meta(each, idx+1)
                choice = sanitised_input("Please make a manual choice between 1-" + str(n_results) + " : ", int, 1, n_results)
                print_com_meta(self.searchReturned[choice-1], 'Your Choice : ')
                confirmation = query_yes_no("Is the information correct?")
            else:
                if choice is not None:
                    self.volumeDict['best_search'] = self.searchReturned[choice-1]
                break

    def fetchVolumeMetadata(self):
        """Fetch the detailed record and cover of the chosen volume and build its ``Volume`` row.

        Stores the raw record in ``self.volumeDict['detailed_meta']`` and the
        ``Volume`` object in ``self.volumeTable``; ``volume_initiated`` is set
        only after the object was built successfully.
        """
        meta = self._fetch_results(self.volumeDict['best_search']['api_detail_url'])
        self.volumeDict['detailed_meta'] = meta
        image_url = meta['image']['original_url']
        # The volume cover is always re-downloaded, as before.
        img_path = self._download_image(image_url, './Resources/Images/VolumeArt/', overwrite=True)
        publisher = meta['publisher']
        volume = Volume(
                        id = meta['id'],
                        name = meta['name'],
                        aliases = meta['aliases'],
                        count_of_issues = meta['count_of_issues'],
                        date_added = meta['date_added'],
                        date_last_updated = meta['date_last_updated'],
                        deck = meta['deck'],
                        description = self._strip_html(meta['description']), #remove all within angle brackets
                        publisher = publisher['name'] if publisher else None, #Later to be changed to publisher object
                        start_year = meta['start_year'],
                        comicvine_api_detail_url = meta['api_detail_url'],
                        comicvine_image = image_url, #image
                        comicvine_site_detail_url = meta['site_detail_url'],
                        local_path = self.volume_path,
                        local_image_path = img_path,
                        #character_credits = Column(String)
                        #concept_credits = Column(String)
                        #team_credits = Column(String)
                        #location_credits = Column(String)
                        #object_credits = Column(String)
                        #person_credits = Column(String)
                        )
        self.volumeTable = volume
        self.volume_initiated = True

    def fetchComicsMetadata(self, api_key):
        """Fetch every issue of the volume, download missing covers and attach ``Comic`` rows.

        Requires ``fetchVolumeMetadata`` to have run (reads
        ``self.volumeDict['detailed_meta']`` and writes ``self.volumeTable.comics``).
        ``api_key`` is accepted for compatibility; requests use ``self.params``.
        """
        comic_fetch_loop = tqdm(self.volumeDict['detailed_meta']['issues'])
        comicObjects = []
        issue_directory = './Resources/Images/IssueArt/'
        for each_issue in comic_fetch_loop:
            per_issue_response = self._fetch_results(each_issue['api_detail_url'])
            image_url = per_issue_response['image']['original_url']
            # Fetch image only if it does not exist locally yet.
            img_path = self._download_image(image_url, issue_directory)

            comic = Comic(id = per_issue_response['id'],
                          name = per_issue_response['name'],
                          aliases = per_issue_response['aliases'],
                          deck = per_issue_response['deck'],
                          # description can be None; re.sub would raise on it.
                          description = self._strip_html(per_issue_response['description']),
                          cover_date = per_issue_response['cover_date'],
                          date_added = per_issue_response['date_added'],
                          date_last_updated = per_issue_response['date_last_updated'],
                          issue_number = per_issue_response['issue_number'],
                          comicvine_api_detail_url = per_issue_response['api_detail_url'],
                          comicvine_image = image_url,
                          local_image_path = img_path
            )
            comicObjects.append(comic)
            comic_fetch_loop.set_postfix(Status='Added '+str(comic.name))
        self.volumeTable.comics=set(comicObjects)

    def commit_to_db(self, dbpath='sqlite:///comicdb.db'):
        """Create the tables if needed and commit ``self.volumeTable``.

        Args:
            dbpath: SQLAlchemy database URL.

        Returns:
            bool: True when the commit succeeded, False when it was rolled back
            because of an ``IntegrityError``.
        """
        engine = create_engine(dbpath, echo=False)
        try:
            Base.metadata.create_all(engine)
            Session = sessionmaker(bind=engine)
            session = Session()
            try:
                session.add(self.volumeTable)
                session.commit()
                return True
            except IntegrityError as err:
                session.rollback()
                # Report instead of swallowing silently; still non-fatal.
                print('Volume not committed (integrity error): ' + str(err))
                return False
            finally:
                session.close()
        finally:
            engine.dispose()

            
#             for key in self.volumeDict['comics_in_folder'].keys():
#                 if self.volumeDict['comics_in_folder'][key].issue_number == per_issue_response['issue_number'] and self.volumeDict['comics_in_folder'][key].comic_initiated == False:
#                     self.volumeDict['comics_in_folder'][key].comic_metadata = per_issue_response
#                     self.volumeDict['comics_in_folder'][key].comic_initiated = True

In [5]:
# Run the matching pipeline for one volume folder: parse the local files,
# query ComicVine, score the results, then ask the user to confirm the pick.
vol1 = VolumeParser('./data/Infinity Gauntlet')
vol1.parseVolume()
vol1.searchQuery(api_key)
vol1.findBestSearch()
vol1.confirmResult()
# The metadata fetch is left disabled here; it is run in a separate cell below.
#vol1.fetchVolumeMetadata()

In [6]:
# Display the search result that was selected for vol1.
vol1.volumeDict['best_search']

{'aliases': None,
 'api_detail_url': 'https://comicvine.gamespot.com/api/volume/4050-4596/',
 'count_of_issues': 6,
 'date_added': '2008-06-06 11:08:19',
 'date_last_updated': '2019-04-04 02:35:48',
 'deck': None,
 'description': '<p>A six issue Limited Series that started the biggest cosmic event in the Marvel Universe! After the events of <a href="/the-thanos-quest/4050-4425/" data-ref-id="4050-4425"> The Thanos Quest</a>, Thanos the Mad Titan has come into possession of the Infinity Gems and wipes out half of the Universe in the blink of an eye for his romantic interest Death. It is up to the remaining heroes of earth and from beyond to stop the mad titan. This series contains the core story of the Infinity Gauntlet crossover event.</p><p>The monthly <a href="/warlock-and-the-infinity-watch/4050-4818/" data-ref-id="4050-4818"> Warlock and the Infinity Watch</a> spun right out of this mini-series.</p><p>This was the first part of the Infinity trilogy. It was continued in <a href="/th

In [7]:
# Fetch the detailed volume record and its cover image, and build the Volume row.
vol1.fetchVolumeMetadata()

In [8]:
# Fetch metadata and covers for every issue of the volume (shows a tqdm progress bar).
vol1.fetchComicsMetadata(api_key)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




In [9]:
# Persist vol1.volumeTable using the default dbpath ('sqlite:///comicdb.db').
vol1.commit_to_db()

In [10]:
# Map every folder under ./data that holds at least one .cbr/.cbz file to the
# list of all file names in that folder.
subs = {
    subdir: files
    for subdir, _dirs, files in os.walk('./data')
    if any(fname.endswith(('cbr', 'cbz')) for fname in files)
}

In [15]:
# List the folders that were found to contain comic archives.
list(subs.keys())
#subs['./data/Thor/Thor Vol. 2 (1998-2004)/Issues']

['./data/Thor/Thor Vol. 2 (1998-2004)/Issues',
 './data/Hulk/Incredible Hulk 600-611 (2009-2010)/Issues',
 './data/Infinity Gauntlet/TPB',
 './data/Infinity Gauntlet/Infinity Gauntlet Aftermath/TPB',
 './data/Infinity Gauntlet/Issues',
 './data/Age of Ultron/Issues',
 './data/Marvel Graphic Novel  (01 - 05)',
 './data/Venom (Vol. 2)',
 './data/Inhumans/Inhumans (01-12) (1998-1999)']

In [28]:
# Inspect everything collected for vol1 so far (parsed comics, volume name, best search result).
vol1.volumeDict

{'comics_in_folder': {'Infinity Gauntlet 005 (1992) (Digital) (Zone-Empire)': <__main__.ComicParser at 0x11c4965c0>,
  'Infinity Gauntlet 003 (1991) (Digital) (Zone-Empire)': <__main__.ComicParser at 0x11c496358>,
  'Infinity Gauntlet 002 (1991) (Digital) (Zone-Empire)': <__main__.ComicParser at 0x11c496320>,
  'Infinity Gauntlet 004 (1991) (Digital) (Zone-Empire)': <__main__.ComicParser at 0x11c496630>,
  'Infinity Gauntlet 001 (1991) (Digital) (Zone-Empire)': <__main__.ComicParser at 0x11c4963c8>,
  'Infinity Gauntlet 006 (1992) (Digital) (Zone-Empire)': <__main__.ComicParser at 0x11c496080>},
 'from_file_volume_name': 'Infinity Gauntlet',
 'best_search': {'aliases': None,
  'api_detail_url': 'https://comicvine.gamespot.com/api/volume/4050-4596/',
  'count_of_issues': 6,
  'date_added': '2008-06-06 11:08:19',
  'date_last_updated': '2019-04-04 02:35:48',
  'deck': None,
  'description': '<p>A six issue Limited Series that started the biggest cosmic event in the Marvel Universe! After

In [12]:
# Library root as a Path; the folder scan cell calls mainp.rglob on it.
mainp = Path('./data')

In [18]:
# Distinct folders that directly contain at least one .cbr or .cbz file.
folders = list(set(
    archive.parent
    for pattern in ('*.cbr', '*.cbz')
    for archive in mainp.rglob(pattern)
))
folders

[PosixPath('data/Age of Ultron/Issues'),
 PosixPath('data/Hulk/Incredible Hulk 600-611 (2009-2010)/Issues'),
 PosixPath('data/Infinity Gauntlet/Infinity Gauntlet Aftermath/TPB'),
 PosixPath('data/Inhumans/Inhumans (01-12) (1998-1999)'),
 PosixPath('data/Thor/Thor Vol. 2 (1998-2004)/Issues'),
 PosixPath('data/Venom (Vol. 2)'),
 PosixPath('data/Infinity Gauntlet/TPB'),
 PosixPath('data/Marvel Graphic Novel  (01 - 05)'),
 PosixPath('data/Infinity Gauntlet/Issues')]

In [27]:
# Show the name of the first entry globbed from the first comic folder.
list(folders[0].glob('*'))[0].name

'Age of Ultron 009 (2013) (Digital) (Zone-Empire).cbr'

In [14]:
# Folders whose name contains 'Issue' or 'TPB' are containers, so the volume
# name is taken from their parent; otherwise the folder name itself is used.
to_search = [
    folder.parent.name if ('Issue' in folder.name or 'TPB' in folder.name) else folder.name
    for folder in folders
]
to_search

['Age of Ultron',
 'Incredible Hulk 600-611 (2009-2010)',
 'Infinity Gauntlet Aftermath',
 'Inhumans (01-12) (1998-1999)',
 'Thor Vol. 2 (1998-2004)',
 'Venom (Vol. 2)',
 'Infinity Gauntlet',
 'Marvel Graphic Novel  (01 - 05)',
 'Infinity Gauntlet']