In [38]:
"""
The approach must follow one of two tracks.
1) Either do it volume wise, from which we identify issues and start matching individual file with it. 
   
   Backbone : Dependency on folder names.
   Benefits : If incomplete number of records there, suggest filling the gaps.
   Cons     : Individual comics can't be easily tagged.
              Everything must be in folders and structured to some extent. 
              
2) We identify each book, and associate it with a volume. Metadata includes a volume entry. This way, we should start with :
   Pick a book from the folder. Retrieve metadata. Identify the volume. Pull issues of the rest.
   
   Backbone : Dependency on individual file names. 
   Benefits : Not so much.
   Cons     : Need not be the right comic. Rest of the actions heavy load. 
"""

import patoolib
from comsear import ComicVineClient
import pprint
import functools
import requests
import json
import re
pp = pprint.PrettyPrinter()
import os
import datetime
import numpy as np
# SECURITY: an API key committed to a notebook is readable by everyone who can
# see the file. Supply it through the COMICVINE_API_KEY environment variable
# instead; the literal is kept only as a fallback so existing runs keep working
# and should be revoked and removed.
api_key = os.environ.get('COMICVINE_API_KEY', 'be9301c9c1770a0c729635a06a4513ad9d95410c')
cv = ComicVineClient(api_key)  # shared client used for Comic Vine searches
root = './data/'  # default folder scanned by the directory-walking helpers


In [68]:
class Comic():
    """A single comic file whose name encodes a year and an issue number.

    Only ``comic_path`` is meaningful after construction; ``comic_name``,
    ``comic_year`` and ``issue_number`` are filled in by :meth:`parseComic`.
    """

    # "(1999" -> "1999": four digits starting with 1 or 2 right after "(".
    YEAR_PATTERN = re.compile(r'\(([12]\d{3})')
    # Digits, optionally with a decimal part (19.1) or up to three trailing
    # letters (19INH), located either before "(of" or before a "(YYYY)" year.
    ISSUE_PATTERN = re.compile(r"(\d+\.?\d?[a-zA-Z]{0,3}?)(?:\s*\(of|\s*\([12]\d{3}\))")

    def __init__(self, path):
        """Remember ``path``; nothing is parsed until :meth:`parseComic` runs."""
        self.comic_name = None
        self.comic_year = None
        self.issue_number = None
        self.comic_path = path
        self.comic_metadata = {}
        self.comic_initiated = False

    def parseComic(self):
        """Derive name, year and issue number from the file name.

        Sets:
            comic_name: file name without directory and extension.
            comic_year: first "(YYYY" year in the name as an int, or 0 when
                the name contains none.
            issue_number: first issue-number match in the name as a string
                (leading zeros kept, e.g. "006"), or None when there is none.
        """
        self.comic_name = os.path.splitext(os.path.basename(self.comic_path))[0]

        # search() instead of findall()[0]: a name without a year must yield 0,
        # not an IndexError.
        year_match = self.YEAR_PATTERN.search(self.comic_name)
        self.comic_year = int(year_match.group(1)) if year_match else 0

        # Match against the file name only, so that a parent folder such as
        # "Winter Soldier 01 (2013)" cannot contribute an issue number.
        issue_match = self.ISSUE_PATTERN.search(self.comic_name)
        self.issue_number = issue_match.group(1) if issue_match else None

In [81]:
class Volume:
    """A folder of comic files treated as one Comic Vine volume.

    Typical use: ``parseVolume()`` to read the folder, ``fetchVolumeMetadata()``
    to pick the best-matching volume from a Comic Vine search, then
    ``fetchComicsMetadata()`` to attach per-issue metadata to each comic.
    Everything gathered is stored in ``volumeDict``.
    """

    # Class-level constant (the original ``self.HEADERS = ...`` in the class
    # body raised NameError because ``self`` does not exist there).
    HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:7.0) '
                             'Gecko/20130825 Firefox/36.0'}

    def __init__(self, path):
        """Remember the folder ``path``; nothing is read until parseVolume()."""
        self.volume_name = None
        self.type = None  # TPB or Single Issues
        self.volume_path = path
        self.volume_initiated = False
        self.volumeDict = {'comics_in_folder': {}}
        self.volumeFetch = True

    @staticmethod
    def _request_params(api_key):
        """Query parameters sent with every detail request."""
        return {'api_key': api_key, 'format': 'json'}

    @staticmethod
    def _normalise_issue_number(raw):
        """Reduce an issue number to a comparable string, or None if absent.

        Accepts a string, a number, or a list/tuple of matches (first element
        is used). Case is folded and leading zeros are dropped, so "006" and
        "6" compare equal, as do "019.3" and "19.3".
        """
        if isinstance(raw, (list, tuple)):
            raw = raw[0] if raw else None
        if raw is None:
            return None
        text = str(raw).strip().lower()
        if not text:
            return None
        if text[0].isdigit():
            trimmed = text.lstrip('0')
            # Keep one zero when stripping removed the whole integer part
            # ("0" -> "0", "0.5" -> "0.5").
            if not trimmed or not trimmed[0].isdigit():
                trimmed = '0' + trimmed
            return trimmed
        return text

    @staticmethod
    def _pick_best_match(results, expected_num_comics, years):
        """Score every search result in place and return the best one.

        A result earns 20 points when its 'count_of_issues' equals
        ``expected_num_comics`` and 20 more when its 'start_year' lies within
        the range of ``years``. Years equal to 0 (no year found in the file
        name) are ignored. Ties keep the earliest result, which assumes the
        search results arrive sorted by relevance. Returns None for an empty
        result list.
        """
        known_years = [year for year in years if year]
        best = None
        for result in results:
            score = 0.0
            if expected_num_comics == result.get('count_of_issues'):
                score += 20.0
            try:
                start_year = int(result.get('start_year'))
            except (TypeError, ValueError):
                start_year = None  # missing or non-numeric start year
            if (known_years and start_year is not None
                    and min(known_years) <= start_year <= max(known_years)):
                score += 20.0
            result['match_score'] = score
            if best is None or score > best['match_score']:
                best = result
        return best

    def parseVolume(self):
        """Read the folder: derive the volume name and parse every comic in it.

        The volume name is the folder name up to the first " (". Each file
        becomes a parsed ``Comic`` stored under
        ``volumeDict['comics_in_folder'][comic_name]``. '.DS_Store' and
        sub-directories are skipped.
        """
        # normpath so a trailing separator does not produce an empty basename.
        folder = os.path.basename(os.path.normpath(self.volume_path))
        self.volume_name = folder.split(' (')[0]
        self.volumeDict['from_file_volume_name'] = self.volume_name
        for entry in sorted(os.listdir(self.volume_path)):
            entry_path = os.path.join(self.volume_path, entry)
            if entry == '.DS_Store' or os.path.isdir(entry_path):
                continue
            comic = Comic(entry_path)
            comic.parseComic()
            self.volumeDict['comics_in_folder'][comic.comic_name] = comic

        print(str(len(self.volumeDict['comics_in_folder']))+' comics added in volume ' + self.volume_name +'.')

    def fetchVolumeMetadata(self, api_key):
        """
        Finding best search among Volumes. 2 main information considered
        here are number of files in folder (which is assumed to be
        total number of issues in that volume) and year range obtained
        from comic file names.

        Stores the chosen search result in ``volumeDict['best_search']`` and
        the 'results' of its detail endpoint in ``volumeDict['detailed_meta']``.
        Requires parseVolume() to have run. The search itself goes through the
        module-level ``cv`` client; ``api_key`` is used for the detail request.

        Raises:
            LookupError: the search returned no volumes.
            requests.HTTPError: the detail request returned an error status.
        """
        print("Fetching from Comic Vine servers")
        firstresponse = cv.search(self.volumeDict['from_file_volume_name'], resources=['volume'])
        returned = list(firstresponse.results or [])
        if not returned:
            raise LookupError('No volumes found for ' + repr(self.volumeDict['from_file_volume_name']))
        print("Finding best match")
        comics = self.volumeDict['comics_in_folder']
        years = [comic.comic_year for comic in comics.values()]
        best = self._pick_best_match(returned, len(comics), years)
        self.volumeDict['best_search'] = best
        detailresponse = requests.get(best['api_detail_url'], headers=self.HEADERS,
                                      params=self._request_params(api_key))
        detailresponse.raise_for_status()
        fetched_volume = detailresponse.json()
        self.volumeDict['detailed_meta'] = fetched_volume['results']
        self.volume_initiated = True
        print('Match Score : '+ str(best['match_score']))

    def fetchComicsMetadata(self, api_key):
        """Fetch every issue of the matched volume and attach it to its comic.

        For each issue listed in ``volumeDict['detailed_meta']['issues']`` the
        detail endpoint is requested, and the response is stored in
        ``comic_metadata`` of every comic in the folder whose issue number
        matches (compared after normalisation, so "006" matches "6").
        Requires fetchVolumeMetadata() to have run.

        Raises:
            KeyError: volume metadata has not been fetched yet.
            requests.HTTPError: an issue request returned an error status.
        """
        # Index the local comics once instead of rescanning them per issue.
        comics_by_issue = {}
        for comic in self.volumeDict['comics_in_folder'].values():
            key = self._normalise_issue_number(comic.issue_number)
            if key is not None:
                comics_by_issue.setdefault(key, []).append(comic)

        params = self._request_params(api_key)
        for each_issue in self.volumeDict['detailed_meta']['issues']:
            response = requests.get(each_issue['api_detail_url'], headers=self.HEADERS, params=params)
            response.raise_for_status()
            per_issue_response = response.json()['results']
            key = self._normalise_issue_number(per_issue_response['issue_number'])
            for comic in comics_by_issue.get(key, ()):
                comic.comic_metadata = per_issue_response
            
            
        
        

In [82]:
# Build a Volume for the Inhumans folder, parse its file names, then look the
# volume up on Comic Vine (network call). `parser1` is reused by later cells.
parser1 = Volume('./data/Inhumans (01-12) (1998-1999)')
parser1.parseVolume()
parser1.fetchVolumeMetadata(api_key)

12 comics added in volume Inhumans.
Fetching from Comic Vine servers
Finding best match
Match Score : 40.0


In [141]:
# Try the issue-number pattern on a handful of file-name fragments and print
# each extracted issue number left-padded with zeros to ten characters.
s = "04iNh (of 04) 05a (of 04) 03(of 04) Winter Soldier 019.3 (2013) (Digital) (Fawkes-Empire) Young Avengers Presents 06 (of 06) (2008) (Digital) (Zone-Empire)"
issue_pattern = re.compile(r"(\d+\.?\d?[a-zA-Z]{0,3}?)(?:\s*\(of|\s*\([12]\d{3}\))")
slist = issue_pattern.findall(s)
for issue in slist:
    # Every match starts with a digit, so zero-filling on the left is the
    # same as zfill here.
    print(issue.rjust(10, '0'))

0000004iNh
000000005a
0000000003
00000019.3
0000000006


In [97]:
# One-off probe of a single Comic Vine issue endpoint to inspect its payload.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:7.0) Gecko/20130825 Firefox/36.0',
}
params = dict(api_key=api_key, format='json')
issue_url = 'https://comicvine.gamespot.com/api/issue/4000-45868/'
per_issue_response = requests.get(issue_url, params=params, headers=HEADERS)
pp.pprint(per_issue_response.json()['results'])

{'aliases': None,
 'api_detail_url': 'https://comicvine.gamespot.com/api/issue/4000-45868/',
 'character_credits': [{'api_detail_url': 'https://comicvine.gamespot.com/api/character/4005-23075/',
                        'id': 23075,
                        'name': 'Bixby',
                        'site_detail_url': 'https://comicvine.gamespot.com/bixby/4005-23075/'},
                       {'api_detail_url': 'https://comicvine.gamespot.com/api/character/4005-4329/',
                        'id': 4329,
                        'name': 'Black Bolt',
                        'site_detail_url': 'https://comicvine.gamespot.com/black-bolt/4005-4329/'},
                       {'api_detail_url': 'https://comicvine.gamespot.com/api/character/4005-2115/',
                        'id': 2115,
                        'name': 'Crystal',
                        'site_detail_url': 'https://comicvine.gamespot.com/crystal/4005-2115/'},
                       {'api_detail_url': 'https://comicvine.gamespot.c

In [90]:
# Collect the detail-endpoint URL of every issue listed in the volume metadata
# fetched for parser1.
issues_urls = [
    issue['api_detail_url']
    for issue in parser1.volumeDict['detailed_meta']['issues']
]

pp.pprint(issues_urls)

['https://comicvine.gamespot.com/api/issue/4000-45868/',
 'https://comicvine.gamespot.com/api/issue/4000-98504/',
 'https://comicvine.gamespot.com/api/issue/4000-98505/',
 'https://comicvine.gamespot.com/api/issue/4000-98506/',
 'https://comicvine.gamespot.com/api/issue/4000-98507/',
 'https://comicvine.gamespot.com/api/issue/4000-98508/',
 'https://comicvine.gamespot.com/api/issue/4000-98509/',
 'https://comicvine.gamespot.com/api/issue/4000-98510/',
 'https://comicvine.gamespot.com/api/issue/4000-98511/',
 'https://comicvine.gamespot.com/api/issue/4000-98512/',
 'https://comicvine.gamespot.com/api/issue/4000-98513/',
 'https://comicvine.gamespot.com/api/issue/4000-98514/']


In [145]:
from PIL import Image
import pytesseract

In [80]:
# NOTE(review): `my_dict` is not defined in any cell visible here; unless it is
# defined elsewhere, this call relies on leftover kernel state and will fail
# with NameError on a fresh run.
my_dict(parser1)

{'volume_name': 'Inhumans',
 'type': None,
 'volume_path': './data/Inhumans (01-12) (1998-1999)',
 'volume_initiated': True,
 'volumeDict': {'comics_in_folder': {'Inhumans 006 (1999)': <__main__.Comic at 0x11ac280f0>,
   'Inhumans 012 (1999)': <__main__.Comic at 0x11ac25668>,
   'Inhumans 003 (1999)': <__main__.Comic at 0x11ac259b0>,
   'Inhumans 001 (1998)': <__main__.Comic at 0x11ac25dd8>,
   'Inhumans 004 (1999)': <__main__.Comic at 0x11ac257b8>,
   'Inhumans 009 (1999)': <__main__.Comic at 0x11ac25e48>,
   'Inhumans 010 (1999)': <__main__.Comic at 0x11ac25b70>,
   'Inhumans 007 (1999)': <__main__.Comic at 0x11ac25828>,
   'Inhumans 002 (1998)': <__main__.Comic at 0x11ac25550>,
   'Inhumans 011 (1999)': <__main__.Comic at 0x11ac25588>,
   'Inhumans 005 (1999)': <__main__.Comic at 0x11ac255f8>,
   'Inhumans 008 (1999)': <__main__.Comic at 0x11ac255c0>},
  'from_file_volume_name': 'Inhumans',
  'best_search': {'aliases': None,
   'api_detail_url': 'https://comicvine.gamespot.com/api/v

In [None]:
#Different ways of walking through sub-directories and making a dictionary out of folder and file names
def complete_manual_try(rootdir=None):
    """Print the comic file names and volume folder names found under a tree.

    Args:
        rootdir: directory to walk; defaults to the module-level ``root``.

    File names are printed without their extension and folder names are cut at
    the first " (". '.DS_Store' entries are ignored. Returns None.
    """
    if rootdir is None:
        rootdir = root
    comicFiles = []
    volumeFolders = []
    for path, dirs, files in os.walk(rootdir): #Get folder names and Comic names
        # splitext rather than split('.')[0]: issue numbers such as "019.3"
        # contain a dot and must not truncate the name.
        comicFiles.extend(os.path.splitext(name)[0] for name in files if name != '.DS_Store')
        # Split on " (" like the other folder-name parsers in this notebook,
        # so the name carries no trailing space.
        volumeFolders.extend(them.split(' (')[0] for them in dirs if them != '.DS_Store')
    print('Comic Files : ' + str(comicFiles))
    print('Volume Folders : '+str(volumeFolders))

def get_directory_structure(rootdir):
    """
    Build a nested dictionary mirroring the folder tree below rootdir.

    The result has one top-level key, the last path component of rootdir.
    Every directory maps to a dict of its entries and every file maps to None.
    """
    tree = {}
    rootdir = rootdir.rstrip(os.sep)
    # Offset of rootdir's last component, so keys start at that component.
    prefix_len = rootdir.rfind(os.sep) + 1
    for current, _subdirs, filenames in os.walk(rootdir):
        parts = current[prefix_len:].split(os.sep)
        # os.walk is top-down, so every ancestor is already in the tree.
        parent = tree
        for ancestor in parts[:-1]:
            parent = parent.get(ancestor)
        parent[parts[-1]] = dict.fromkeys(filenames)
    return tree

def get_one_level(path):
    """Describe ``path`` recursively.

    A directory becomes a dict mapping each entry name (except '.DS_Store')
    to its own description; anything else becomes its size from
    ``os.path.getsize``.
    """
    if not os.path.isdir(path):
        return os.path.getsize(path)
    return {
        name: get_one_level(os.path.join(path, name))
        for name in os.listdir(path)
        if name != '.DS_Store'
    }

def get_manually(path):
    """Map each volume folder under ``path`` to the comics it contains.

    Returns a dict of the form
    ``{folder_name: {'comics_list': {comic_name: year}}}`` where

    * ``folder_name`` is the sub-folder name up to the first " (" (two folders
      sharing that prefix overwrite each other),
    * ``comic_name`` is the file name without its extension,
    * ``year`` is the first "(YYYY" year in the file name as a 4-character
      string, or the integer 0 when the name contains none.

    '.DS_Store' entries and top-level entries that are not directories are
    skipped. A ``path`` that is not a directory yields an empty dict.
    """
    volumes = {}
    if not os.path.isdir(path):
        # Previously fell through to `return d` with d unbound.
        return volumes

    for name in os.listdir(path):
        volume_dir = os.path.join(path, name)
        if name == '.DS_Store' or not os.path.isdir(volume_dir):
            continue
        folder_name = name.split(' (')[0]

        filesDict = {}
        for entry in os.listdir(volume_dir):
            if entry == '.DS_Store':
                continue
            comicName = os.path.splitext(os.path.basename(entry))[0]
            # Same year pattern as Comic.parseComic. Slicing between the first
            # "(" and ")" missed the year in names like "06 (of 06) (2008)".
            year_match = re.search(r'\(([12]\d{3})', comicName)
            filesDict[comicName] = year_match.group(1) if year_match else 0

        volumes[folder_name] = {'comics_list': filesDict}
    return volumes

In [None]:
# Scan the data folder into {volume folder name: {'comics_list': {...}}};
# later cells read and modify `neatdict`.
neatdict = get_manually(root)
#pp.pprint(neatdict)

In [None]:
# #Fetch metadata from Comic Vine server
# for volumes in list(neatdict.keys()):        
#     response = cv.search(volumes, resources=['volume'])
#     neatdict[volumes]['returned'] = response.results

In [None]:

# for volumes in list(neatdict.keys()):
#     expected_num_comics = len(neatdict[volumes]['comics_list'])
#     years = list(map(int, neatdict[volumes]['comics_list'].values()))
#     print('Volumes currently being scrapped : '+volumes)
#     best_index = 0
#     for indx, each_result in enumerate(neatdict[volumes]['returned']):    
#         each_result['match_score'] = 0.0
#         if expected_num_comics == each_result['count_of_issues']:
#             each_result['match_score'] += 20.0        
#         if (min(years)<=int(each_result['start_year'])<=max(years)):
#             each_result['match_score'] +=20.0
#         #print(each_result['match_score'])
#         if each_result['match_score']>np.array(neatdict[volumes]['returned'])[best_index]['match_score']: #Also assumed that search results are coming back sorted.
#             best_index = indx
#         neatdict[volumes]['best_search']=np.array(neatdict[volumes]['returned'])[best_index]
        
    #del neatdict[volumes]['returned'] 

In [None]:
# Drop the raw search results from every volume and show which keys remain.
for volumes in list(neatdict.keys()):
    print('\n\n'+volumes)
    # The original `del neatdict[volumes]` removed the whole entry, so the
    # lookup on the next line always raised KeyError. Only the 'returned'
    # search results are removed here, presumably the intent given the
    # commented-out `del neatdict[volumes]['returned']` in the cell above.
    # pop() tolerates volumes that have no such key.
    neatdict[volumes].pop('returned', None)
    pp.pprint(neatdict[volumes].keys())

In [None]:
# NOTE(review): 'best_search' is only written by the commented-out scoring
# cell, so with the visible cells alone this lookup presumably raises
# KeyError. Confirm before relying on it.
pp.pprint(neatdict['Inhumans']['best_search'])

In [None]:
import requests
# Fetch the full record of one known volume to inspect what the volume detail
# endpoint returns.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:7.0) Gecko/20130825 Firefox/36.0',
}
api_url = 'https://comicvine.gamespot.com/api/volume/4050-24715/'
params = dict(api_key=api_key, format='json')

issueresponse = requests.get(api_url, params=params, headers=HEADERS)
fetched_issues = issueresponse.json()
pp.pprint(fetched_issues['results'])

In [None]:
# Fetch one known issue to inspect the issue detail payload; relies on HEADERS
# defined in an earlier cell.
issue_api_url = 'https://comicvine.gamespot.com/api/issue/4000-45868/'
issue_params = dict(api_key=api_key, format='json')
per_issue_response = requests.get(issue_api_url, params=issue_params, headers=HEADERS)
per_issue_json = per_issue_response.json()
pp.pprint(per_issue_json['results'])

In [None]:
# Print a compact summary list for every 'volume' hit in each stored search
# response. NOTE(review): `storresponses` is not defined in any visible cell.
for each_comic in storresponses:
    print('\nComic : '+each_comic)

    for each_search_result in storresponses[each_comic].results:
        if each_search_result['resource_type'] != 'volume':
            continue
        # Named `summary`, not `params`: the old name rebound the module-level
        # request-parameter dict that other cells pass to requests.get().
        summary = [each_search_result['resource_type']]
        # Optional fields, appended in this order when present.
        for key in ('start_year', 'count_of_issues', 'volume', 'name', 'id', 'issue_number', 'deck'):
            if key in each_search_result:
                value = each_search_result[key]
                # For 'volume' only the nested name is of interest.
                summary.append(value['name'] if key == 'volume' else value)
        print(summary)

In [None]:
correct_searches = {}
# Should consist of 3 keys: name, API results and a confidence score (0-100).
# Confidence can be generated from how well the result matches the file name,
# specifically title, issue number and cover date.

# Print a compact summary list for every 'issue' hit in each search response.
# NOTE(review): `responses` is not defined in any visible cell.
for each_comic in responses:
    print('\nComic : '+each_comic)

    for each_search_result in responses[each_comic].results:
        if each_search_result['resource_type'] != 'issue':
            continue
        # Named `summary`, not `params`: the old name rebound the module-level
        # request-parameter dict that other cells pass to requests.get().
        summary = [each_search_result['resource_type']]

        raw_cover_date = each_search_result.get('cover_date')
        if raw_cover_date:
            # Only the year of the cover date is reported.
            summary.append(datetime.datetime.strptime(raw_cover_date, '%Y-%m-%d').year)
        if 'volume' in each_search_result:
            summary.append(each_search_result['volume']['name'])
        # 'name' and 'issue_number' were commented out of this summary in the
        # original loop and are still not collected.
        print(summary)


In [3]:
from unrar import rarfile

LookupError: Couldn't find path to unrar library.

In [None]:
from pprint import PrettyPrinter as pp

In [None]:
class ReadingList(object):
    """A titled, ordered list of comic titles."""

    def __init__(self, title="Untitled List"):
        """Create an empty list called ``title``."""
        self.reading_list = []
        # The original evaluated `self.title` without assigning it, which
        # raised AttributeError on construction.
        self.title = title

    def add_entry(self, ComicObject):
        """Append the title of ``ComicObject`` to the reading list.

        Uses ``ComicObject.title`` when present; otherwise falls back to
        ``comic_name``, the attribute the ``Comic`` class in this notebook
        exposes.

        Raises:
            AttributeError: the object has neither attribute.
        """
        entry_title = getattr(ComicObject, 'title', None)
        if entry_title is None:
            entry_title = ComicObject.comic_name
        self.reading_list.append(entry_title)
    