In [1]:
from whoosh.index import create_in
from whoosh.fields import *
import os
import re
import json
import requests

## Index

In [2]:
import numpy as np
import datetime as dt

# Root directory that later cells walk (os.walk) to find the downloaded files.
wd = 'downloads'

In [3]:
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    The match is non-greedy and does not cross line breaks, so a ``<`` whose
    closing ``>`` is on a later line (or is missing) is left untouched.
    """
    return re.sub('<.*?>', '', raw_html)

In [4]:
def cleanAutogenSubtitles(subtitles):
    """Filter the duplicated / tag-annotated cues out of an auto-generated track.

    Parameters
    ----------
    subtitles : list of str
        Caption blocks, i.e. the file text split on blank lines.

    Returns
    -------
    list
        * The input list, unchanged, when it has fewer than three blocks, when
          the third block does not look auto-generated, or when a block cannot
          be processed.
        * Otherwise a new list of ``[timing, text]`` two-item lists. Note this
          is a different element type from the unchanged case (plain strings).

    The third block (index 2) is used as a probe: if the text after ``' --> '``
    on its first line is longer than 12 characters the track is treated as
    auto-generated (presumably because cue settings follow the 12-character
    ``HH:MM:SS.mmm`` end time -- TODO confirm against real files).
    """
    # The probe needs a third block; short or empty inputs are passed through
    # instead of raising IndexError.
    if len(subtitles) < 3:
        return subtitles

    try:
        probe = [item for item in subtitles[2].split('\n') if item != ' ']
        if len(probe[0].split(' --> ')[1]) <= 12:
            return subtitles

        cleaned = []
        for block in subtitles:
            # Lines consisting of a single space are dropped.
            lines = [item for item in block.split('\n') if item != ' ']
            # Keep only the first 29 characters of the timing line (presumably
            # "HH:MM:SS.mmm --> HH:MM:SS.mmm" without trailing cue settings).
            lines[0] = lines[0][:29]
            # Keep blocks that are exactly timing + one text line, and whose
            # text line contains no <...> tags (stripping tags changes nothing).
            if len(lines) == 2 and len(cleanhtml(lines[1])) == len(lines[1]):
                cleaned.append(lines)
        return cleaned
    except (IndexError, AttributeError, TypeError):
        # Malformed block (no lines, no ' --> ', non-string): fall back to the
        # untouched input rather than returning a partial result.
        return subtitles

In [5]:
def pushCaption(data, user, pwd, base_url='http://127.0.0.1:8000/', timeout=None):
    """POST one caption record to the ``captions/`` endpoint.

    Parameters
    ----------
    data : dict
        Record to send; passed to ``requests.post`` as the JSON body.
    user, pwd : str
        Credentials, passed to ``requests.post`` as the ``auth`` pair.
    base_url : str, optional
        Server root including the trailing slash. Defaults to the local
        address this function previously hard-coded.
    timeout : float or None, optional
        Forwarded to ``requests.post``. ``None`` (the default) applies no
        timeout, which matches the previous behaviour.

    Returns
    -------
    The decoded JSON body when the response body is valid JSON; otherwise the
    ``requests.Response`` itself, which is also printed.
    """
    location = 'captions'
    url = base_url + location + '/'
    r = requests.post(url, json=data, auth=(user, pwd), timeout=timeout)
    try:
        return r.json()
    except ValueError:
        # Response.json() signals an undecodable body with a ValueError
        # subclass; show the response (status line) and hand it back.
        print(r)
        return r

### Gather Data

In [6]:
# Collect every file below `wd` as a (directory, file name) pair.
subdir_ls = []
for dirpath, _dirnames, filenames in os.walk(wd):
    subdir_ls.extend((dirpath, filename) for filename in filenames)

In [7]:
# Part 0: build a dict of per-video content, keyed by a label derived from the file name.
#
# Data structure:
# {
#     <file_label>: {
#         captions:    [(start + '\t' + end, lower-cased cue text), ...]
#                      (or [''] when the caption file was empty),
#         description: [file path],
#         thumbnail:   [file path],
#         uploader:    [string],
#         id:          [string]
#     }
# }

file_content_dict = {}
total_files = len(subdir_ls)
for i, file_snippet in enumerate(subdir_ls):
    print(str(i+1)+' of '+str(total_files), end='\r')

    dirpath, file = file_snippet
    # Files are read from the directory os.walk reported them in, so entries
    # found in nested sub-directories resolve correctly.
    file_path = dirpath + '/' + file

    # Strip the extension to get the grouping label: `.vtt` and `.json` names
    # lose their last two dot-separated parts, every other file loses one.
    # NOTE(review): the remaining parts are re-joined with an apostrophe, not
    # '.', so dots inside a name become apostrophes in the label. Left as-is
    # because the label is the key later cells look up -- confirm intent.
    name_parts = file.split('.')
    if name_parts[-1] == 'vtt' or name_parts[-1] == 'json':
        file_label = "'".join(name_parts[0:-2])
    else:
        file_label = "'".join(name_parts[0:-1])

    entry = file_content_dict.setdefault(file_label, {})

    # captions
    if file[-4:] == '.vtt':
        with open(file_path, "r", encoding='utf-8') as caption_file:
            raw_cues = caption_file.read().split('\n\n')

        # cleanAutogenSubtitles inspects the third block, so files with fewer
        # blocks are passed through untouched.
        cues = cleanAutogenSubtitles(raw_cues) if len(raw_cues) > 2 else raw_cues

        if cues != ['']:
            parsed_cues = []
            for cue in cues:
                if len(cue) == 0:
                    continue
                # A cue is either a raw block of text or, for auto-generated
                # tracks, an already-split [timing, text] list.
                cue_lines = cue if isinstance(cue, list) else cue.split('\n')
                timing = cue_lines[0].split(' --> ')
                if len(timing) < 2:
                    # Not a "start --> end" block (e.g. the file header).
                    continue
                label = timing[0] + '\t' + timing[1]
                parsed_cues.append((label, ' '.join(cue_lines[1:]).lower()))
            cues = parsed_cues
        entry['captions'] = cues

    # thumbnail
    elif file[-4:] == '.jpg':
        entry['thumbnail'] = file_path

    # description
    elif file[-12:] == '.description':
        entry['description'] = file_path

    # additional info
    elif file[-10:] == '.info.json':
        with open(file_path, "r", encoding='utf-8') as info_file:
            contents = json.load(info_file)
        entry['uploader'] = contents['uploader']
        entry['id'] = contents['id']
        

23029 of 23029

### Create a new index

In [14]:
# Index layout: `title` and `path` are stored with each document; `content`
# is a TEXT field declared without stored=True.
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
# exist_ok makes directory creation a single race-free call instead of a
# separate exists-check followed by mkdir.
os.makedirs("indexdir", exist_ok=True)
ix = create_in("indexdir", schema)

In [9]:
# Push every video that has usable captions to the captions API.
#
# Credentials are read from the environment when available.
# NOTE(review): the fallback values are the credentials that used to be
# hard-coded in this cell; remove them once CAPTIONS_API_USER and
# CAPTIONS_API_PWD are set wherever this notebook runs.
# NOTE(review): nothing in this cell writes to the whoosh index `ix`.
api_user = os.environ.get('CAPTIONS_API_USER', 'admin')
api_pwd = os.environ.get('CAPTIONS_API_PWD', 'testcase')

length = len(file_content_dict)
for i, item in enumerate(file_content_dict):
    print(f'{i+1} of {length}', end='\r')
    entry = file_content_dict[item]

    # Concatenate the text of every cue, each followed by a space. Entries
    # with no 'captions' key, or whose captions are not (label, text) pairs
    # (e.g. ['']), are skipped.
    try:
        captions = ''.join(caption[1] + ' ' for caption in entry['captions'])
    except (KeyError, IndexError):
        continue

    try:
        data = {
            'title': item,
            'uploader': entry['uploader'],
            'videoid': entry['id'],
            'thumbnail': entry['thumbnail'],
            'description': entry['description'],
            'captions': captions
        }
        pushCaption(data, user=api_user, pwd=api_pwd)
    except Exception as e:
        # Best effort: report the problem (missing field, request failure)
        # and carry on with the next video.
        print(e)

5813 of 5813

### Open an existing index

In [1]:
import whoosh.index as index

# Open the on-disk index stored in the "indexdir" directory.
ix = index.open_dir("indexdir")

### Run Search

In [2]:
from whoosh.qparser import QueryParser

In [3]:
def toseconds(timepoint):
    """Convert a ``HH:MM:SS.mmm`` style timestamp to whole seconds.

    Everything from the first ``.`` onwards is discarded. The remaining
    colon-separated fields are read most-significant first, so ``MM:SS`` and
    bare ``SS`` values are accepted as well as ``HH:MM:SS``.

    Parameters
    ----------
    timepoint : str
        Timestamp such as ``'01:02:03.500'``.

    Returns
    -------
    str
        The number of seconds as a decimal string (e.g. ``'3723'``).

    Raises
    ------
    ValueError
        If a field is not an integer.
    """
    total = 0
    for field in timepoint.split('.')[0].split(':'):
        # Each step to the left is worth 60x more: an hour is 3600 seconds.
        total = total * 60 + int(field)
    return str(total)

def searchCorpus(q):
    """Search the index's ``content`` field and print matching caption excerpts.

    For each of the (at most 10) hits this shows the thumbnail when one was
    collected, prints the title without its last 12 characters, then prints up
    to five highlighted excerpts. Each excerpt is followed by a player link
    that starts at the excerpt's first complete caption line.

    Caption lines are expected as ``<start>\\t<end>\\t<text>``; lines carrying
    an extra leading file-name field (``<file>\\t<start>\\t<end>\\t<text>``)
    are accepted too, in which case the video id is sliced from the file name.

    Relies on notebook globals: ``ix``, ``file_content_dict``, ``QueryParser``,
    ``cleanhtml``, ``toseconds``, ``mpimg`` and ``plt``.
    NOTE(review): ``mpimg`` and ``plt`` are not imported anywhere in the
    visible notebook -- presumably ``matplotlib.image`` / ``matplotlib.pyplot``;
    make sure they are imported before this is called.

    Parameters
    ----------
    q : str
        Query string, parsed by ``QueryParser`` against ``content``.

    Returns
    -------
    The results object produced by ``searcher.search``.
    """
    with ix.searcher() as searcher:
        query = QueryParser('content', ix.schema).parse(q)
        results = searcher.search(query, limit=10)
        results.fragmenter.charlimit = None
        # Show more context before and after
        results.fragmenter.surround = 400
        for hit in results:
            title = hit['title']
            # A hit whose title is unknown to the in-memory dict still gets its
            # title printed, just without thumbnail or excerpts.
            entry = file_content_dict.get(title, {})

            thumbnail = entry.get('thumbnail')
            if thumbnail is not None:
                img = mpimg.imread(thumbnail)
                plt.imshow(img)
                plt.show()
            print(title[:-12] + '\n')

            # One "<label>\t<text>" line per caption; entries that are not
            # (label, text) pairs, such as the '' stored for an empty caption
            # file, are skipped.
            filecontents = ''.join(
                '\n' + caption[0] + '\t' + caption[1]
                for caption in entry.get('captions', [])
                if len(caption) >= 2
            )

            snippets = hit.highlights('content', text=filecontents, top=5).split('...')
            for snippet in snippets:
                # Drop the first and last line of each excerpt (presumably
                # because an excerpt can begin or end part-way through a line).
                surrounding_lines = cleanhtml(snippet).split('\n')[1:-1]
                if not surrounding_lines:
                    continue

                for line in surrounding_lines:
                    # The caption text is the last tab-separated field.
                    print('\t' + line.split('\t')[-1])

                first_fields = surrounding_lines[0].split('\t')
                if len(first_fields) >= 4:
                    # "<file>\t<start>\t<end>\t<text>"
                    video_id = first_fields[0][-18:-7]
                    start = first_fields[1]
                else:
                    # "<start>\t<end>\t<text>": the id comes from the metadata.
                    video_id = entry.get('id')
                    start = first_fields[0]

                try:
                    time_start = toseconds(start)
                except (ValueError, IndexError):
                    # The first field was not a timestamp; no link for this excerpt.
                    time_start = None

                if video_id and time_start is not None:
                    print('\thttps://app.chimeraeditor.com/player?privacy=private&v=' + video_id + '&t=' + time_start + '\n')

        return results

In [4]:
# Prompt for a query string and run it against the index; the value returned
# by searchCorpus is the last expression of the cell.
query = input('Search: ')
searchCorpus(query)

Search: biology


<Top 10 Results for Term('content', 'biology') runtime=0.0029712489999838>