# Test Simple Multi-Threading:
* multi-threading are suitable for non-cpu intensive tasks and I/O tasks.
* since actual search operation is performed by mongodb, using multi-threading should be better than multi-processing

In [32]:
from django.shortcuts import render
import threading
from concurrent.futures import ThreadPoolExecutor
import time

def search(query):
    texts = ['hi, how are you', 'is there anyone there', 'hello']
    results = [text for text in texts if query in text]
    thread_id = threading.current_thread().ident
    print(f'Thread ID: {thread_id}, query: {query}, results: {results}')
    time.sleep(5)
    return results

def parallel_search(query_parts):
    with ThreadPoolExecutor() as executor:
        # Use ThreadPoolExecutor to execute search function for each query part
        results = [result for sublist in executor.map(search, query_parts) for result in sublist]
    return results

result = parallel_search(['hi', 'there'])
print(result)

'''
output: ['hi, how are you', 'is there anyone there']
'''
 

Thread ID: 140329498961600, query: hi, results: ['hi, how are you']
Thread ID: 140329512597184, query: there, results: ['is there anyone there']
['hi, how are you', 'is there anyone there']


"\noutput: ['hi, how are you', 'is there anyone there']\n"

## Multi-Threading Search

In [11]:
from django.shortcuts import render
import threading
from concurrent.futures import ThreadPoolExecutor
from mongo_db_handler import MongoDBHandler
from redis_cache import RedisCache
import time

mongo_handler = MongoDBHandler(collection_name = "pdf_collection", db_name="pdf_engine")
mongo_handler.count_entries()


def parallel_search(query):
    ignore_words = ['how', 'whatever', 'him', 'would', 'they', 'whichever', 'what', 'whysoever', 'have', 'why', 'can', 'our', 'whosoever', 'her', 'whatsoever', 'wherever', 'whyever', 'and', 'whoever', 'that', 'when', 'this', 'which', 'whenever', 'them', 'been', 'his', 'for', 'whensoever', 'the', 'their', 'was', 'but', 'one', 'whosesoever', 'whomsoever', 'whom', 'not', 'all', 'howsoever', 'will', 'you', 'your', 'were', 'with', 'has', 'she', 'from', 'are', 'wheresoever', 'whose', 'had', 'who', 'where', 'there', 'whomever', 'best']
    query_parts = [word for word in list(set(query.split() + [query])) if ( (word not in ignore_words) and len(word)>=3)]   # ['Engineering', 'Physics', 'Engineering Physics']
    print(query_parts)
    reddis_cache = RedisCache()
    with ThreadPoolExecutor() as executor:
        # Use ThreadPoolExecutor to execute search function for each query part
        results = [result for sublist in executor.map(reddis_cache.get, query_parts) for result in sublist]
    return results

query = "Engineering Physics the of dd"

average_time = 0
for i in range(10):
    start_time = time.time()
    results = parallel_search(query)
    stop_time = time.time()
    average_time += stop_time - start_time
average_time /= 10
print(f'avergae time: {average_time} seconds. \n len(results): {len(results)}')

Number of entries in the collection: 51471
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
avergae time: 12.506405329704284 seconds. 
 len(results): 929


## No Multi-threading Search

In [12]:
from django.shortcuts import render
import threading
from concurrent.futures import ThreadPoolExecutor
from mongo_db_handler import MongoDBHandler
from redis_cache import RedisCache
import time

mongo_handler = MongoDBHandler(collection_name = "pdf_collection", db_name="pdf_engine")
mongo_handler.count_entries()

average_time = 0
def not_parallel_search(query):
    ignore_words = ['how', 'whatever', 'him', 'would', 'they', 'whichever', 'what', 'whysoever', 'have', 'why', 'can', 'our', 'whosoever', 'her', 'whatsoever', 'wherever', 'whyever', 'and', 'whoever', 'that', 'when', 'this', 'which', 'whenever', 'them', 'been', 'his', 'for', 'whensoever', 'the', 'their', 'was', 'but', 'one', 'whosesoever', 'whomsoever', 'whom', 'not', 'all', 'howsoever', 'will', 'you', 'your', 'were', 'with', 'has', 'she', 'from', 'are', 'wheresoever', 'whose', 'had', 'who', 'where', 'there', 'whomever', 'best']
    query_parts = [word for word in list(set(query.split() + [query])) if ( (word not in ignore_words) and len(word)>=3)]   # ['Engineering', 'Physics', 'Engineering Physics']
    print(query_parts)
    reddis_cache = RedisCache()
    results = []
    for query_part in query_parts:
        result = reddis_cache.get(query_part)
        results.extend(result)
    return results

query = "Engineering Physics the of dd"
average_time = 0
for i in range(10):
    start_time = time.time()
    results = not_parallel_search(query)
    stop_time = time.time()
    average_time += stop_time - start_time
average_time /= 10
print(f'avergae time: {average_time} seconds. \n len(results): {len(results)}')

Number of entries in the collection: 51471
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
['Engineering', 'Engineering Physics the of dd', 'Physics']
avergae time: 14.878536605834961 seconds. 
 len(results): 929


In [7]:
results[0]

{'id': '1zESpo4jn7Q3OQ9N2EimtzAE05D7eJAnw',
 'title': 'BCT',
 'view_link': 'https://drive.google.com/drive/folders/1zESpo4jn7Q3OQ9N2EimtzAE05D7eJAnw',
 'owners': [{'displayName': 'Raj Dhakal',
   'kind': 'drive#user',
   'isAuthenticatedUser': False,
   'permissionId': '14172705132403258789',
   'emailAddress': 'rajdhakal2056@gmail.com',
   'picture': {'url': 'https://lh3.googleusercontent.com/a-/ALV-UjWBVCyx8Mbgo57jpFt1nDLNTwEapPepUcThCKhpxK7RGQ=s64'}}],
 'userPermission': {'id': 'me',
  'type': 'user',
  'role': 'reader',
  'kind': 'drive#permission',
  'selfLink': 'https://www.googleapis.com/drive/v2/files/1zESpo4jn7Q3OQ9N2EimtzAE05D7eJAnw/permissions/me',
  'etag': '"rZqIqmyDVna0oJomt994saFghgw"',
  'pendingOwner': False},
 'quotaBytesUsed': '0',
 'mimeType': 'application/vnd.google-apps.folder',
 'parents': [],
 'children_list': ['DBMS_Unit2_Lecture1.pdf',
  'hasstring.cpython-37.pyc',
  'rfc8649.py',
  'lab 6.pdf',
  'cred_anonymous.py',
  'openssh_compat',
  'fallback.py',
  '1.

## Average Time by simple search from  mongo_db

In [18]:
import time
from mongo_db_handler import MongoDBHandler
mongo_handler = MongoDBHandler(collection_name = "pdf_collection", db_name="pdf_engine")
mongo_handler.count_entries()
average_time = 0
for i in range(50):
    print(i, end=':')
    start_time = time.time()
    res = mongo_handler.search("physics")
    print(len(list(res)))
    stop_time = time.time()
    duration = stop_time - start_time
    average_time += duration
average_time /= 15
average_time    

Number of entries in the collection: 51471
0:47
1:47
2:47
3:47
4:47
5:47
6:47
7:47
8:47
9:47
10:47
11:47
12:47
13:47
14:47
15:47
16:47
17:47
18:47
19:47
20:47
21:47
22:47
23:47
24:47
25:47
26:47
27:47
28:47
29:47
30:47
31:47
32:47
33:47
34:47
35:47
36:47
37:47
38:47
39:47
40:47
41:47
42:47
43:47
44:47
45:47
46:47
47:47
48:47
49:47


5.98073132832845

## Average Time by search_by_id from mongodb

In [19]:
import json
import time

with open('/home/anon/ioee/pdf_engine_search_result_physics.json', 'r') as f:
    data = json.load(f)

ids = [d['id'] for d in data]


from mongo_db_handler import MongoDBHandler
mongo_handler = MongoDBHandler(collection_name = "pdf_collection", db_name="pdf_engine")
mongo_handler.count_entries()
average_time = 0
for i in range(50):
    print(i, end=':')
    start_time = time.time()
    res = mongo_handler.get_by_ids(ids)
    print(len(list(res)))
    stop_time = time.time()
    duration = stop_time - start_time
    average_time += duration
average_time /= 15
average_time

Number of entries in the collection: 51471
0:47
1:47
2:47
3:47
4:47
5:47
6:47
7:47
8:47
9:47
10:47
11:47
12:47
13:47
14:47
15:47
16:47
17:47
18:47
19:47
20:47
21:47
22:47
23:47
24:47
25:47
26:47
27:47
28:47
29:47
30:47
31:47
32:47
33:47
34:47
35:47
36:47
37:47
38:47
39:47
40:47
41:47
42:47
43:47
44:47
45:47
46:47
47:47
48:47
49:47


3.5937604268391925

## Average Time by entire-search-result-in-redis from Redis insertation

In [20]:
import redis

r = redis.Redis(
host='redis-19089.c212.ap-south-1-1.ec2.cloud.redislabs.com',
port=19089,
password='uUzG92fuoNbpDa2922BTIFnpXyPSgIrO')


In [21]:
import redis
import json

with open('/home/anon/ioee/pdf_engine_search_result_physics.json', 'r') as f:
    data = json.load(f)

r.set('physics', json.dumps(data))
# True




import json
import time

ids = [d['id'] for d in data]

average_time = 0
for i in range(50):
    print(i, end=':')
    start_time = time.time()
    res = json.loads(r.get('physics'))
    print(len(list(res)))
    stop_time = time.time()
    duration = stop_time - start_time
    average_time += duration
average_time /= 15
average_time



# b'bar'

0:47
1:47
2:47
3:47
4:47
5:47
6:47
7:47
8:47
9:47
10:47
11:47
12:47
13:47
14:47
15:47
16:47
17:47
18:47
19:47
20:47
21:47
22:47
23:47
24:47
25:47
26:47
27:47
28:47
29:47
30:47
31:47
32:47
33:47
34:47
35:47
36:47
37:47
38:47
39:47
40:47
41:47
42:47
43:47
44:47
45:47
46:47
47:47
48:47
49:47


2.0722643693288165

## Average Time by ids-from-redis, data-from-mongo

In [17]:
import redis
import json

with open('/home/anon/ioee/pdf_engine_search_result_physics.json', 'r') as f:
    data = json.load(f)

r.set('physics_ids', json.dumps([d['id'] for d in data]))

from mongo_db_handler import MongoDBHandler
mongo_handler = MongoDBHandler(collection_name = "pdf_collection", db_name="pdf_engine")
mongo_handler.count_entries()
average_time = 0
for i in range(50):
    print(i, end=':')
    start_time = time.time()
    ids = json.loads(r.get('physics_ids'))
    res = mongo_handler.get_by_ids(ids)
    print(len(list(res)))
    stop_time = time.time()
    duration = stop_time - start_time
    average_time += duration
average_time /= 15
average_time

Number of entries in the collection: 51471
0:47
1:47
2:47
3:47
4:47
5:47
6:47
7:47
8:47
9:47
10:47
11:47
12:47
13:47
14:47
15:47
16:47
17:47
18:47
19:47
20:47
21:47
22:47
23:47
24:47
25:47
26:47
27:47
28:47
29:47
30:47
31:47
32:47
33:47
34:47
35:47
36:47
37:47
38:47
39:47
40:47
41:47
42:47
43:47
44:47
45:47
46:47
47:47
48:47
49:47


2.4366972128550213

In [25]:
r.set('some thing', 'other thing')
r.get('some thing')

b'other thing'

In [15]:
from redis_cache import RedisCache
cache = RedisCache()
redis_client = cache.redis
# cache.redis.set('test', 'value' * 100000)

# Number of keys in database
cache.redis.dbsize()

# Get all keys
# Get all keys in the database
# all_keys = redis_client.keys('*')
# all_keys

redis_client.get('maam')

b'[{"id": "1XhagkQn6purWMeE1stCR3519K4aoQ7fs", "title": "EPP", "view_link": "https://drive.google.com/drive/folders/1XhagkQn6purWMeE1stCR3519K4aoQ7fs", "owners": [{"displayName": "event and tech gurus", "kind": "drive#user", "isAuthenticatedUser": false, "permissionId": "13430313145412168786", "emailAddress": "drivebook33@gmail.com", "picture": {"url": "https://lh3.googleusercontent.com/a/default-user=s64"}}], "userPermission": {"id": "me", "type": "user", "role": "reader", "kind": "drive#permission", "selfLink": "https://www.googleapis.com/drive/v2/files/1XhagkQn6purWMeE1stCR3519K4aoQ7fs/permissions/me", "etag": "\\"0UUg-z5OskrwpUYQr4Bh9JvauAQ\\"", "pendingOwner": false}, "quotaBytesUsed": "0", "mimeType": "application/vnd.google-apps.folder", "parents": [], "children_list": ["EPP Chapter 1 History of Engineering Practices.pptx", "epp-insight.pdf", "Chapter 1_2 ST_HD.pdf", "CT752_2020-08-13T03-12-25.926Z_The Nepal Engineering Council Act.pdf", "chapters", "Chapter 5.pdf", "uH6aw5yqh71

In [39]:
for n, key in enumerate(reversed(redis_client.keys('*'))):
    print(n, key)# , redis_client.get(key))
#     # redis_client.delete(key)
# redis_client.delete('big fucking data')

0 b'big data'


0

In [27]:
least_used_keys = redis_client.zrange('test', 0, 3)
least_used_keys

[]