## Background: 
In the fast-evolving landscape of digital content, effective search engines play a pivotal role in connecting users with relevant information. For Google, providing a seamless and accurate search experience is paramount. This project focuses on improving the search relevance for video subtitles, enhancing the accessibility of video content.

## Objective:
Develop an advanced search engine algorithm that efficiently retrieves subtitles based on user queries, with a specific emphasis on subtitle content. The primary goal is to leverage natural language processing and machine learning techniques to enhance the relevance and accuracy of search results.

# Importing the required libraries

In [7]:
import sqlite3
import pandas as pd
import numpy as np
import requests
import re
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Step 1 - Reading the Tables from Database file****

In [8]:
# Read the code below and write your observation in the next cell

conn = sqlite3.connect("eng_subtitles_database.db")
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor.fetchall())

<sqlite3.Cursor at 0x17507e7c0>

[]


### In the above cell, I am able to read the table inside the database. As mentioned earlier, table name is zipfiles. We also know from README.txt that this table contains three columns: 'num', 'name' and 'content'.****

# Step 2 - Reading the columns of Table

In [9]:
cursor.execute("PRAGMA table_info('zipfiles')")
cols = cursor.fetchall()
for col in cols:
    print(col[1])

<sqlite3.Cursor at 0x17507e7c0>

### The above code helps in checking the column names in the database table.
Let's now use SELECT * FROM zipfiles to read all the data into a df variable.

# Step 3 - Loading the Database Table inside a Pandas DataFrame

In [10]:
df = pd.read_sql_query("""SELECT * FROM zipfiles""", conn)
df.head()

DatabaseError: Execution failed on sql 'SELECT * FROM zipfiles': no such table: zipfiles

In [None]:
df.info()

### Looks like the content column do not contain the subtitles text. Instead as mentioned in README.txt, it might be latin-1 encoded.

# Step 4 - Printing content of 0th Row

In [None]:
b_data = df.iloc[0, 2]

# here 2 represent the index of content column
# 0 represents the row number


###  From the content, it appears to start with the bytes "PK\x03......", which suggests that it might be a ZIP archive file.

# Step 5 - Unzipping the content of 385th row and decoding using latin-1

In [None]:
import zipfile
import io

# Assuming 'content' is the binary data from your database
binary_data = df.iloc[385, 2]

# Decompress the binary data using the zipfile module
with io.BytesIO(binary_data) as f:
    with zipfile.ZipFile(f, 'r') as zip_file:
        # Reading only one file in the ZIP archive
        subtitle_content = zip_file.read(zip_file.namelist()[0])

# Now 'subtitle_content' should contain the extracted subtitle content
print(subtitle_content.decode('latin-1'))  # Assuming the content is latin-1 encoded text

# Step 6 - Applying the above Function on the Entire Data

In [None]:
import zipfile
import io

count = 0

def decode_method(binary_data):
    global count
    # Decompress the binary data using the zipfile module
    # print(count, end=" ")
    count += 1
    with io.BytesIO(binary_data) as f:
        with zipfile.ZipFile(f, 'r') as zip_file:
            # Assuming there's only one file in the ZIP archive
            subtitle_content = zip_file.read(zip_file.namelist()[0])

    # Now 'subtitle_content' should contain the extracted subtitle content
    return subtitle_content.decode('latin-1')  # Assuming the content is UTF-8 encoded text

In [None]:
df['file_content'] = df['content'].apply(decode_method)

df.head()

In [None]:
df.info()

In [None]:
df.tail()

# Step 7- Slice the DataFrame to get 30% of the data and store it in another DataFrame using the iloc method

In [None]:
sliced_data = df[:26000]

In [None]:
sliced_data

In [None]:
df.iloc[0,3]

# Data Preprocessing¶
# Data Cleaning

# Step 1 : Removing the timestamp from file_content column using regexx

In [None]:
import re

# Define the regex pattern
pattern = r'\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}\s*'

# Apply the regex pattern to the specified column
sliced_data['cleaned_text'] = sliced_data['file_content'].apply(lambda x: re.sub(pattern, '', x))

# Display the cleaned DataFrame
sliced_data

In [None]:
pip install nltk

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
from bs4 import BeautifulSoup
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tqdm import tqdm, tqdm_notebook
from sentence_transformers import SentenceTransformer, util

In [None]:
sliced_data

In [None]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # remove timestamps from subtitle documents
    cleaned_text = re.sub(r'\d+:\d+:\d+,\d+ --> \d+:\d+:\d+,\d+', '', text)
    # Remove line numbers
    cleaned_text = re.sub(r'\d+\s*', '', text)
    # Remove HTML tags
    cleaned_text = BeautifulSoup(cleaned_text, "html.parser").get_text(separator=" ")
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    cleaned_text = re.sub(r'[ï]', '', cleaned_text)
    cleaned_text = re.sub(r'[âª]', '', cleaned_text)
    # Remove extra whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Lemmatize tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join tokens back into text
    preprocessed_text = ' '.join(lemmatized_tokens)
    return preprocessed_text.strip()

# Apply preprocessing to 'content' column
sliced_data['processed_content'] = sliced_data['cleaned_text'].apply(preprocess_text)

# Display the preprocessed data
print(sliced_data['processed_content'])


In [None]:
sliced_data

In [None]:
sliced_data = sliced_data.drop('content', axis=1)


In [None]:
sliced_data .head()

# Document Chunking

In [None]:
def chunk_document(text, chunk_size=500, overlap=50):
    chunks = []
    words = word_tokenize(text)
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

#Apply chunking to each subtitle document
chunked_data = sliced_data['processed_content'].apply(chunk_document)

In [None]:
sliced_data .head()

# Saving the Chunked Subtitle Data in a CSV file

In [None]:
# Specify the file path for the CSV file
output_csv2_file = 'cleaned_chunked_subtitle_data.csv'

# Write the 'cleaned_text' column to a CSV file
sliced_data.to_csv(output_csv2_file, index=False, header=True)

print(f"Cleaned subtitle data has been saved to {output_csv2_file}.")

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('cleaned_chunked_subtitle_data.csv')

# Print the first few rows of the DataFrame
df.head()

In [None]:
df.shape

In [None]:
sliced_data.iloc[0,4]

In [None]:
!pip install sentence-transformers


# Generating Text Vectors Using BERT based Sentence Transformer

In [None]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
sliced_data['doc_vector_pretrained_bert'] = sliced_data.processed_content.apply(model.encode)

In [None]:
sliced_data.head()

In [None]:
sliced_data.to_csv('search.csv')

In [None]:
import pandas as pd
df=pd.read_csv('search.csv')
df

In [None]:
sliced_data.head()

# Creating Query Embeddings

In [None]:
def search(query, data, embeddings, model):
    
    query_embedding = model.encode([query])[0]
    similarities = cosine_similarity([query_embedding], embeddings)
    
    top_n = 10
    top_indices = np.argsort(similarities[0])[-top_n:][::-1]  
    results = [(data['name'][i], similarities[0][i]) for i in top_indices]
    
    return results

In [None]:
embeddings = np.array(sliced_data['doc_vector_pretrained_bert'].tolist())

In [None]:
embedding_dict = {}
for i, embedding in enumerate(embeddings):
    embedding_dict[i] = embedding

for i in range(1):
    print(f"Embedding {i}: {embedding_dict[i]}")

# Calculating Cosine Similarity Score

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

query = input("Enter your search query of English movies and series:")
search_results = search(query, sliced_data, embeddings, model)
for result in search_results:
    print("Document:", result[0])
    print("Similarity Score:", result[1])
    print()

In [None]:
ids = sliced_data.index.astype(str).tolist()
documents = sliced_data['processed_content'].tolist()
metadata = sliced_data.drop(['file_content','cleaned_text','processed_content','doc_vector_pretrained_bert'], axis = 1).to_dict(orient = 'records')

In [None]:
documents[0]

In [None]:
embeddings[0]

# Storing the Vectors generated using ChromaDB database

In [None]:
import chromadb
client = chromadb.PersistentClient(path="Embeddings")

In [None]:
collection = client.create_collection(name="SubtitleSearch_Engine", metadata={"hnsw:space": "cosine"})

In [None]:
embeddings_as_lists = [embedding.tolist() for embedding in embeddings]

In [None]:
for i, embedding in enumerate(embeddings_as_lists):

    # Add the embeddings list to your collection
    collection.add(
            documents=documents[i],
            embeddings=embeddings_as_lists[i],
            ids=ids[i],
            metadatas=metadata[i]
        )

In [None]:
results = collection.query(query_texts=[" through abraham noah moses and through jesus christ 571 why should we be so surprised that god speaks to u now through muhammad 572 who taught you those name 573 they are named in the quran"],
                           n_results=10)

In [None]:
results