# Google: 'LK Hadith Corpus Github'

In [None]:
!git clone https://github.com/ShathaTm/LK-Hadith-Corpus.git

Cloning into 'LK-Hadith-Corpus'...
remote: Enumerating objects: 409, done.[K
remote: Counting objects: 100% (409/409), done.[K
remote: Compressing objects: 100% (388/388), done.[K
remote: Total 409 (delta 48), reused 351 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (409/409), 13.69 MiB | 12.37 MiB/s, done.
Resolving deltas: 100% (48/48), done.


In [None]:
import glob
# Root directory of the cloned corpus (absolute path).
path = '/content/LK-Hadith-Corpus'
# Collect every CSV below the root recursively, then order the paths
# lexicographically in place.
files = glob.glob(path + '//**//*.csv', recursive=True)
files.sort()

In [None]:
import re

def clean_text(text):
  """Normalise a string before it is embedded.

  Args:
    text: The string to clean.

  Returns:
    The lowercased text with every character other than a-z, 0-9 and
    whitespace removed, each whitespace run collapsed to a single space,
    and no leading or trailing whitespace.
  """
  text = text.lower()
  # After lower() no upper-case ASCII remains, so a-z is sufficient here.
  text = re.sub(r'[^a-z0-9\s]', '', text)  # remove punctuation / non-ASCII
  text = re.sub(r'\s+', ' ', text)         # collapse whitespace runs
  # Collapsing leaves a single space at either end when the input was padded
  # (or started/ended with removed punctuation); drop it.
  return text.strip()

In [None]:
# Column names applied positionally to every corpus CSV (each file's own
# header row is skipped when it is read).
# NOTE(review): the data places the Arabic text columns directly after
# 'English_Matn' and the grades last. Listing 'English_Grade' before the
# Arabic columns labels Arabic hadith text as the English grade and puts
# "Hasan Sahih" under 'Arabic_Grade' (visible in the head() preview).
# The order below follows the corpus layout; 'Arabic_Comment' is the sparsely
# filled column between the Arabic matn and the grades. Confirm the exact
# names against the CSV header.
columns = [
    'Chapter_Number', 'Chapter_English', 'Chapter_Arabic',
    'Section_Number', 'Section_English', 'Section_Arabic',
    'Hadith_Number',
    'English_Hadith', 'English_Isnad', 'English_Matn',
    'Arabic_Hadith', 'Arabic_Isnad', 'Arabic_Matn', 'Arabic_Comment',
    'English_Grade', 'Arabic_Grade'
]

In [None]:
import pandas as pd
# Register the derived column's name exactly once. Without the guard every
# re-run of this cell appends another 'Cleaned_Hadith', and the duplicated
# names then break the read below.
if 'Cleaned_Hadith' not in columns:
  columns.append('Cleaned_Hadith')
all_hadith = []

for file in files:
  # Names are assigned positionally; skiprows=1 drops the file's own header.
  # NOTE(review): 'Cleaned_Hadith' is part of `names`, so if a file has as
  # many columns as `names`, its last column is read under that label and
  # overwritten on the next line. Verify against the CSV column count.
  df = pd.read_csv(file, names=columns, skiprows=1)
  # Missing hadith text becomes '' instead of the literal string 'nan',
  # which would otherwise be cleaned and embedded like real text.
  df['Cleaned_Hadith'] = df['English_Hadith'].fillna('').astype(str).apply(clean_text)

  # Accumulate plain row lists; the combined frame is built once afterwards.
  all_hadith.extend(df[columns].values.tolist())

In [None]:
# Build one frame from the accumulated rows and preview its first two rows.
hadith_df = pd.DataFrame(data=all_hadith, columns=columns)
hadith_df.head(n=2)

Unnamed: 0,Chapter_Number,Chapter_English,Chapter_Arabic,Section_Number,Section_English,Section_Arabic,Hadith_Number,English_Hadith,English_Isnad,English_Matn,English_Grade,Arabic_Hadith,Arabic_Isnad,Arabic_Matn,Arabic_Grade,Cleaned_Hadith
0,1.0,Purification (Kitab Al-Taharah),كتاب الطهارة,1.0,Seclusion While Relieving Oneself,باب التَّخَلِّي عِنْدَ قَضَاءِ الْحَاجَةِ,1.0,Narrated Mughirah ibn Shu'bah: When the Prophe...,Narrated Mughirah ibn Shu'bah:,When the Prophet (ﷺ) went (outside) to relieve...,حَدَّثَنَا عَبْدُ اللَّهِ بْنُ مَسْلَمَةَ بْنِ...,حَدَّثَنَا عَبْدُ اللَّهِ بْنُ مَسْلَمَةَ بْنِ...,أَنَّ النَّبِيَّ صلى الله عليه وسلم كَانَ إِذَ...,,Hasan Sahih,narrated mughirah ibn shubah when the prophet ...
1,1.0,Purification (Kitab Al-Taharah),كتاب الطهارة,1.0,Seclusion While Relieving Oneself,باب التَّخَلِّي عِنْدَ قَضَاءِ الْحَاجَةِ,2.0,Narrated Jabir ibn Abdullah: When the Prophet ...,Narrated Jabir ibn Abdullah:,When the Prophet (ﷺ) felt the need of relievin...,حَدَّثَنَا مُسَدَّدُ بْنُ مُسَرْهَدٍ، حَدَّثَن...,حَدَّثَنَا مُسَدَّدُ بْنُ مُسَرْهَدٍ، حَدَّثَن...,أَنَّ النَّبِيَّ صلى الله عليه وسلم كَانَ إِذَ...,,Sahih - Authentic,narrated jabir ibn abdullah when the prophet f...


In [None]:
# Persist the combined frame to disk; the row index is not written.
hadith_df.to_csv(path_or_buf='cleaned_hadith_data.csv', index=False)

In [None]:
!pip install sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence encoder used for both corpus and query text.
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2'
)

In [None]:
# Display the column labels of the combined frame.
hadith_df.columns

Index(['Chapter_Number', 'Chapter_English', 'Chapter_Arabic', 'Section_Number',
       'Section_English', 'Section_Arabic', 'Hadith_Number', 'English_Hadith',
       'English_Isnad', 'English_Matn', 'English_Grade', 'Arabic_Hadith',
       'Arabic_Isnad', 'Arabic_Matn', 'Arabic_Grade', 'Cleaned_Hadith'],
      dtype='object')

In [None]:
# Encode every cleaned hadith text with the sentence encoder.
cleaned_texts = hadith_df['Cleaned_Hadith'].values
embeddings = model.encode(cleaned_texts)

In [None]:
import numpy as np
# FAISS consumes C-contiguous float32 matrices; make both properties explicit
# instead of relying on whatever dtype/layout the encoder happened to return.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

In [None]:
# Cache the embedding matrix on disk.
np.save(file='hadith_embeddings.npy', arr=embeddings)

In [None]:
# Read the cached embedding matrix back from disk.
embeddings = np.load(file='hadith_embeddings.npy')

In [None]:
# FAISS - Facebook AI for Similarity Search
# !pip install faiss-gpu
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
import faiss

# Vector width, read from the second axis of the embedding matrix.
dimensions = embeddings.shape[1]
# Flat index comparing vectors by L2 distance.
faiss_index = faiss.IndexFlatL2(dimensions)   # Euclidean Distance

In [None]:
# Empty the index first so that re-running this cell does not add every
# vector a second time (which would return each hadith twice in searches).
faiss_index.reset()
faiss_index.add(embeddings)

In [None]:
# Persist the populated index to disk.
faiss.write_index(faiss_index, "faiss_index.index")

In [None]:
def get_similar_hadith(query, count=5, model=model, faiss_index=faiss_index):
  """Print the hadiths whose embeddings are nearest to a query.

  Args:
    query: Query text; it is encoded with `model`.
    count: Maximum number of neighbours to request from the index.
    model: Encoder exposing `encode(list_of_str)`; defaults to the
      notebook-level `model` as bound when this function was defined.
    faiss_index: Index exposing `search(vectors, k)`; defaults to the
      notebook-level `faiss_index` as bound when this function was defined.

  Returns:
    None. For each hit, prints its rank and distance followed by the
    'English_Hadith' text taken from the global `hadith_df` at the returned
    row position.
  """
  query_embedding = model.encode([query])
  distances, indices = faiss_index.search(query_embedding, count)

  for rank, (dist, row) in enumerate(zip(distances[0], indices[0]), start=1):
    # FAISS pads the result with index -1 when fewer than `count` vectors
    # are available; iloc[-1] would silently print the last hadith instead.
    if row < 0:
      break
    print(f"Hadith {rank}, Distance: {dist}")
    print(hadith_df['English_Hadith'].iloc[row])

In [None]:
# Example lookup using the default result count, encoder and index.
get_similar_hadith(query="How many prayers are there?")

Hadith 1, Distance: 0.7171145081520081
It was narrated that Ibn ‘Abbas said: “Your Prophet (ﷺ) was enjoined to do fifty prayers but he returned to your Lord to make (i.e., reduce) them to five prayers.”
Hadith 2, Distance: 0.7440200448036194
Abu Huraira reported Allah's Messenger (ﷺ) as saying: Prayer said in a congregation is equivalent to twenty-five (prayers) as compared with the prayer said by a single person.
Hadith 3, Distance: 0.7486097812652588
Narrated `Abdullah bin `Umar: Allah's Messenger (ﷺ) said, "The prayer in congregation is twenty seven times superior to the prayer offeredby person alone."
Hadith 4, Distance: 0.8015742301940918
It was narrated from Abu Hurairah that: The Messenger of Allah said: "The prayer in congregation is twenty-five times more virtuous than the prayer of anyone of you on his own."
Hadith 5, Distance: 0.8022949695587158
Narrated Abdullah ibn Umar: There were fifty prayers (obligatory in the beginning); and (in the beginning of Islam) washing seven t