# WikipediaRetriever

## Overview
>[Wikipedia](https://wikipedia.org/) is a multilingual free online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. `Wikipedia` is the largest and most-read reference work in history.

This notebook shows how to retrieve wiki pages from `wikipedia.org` into the [Document](https://python.langchain.com/api_reference/core/documents/langchain_core.documents.base.Document.html) format that is used downstream.

### Integration details

import {ItemTable} from "@theme/FeatureTables";

<ItemTable category="external_retrievers" item="WikipediaRetriever" />

## Setup
If you want to get automated tracing from individual queries, you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:

In [None]:
# os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter your LangSmith API key: ")
# os.environ["LANGSMITH_TRACING"] = "true"

### Installation

The integration lives in the `langchain-community` package. We also need to install the `wikipedia` python package itself.

In [None]:
%pip install -qU langchain_community wikipedia

## Instantiation

Now we can instantiate our retriever:

`WikipediaRetriever` parameters include:
- optional `lang`: default="en". Use it to search in a specific language part of Wikipedia
- optional `load_max_docs`: default=100. Use it to limit the number of downloaded documents. It takes time to download all 100 documents, so use a small number for experiments. There is a hard limit of 300 for now.
- optional `load_all_available_meta`: default=False. By default only the most important fields are downloaded: `Published` (date when document was published/last updated), `title`, `Summary`. If True, other fields are also downloaded.

`get_relevant_documents()` has one argument, `query`: free text which is used to find documents in Wikipedia.

In [None]:
from langchain_community.retrievers import WikipediaRetriever

# Build the retriever with no arguments, so every option keeps its default.
retriever = WikipediaRetriever()

## Usage

In [None]:
# Send a free-text query through the retriever and keep what it returns.
docs = retriever.invoke("TOKYO GHOUL")

In [None]:
# Preview only the first 400 characters of the first result's page content.
print(docs[0].page_content[:400])

Tokyo Ghoul (Japanese: 東京喰種（トーキョーグール）, Hepburn: Tōkyō Gūru) is a Japanese dark fantasy manga series written and illustrated by Sui Ishida. It was serialized in Shueisha's seinen manga magazine Weekly Young Jump from September 2011 to September 2014, with its chapters collected in 14 tankōbon volumes. The story is set in an alternate version of Tokyo where humans coexist with ghouls, beings who loo


In [None]:
def index_encoded_data(self, index, embedding_files, indexing_batch_size):
    """Load pickled ``(ids, embeddings)`` files and push them to ``index`` in batches.

    Every file is unpickled and appended to a running buffer. While the buffer
    holds strictly more than ``indexing_batch_size`` rows it is drained through
    ``self.add_embeddings``; a buffer of exactly ``indexing_batch_size`` rows is
    therefore kept until more data arrives or until the final flush after the
    last file.

    Args:
        index: Forwarded unchanged to ``self.add_embeddings``.
        embedding_files: Iterable of paths. Each file must unpickle to an
            ``(ids, embeddings)`` pair, where ``ids`` can be passed to
            ``list.extend`` and ``embeddings`` can be stacked with ``np.vstack``.
        indexing_batch_size: Batch size forwarded to ``self.add_embeddings``.
            Must be positive.

    Raises:
        ValueError: If ``indexing_batch_size`` is zero or negative. Without
            this check the drain loops could never shrink the buffer.
    """
    # Imported here so the function does not depend on a module-level import
    # of pickle being present in the surrounding file.
    import pickle

    if indexing_batch_size <= 0:
        raise ValueError(
            f"indexing_batch_size must be positive, got {indexing_batch_size}"
        )

    allids = []
    allembeddings = np.array([])
    for file_path in embedding_files:
        print(f"Loading file {file_path}")
        # SECURITY: unpickling can execute arbitrary code. Only pass files
        # that come from a trusted source.
        with open(file_path, "rb") as fin:
            ids, embeddings = pickle.load(fin)

        # The initial buffer is an empty 1-D array that cannot be stacked with
        # 2-D data, so the first non-empty payload replaces it outright.
        allembeddings = np.vstack((allembeddings, embeddings)) if allembeddings.size else embeddings
        allids.extend(ids)
        while allembeddings.shape[0] > indexing_batch_size:
            allembeddings, allids = self.add_embeddings(index, allembeddings, allids, indexing_batch_size)

    # Flush whatever is left once every file has been read.
    while allembeddings.shape[0] > 0:
        allembeddings, allids = self.add_embeddings(index, allembeddings, allids, indexing_batch_size)

    print("Data indexing completed.")


def add_embeddings(self, index, embeddings, ids, indexing_batch_size):
    """Index one leading batch and hand back what has not been indexed yet.

    At most ``indexing_batch_size`` leading entries of ``ids`` and
    ``embeddings`` are passed to ``index.index_data``.

    Returns:
        A ``(embeddings, ids)`` pair holding the entries after that batch.
    """
    # Never cut past the end when fewer rows than a full batch remain.
    cut = min(indexing_batch_size, embeddings.shape[0])
    index.index_data(ids[:cut], embeddings[:cut])
    return embeddings[cut:], ids[cut:]

In [None]:
# !pip install faiss-gpu
!pip install faiss-cpu
!pip install sentence_transformers
import faiss
import pandas as pd
import os
import numpy as np
from sentence_transformers import SentenceTransformer

In [None]:
index_folder = "/kaggle/input/wikipedia-faiss-index"

# Load every entry of the input folder as a FAISS index and write it back out
# under the same file name in /kaggle/working/.
for indexname in os.listdir(index_folder):
    source_path = os.path.join(index_folder, indexname)
    target_path = os.path.join("/kaggle/working/", indexname)
    index = faiss.read_index(source_path)
    faiss.write_index(index, target_path)
    print(f"Successfullt move the {indexname} from Input to Output")

In [None]:
model_name = "thenlper/gte-small"
sentence_transformer = SentenceTransformer(model_name)
parquet_folder = "/kaggle/input/wikipedia-20230701"

file_names = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'number', 'o', 'other', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

# One inner-product FAISS index is written per parquet file. Only files whose
# 1-based position in file_names is 22 or later are processed by this cell.
for idx, filename in enumerate(file_names):
    if (idx + 1) < 22:
        continue

    print(f"Processing file_id: {idx + 1} - file_name: {filename}.parquet ......")

    df = pd.read_parquet(os.path.join(parquet_folder, f"{filename}.parquet"))

    print(df.columns)
    print("Sample text: ", df.iloc[0]["text"])

    # Embed the whole "text" column in one call, normalizing each vector.
    sentences = df["text"].tolist()
    embeddings = sentence_transformer.encode(sentences, normalize_embeddings=True)

    del df

    document_embeddings = np.array(list(embeddings)).astype("float32")

    # The index width is taken from the embeddings themselves.
    index = faiss.IndexFlatIP(document_embeddings.shape[1])
    index.add(document_embeddings)

    faiss_index_path = f"/kaggle/working/wikipedia_embeddings_collection_{idx + 1}_{filename}.index"
    faiss.write_index(index, faiss_index_path)

    print(f"Faiss index saved to '{faiss_index_path}'")

In [None]:
index_folder1 = "/kaggle/input/wikipedia-faiss-index"
index_folder2 = "/kaggle/input/wikipedia-faiss-index"

file_names = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'number', 'o', 'other', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

# Vector width used to create every merged index.
EMBEDDING_DIM = 384

# Rows copied per reconstruct_n/add round trip. Copying in slices keeps the
# temporary buffer small while avoiding one Python call per vector.
COPY_CHUNK_ROWS = 65536

# (first, last, output path): 1-based, inclusive positions in file_names whose
# per-file indexes are concatenated, in order, into one merged index.
MERGE_GROUPS = [
    (1, 6, "/kaggle/working/merged_1.index"),
    (7, 12, "/kaggle/working/merged_2.index"),
    (13, 19, "/kaggle/working/merged_3.index"),
    (20, 28, "/kaggle/working/merged_4.index"),
]


def _source_folder(file_id):
    """Return the folder to read the per-file index for ``file_id`` from.

    Positions 12 through 20 are read from ``index_folder2``; every other
    position is read from ``index_folder1``.
    """
    return index_folder2 if 12 <= file_id <= 20 else index_folder1


def _append_vectors(target, source):
    """Append every vector stored in ``source`` to ``target``, keeping order."""
    for start in range(0, source.ntotal, COPY_CHUNK_ROWS):
        count = min(COPY_CHUNK_ROWS, source.ntotal - start)
        target.add(source.reconstruct_n(start, count))


def _merge_group(first_id, last_id, merged_index_path):
    """Merge the per-file indexes ``first_id..last_id`` and write the result.

    Args:
        first_id: 1-based position in ``file_names`` of the first file to merge.
        last_id: 1-based position of the last file to merge (inclusive).
        merged_index_path: Destination path for the merged index.
    """
    merged_index = faiss.IndexFlatL2(EMBEDDING_DIM)
    for file_id in range(first_id, last_id + 1):
        filename = file_names[file_id - 1]
        indexname = f"wikipedia_embeddings_collection_{file_id}_{filename}.index"
        print(f"Merge file {file_id} - {indexname}")
        index = faiss.read_index(os.path.join(_source_folder(file_id), indexname))
        _append_vectors(merged_index, index)
        # Drop the per-file index before the next one is loaded.
        del index

    faiss.write_index(merged_index, merged_index_path)
    print(f"Merged index saved to '{merged_index_path}'")


for first_id, last_id, merged_index_path in MERGE_GROUPS:
    _merge_group(first_id, last_id, merged_index_path)

In [None]:
merged_index = faiss.IndexFlatL2(384)
# Alternative starting point: continue from a previously merged index instead
# of an empty one.
# merged_index = faiss.read_index("/kaggle/input/wikipedia-embeddings/merged_1.index")
index_folder = "/kaggle/input/wikipedia-faiss-index"

# Rows copied per reconstruct_n/add round trip. Copying in slices keeps the
# temporary buffer small while avoiding one Python call per vector.
copy_chunk_rows = 65536

# NOTE(review): os.listdir returns entries in arbitrary order, so the position
# of each file's vectors inside the merged index depends on the filesystem.
# Impose an explicit order here if positions must map back to documents.
for idx, indexname in enumerate(os.listdir(index_folder)):
    print(f"Merge file {idx + 1} - {indexname}")
    index = faiss.read_index(os.path.join(index_folder, indexname))

    # Append this file's vectors to the merged index, preserving their order.
    for start in range(0, index.ntotal, copy_chunk_rows):
        count = min(copy_chunk_rows, index.ntotal - start)
        merged_index.add(index.reconstruct_n(start, count))

    # Drop the per-file index before the next one is loaded.
    del index

merged_index_path = "/kaggle/working/merged.index"
faiss.write_index(merged_index, merged_index_path)

print(f"Merged index saved to '{merged_index_path}'")