In [None]:
from rag_eng import read_dot_env
import requests
import os
import json
from typing import List, Optional
import base64

In [18]:
# read_dot_env comes from the project-local rag_eng module; presumably it loads
# variables from a .env file into the process environment (not visible here).
read_dot_env()
access_token = os.getenv("WIKI_MEDIA_ACCESS_TOKEN")
# Fail fast with an actionable message rather than a bare AssertionError.
assert access_token is not None, "WIKI_MEDIA_ACCESS_TOKEN is not set in the environment"

In [19]:
def url_builder(
    protocol: str,
    subdomains: Optional[List[str]],
    second_level_domain: str,
    top_level_domain: str,
    sub_directories: Optional[List[str]],
):
    """Assemble a URL string from its individual components.

    The result has the shape
    ``protocol://[subdomain.]*second_level_domain.top_level_domain[/directory]*``.
    Components are concatenated verbatim; no escaping or normalisation is done.

    Args:
        protocol: Must be ``'http'`` or ``'https'``.
        subdomains: Labels placed, in order, before the second-level domain.
            ``None`` or an empty list contributes nothing.
        second_level_domain: Domain label directly before the top-level domain.
        top_level_domain: Final domain label (for example ``'org'``).
        sub_directories: Path segments appended in order, each preceded by
            ``/``. ``None`` or an empty list contributes nothing.

    Returns:
        The assembled URL string.

    Raises:
        AssertionError: If ``protocol`` is not ``'http'``/``'https'``, or if
            any domain, subdomain, or directory component is not exactly of
            type ``str``.
    """
    assert protocol in ('http', 'https')
    # An exact type check also rejects None, so no separate None test is needed.
    assert type(second_level_domain) is str
    assert type(top_level_domain) is str

    # Collect every dot-separated host label, subdomains first.
    host_labels = []
    if subdomains is not None and len(subdomains) > 0:
        for label in subdomains:
            assert type(label) is str
            host_labels.append(label)
    host_labels.extend((second_level_domain, top_level_domain))

    # Collect the path segments; each one is emitted with a leading slash.
    path_parts = []
    if sub_directories is not None and len(sub_directories) > 0:
        for segment in sub_directories:
            assert type(segment) is str
            path_parts.append(f"/{segment}")

    return "".join([f"{protocol}://", ".".join(host_labels), *path_parts])


## Failing token

In [None]:
import requests

# Step 1: Get an access token using the OAuth 2.0 client-credentials grant.
# Credentials are read from the environment when available so real secrets
# never have to be pasted into the notebook; the placeholders remain the
# fallback when the variables are unset.
client_id = os.getenv("WIKI_MEDIA_CLIENT_ID", "your_client_id")
client_secret = os.getenv("WIKI_MEDIA_CLIENT_SECRET", "your_client_secret")

# Create basic auth header: base64("client_id:client_secret")
credentials = base64.b64encode(f"{client_id}:{client_secret}".encode()).decode()

token_headers = {
    'Authorization': f'Basic {credentials}',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'YourAppName/1.0 (your@email.com)'
}

token_data = {
    'grant_type': 'client_credentials'
}

# Get token. An explicit timeout is required: requests waits indefinitely by
# default, which would hang the notebook on an unresponsive server.
token_response = requests.post('https://meta.wikimedia.org/w/rest.php/oauth2/access_token',
                              headers=token_headers,
                              data=token_data,
                              timeout=10)

if token_response.status_code == 200:
    # NOTE: this rebinds the notebook-level `access_token` variable.
    access_token = token_response.json()['access_token']

    # Now use this token for API calls
    api_headers = {
        'Authorization': f'Bearer {access_token}',
        'User-Agent': 'YourAppName/1.0 (your@email.com)'
    }

    response = requests.get('https://api.wikimedia.org/core/v1/wikipedia/en/search/page',
                           headers=api_headers,
                           params={'q': 'python'},
                           timeout=10)
    # Only decode the body as JSON when the request succeeded; otherwise show
    # the status and raw body so a non-JSON error page does not raise here.
    if response.ok:
        print(response.json())
    else:
        print("Search request failed:", response.status_code, response.text)
else:
    print("Token request failed:", token_response.text)

The request below was failing with my access token, so I wrote the client-credentials flow above as a workaround. However, you do get 500 free requests per hour without a token, so default to the free API for now.

The [Wikimedia docs](https://api.wikimedia.org/wiki/Searching_for_Wikipedia_articles_using_Python) are a helpful reference.

In [None]:
language_code = 'en'
search_query = 'donald trump epstein'
number_of_results = 1
headers = {
    # Authorization is left disabled, so this request is sent without a token.
    # "Authorization": f"Bearer {access_token.strip()}",
    "User-Agent": "IU-AUTONOMOUS-FACT-CHECKER (kevin.sullivan@sullivansoftware.dev)",
    "Accept": "*/*",
}

# Build the search endpoint from language_code so that changing the variable
# above actually changes which Wikipedia edition is queried.
url = url_builder('https', ['api'], 'wikimedia', 'org', ['core', 'v1', 'wikipedia', language_code, 'search', 'page'])
params = {'q': search_query, 'limit': number_of_results}

print(url)

# Explicit timeout: requests waits indefinitely by default.
response = requests.get(url, headers=headers, params=params, timeout=10)

https://api.wikimedia.org/core/v1/wikipedia/en/search/page


In [28]:
# Display the Response object's repr to check the HTTP status of the search request.
response

<Response [200]>

In [29]:
# Decode the response body as JSON and display the result.
response.json()

{'pages': [{'id': 51978750,
   'key': 'Donald_Trump_sexual_misconduct_allegations',
   'title': 'Donald Trump sexual misconduct allegations',
   'excerpt': '<span class="searchmatch">Donald</span> <span class="searchmatch">Trump</span> of rape, kissing and groping without consent; looking under women&#039;s skirts; and walking in on naked teenage pageant contestants. <span class="searchmatch">Trump</span> has',
   'matched_title': None,
   'anchor': None,
   'description': '',
   'thumbnail': None}]}