In [None]:
from bs4 import BeautifulSoup
import requests
from neo4j import GraphDatabase
import re
import time
import json
import os
from dotenv import load_dotenv

# Load environment variables (e.g. from a local .env file) into os.environ
load_dotenv()

# Neo4j connection details. URI and user fall back to the previous hard-coded
# values, but can now be overridden through the environment.
URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
PASSWORD = os.getenv("NEO4J_PASSWORD")
if not PASSWORD:
    # Same best-effort warning style as the OpenAI key check below: without it
    # a missing password only surfaces later as an authentication failure.
    print("Please set your NEO4J_PASSWORD environment variable")
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

# OpenAI credentials used by the entity extractor
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    print("Please set your OPENAI_API_KEY environment variable")



In [2]:
import os
from openai import OpenAI
from typing import Dict, List

class EntityExtractor:
    """Extract categorized entities from OEIS entry text with OpenAI chat completions.

    OEIS is the Online Encyclopedia of Integer Sequences. The request settings
    are class attributes so they can be tuned in one place (or overridden on a
    subclass/instance) instead of being buried in the API call.
    """

    MODEL = "gpt-3.5-turbo-0125"  # Using a specific model version
    TEMPERATURE = 0.1
    MAX_TOKENS = 500
    SYSTEM_PROMPT = "You are an expert at analyzing OEIS (Online Encyclopedia of Integer Sequences) content and extracting relevant mathematical and technical entities. Respond only with structured JSON."

    def __init__(self, api_key: str):
        """
        Initialize the EntityExtractor with an OpenAI API key.

        Args:
            api_key (str): Your OpenAI API key
        """
        self.client = OpenAI(api_key=api_key)

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """
        Extract entities from OEIS text content using OpenAI's chat completion.

        Args:
            text (str): Input OEIS text to extract entities from

        Returns:
            Dict[str, List[str]]: The JSON object returned by the model, parsed
            into a dictionary. The prompt asks for the keys "sequence_ids",
            "mathematical_concepts", "authors", "cross_references" and
            "keywords". On any failure a dictionary of the form
            {"error": "<message>"} is returned instead; this method does not
            raise for API or parsing problems.
        """
        # Local import keeps this class usable even when the notebook's import
        # cell has not been executed in the current kernel.
        import json

        # Disabled prompt item, kept for reference:
        #   "Mathematical formulas and definitions. Make them 1 line maximum."
        # Cross-reference example: in the text "... CROSSREFS Cf. A071111. Sequence in context:
        # A069866 A125772 A233282 * A094947 A231474 A092621 Adjacent sequences:  A000997 A000998
        # A000999 * A001001 A001002 A001003", we need to extract A071111 and whatever comes after
        # 'Adjacent sequences', i.e. A000997, A000998, A000999, A001001, A001002, A001003.
        #
        # The JSON skeleton at the end of the prompt must itself be valid JSON
        # (no trailing comma), otherwise the model is shown a malformed example.
        prompt = f"""Please analyze the following text from the OEIS (Online Encyclopedia of Integer Sequences) 
        and extract relevant entities. Pay special attention to:
        
        1. Sequence IDs, which are important. Ignore IDs without any context (e.g., A000045, A001000)
        2. Mathematical concepts and terms
        3. Authors and contributors.
        4. Cross-references to other sequences. You can ignore the 'Sequence in context' part, but consider the 'Cf.' and the 'Adjacent sequences' part. 
        5. Keywords in the KEYWORDS section.
        
        Format each category of entities as a JSON object with appropriate categorization.
        
        Text to analyze:
        {text}

        Please provide only the JSON output with the following structure:
        {{
            "sequence_ids": [],
            "mathematical_concepts": [],
            "authors": [],
            "cross_references": [],
            "keywords": []
        }}"""

        try:
            response = self.client.chat.completions.create(
                model=self.MODEL,
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    {"role": "user", "content": prompt}
                ],
                temperature=self.TEMPERATURE,
                max_tokens=self.MAX_TOKENS,
                response_format={"type": "json_object"}  # Ensure JSON response
            )
            choice = response.choices[0]
            result = choice.message.content
        except Exception as e:
            # Best-effort contract: report the failure to the caller as data.
            return {"error": f"API call failed: {str(e)}"}

        # A completion without content is not an API failure; report it as such
        # instead of letting json.loads() fail on None.
        if not result:
            return {"error": "Model returned an empty response"}

        # Convert the JSON string response to a Python dictionary
        try:
            return json.loads(result)
        except json.JSONDecodeError:
            # Long entries can exhaust MAX_TOKENS, cutting the JSON off mid-way;
            # say so explicitly so the caller knows raising the limit may help.
            if getattr(choice, "finish_reason", None) == "length":
                return {"error": "Failed to parse JSON response: output was truncated at the max_tokens limit"}
            return {"error": "Failed to parse JSON response"}


In [None]:

# Create an instance of EntityExtractor
extractor = EntityExtractor(api_key)

# Example text: paste the text of an OEIS entry between the triple quotes
sample_text = """
"""

if sample_text.strip():
    entities = extractor.extract_entities(sample_text)

    # Print results (json is imported in the notebook's first cell)
    print(json.dumps(entities, indent=2))
else:
    # Nothing to analyze: do not spend an API call on an empty prompt
    print("sample_text is empty; paste an OEIS entry into it before running this cell")

In [None]:
init_time = time.time()

# One extractor serves every request; constructing it inside the loop only
# re-created the OpenAI client on each iteration.
extractor = EntityExtractor(api_key)

# The session is not used yet; presumably the graph-writing code that belongs
# at the placeholder below will use it.
with driver.session() as session:

    for i in range(1, 4):  # A000001..A000003; raise the upper bound to process more entries

        sequence_number = f"A{i:06d}"  # Format number with leading zeros (6 digits)
        print(sequence_number)

        url = "https://oeis.org/" + sequence_number
        print(url)

        # A timeout keeps one stalled request from hanging the whole run, and a
        # failed request skips this entry instead of aborting the loop.
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Skipping {sequence_number}: request failed ({exc})")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # Narrower alternatives to the whole entry:
        #   soup.find("div", class_="seqname") / soup.find("div", class_="seqdata")
        sequence = soup.find(class_='sequence')
        if sequence is None:
            # The page did not contain the expected entry container
            print(f"Skipping {sequence_number}: no element with class 'sequence' found")
            continue
        web_text = sequence.get_text(strip=True)

        entities = extractor.extract_entities(web_text)
        if "error" in entities:
            # extract_entities reports failures as {"error": ...}; keep those
            # out of whatever is implemented below.
            print(f"Skipping {sequence_number}: {entities['error']}")
            continue

        ## Implement your code here


print(time.time()-init_time, "secs")


A000001
https://oeis.org/A000001
{
  "sequence_ids": [
    "A000001"
  ],
  "mathematical_concepts": [
    "groups",
    "order",
    "nonisomorphic subgroups",
    "combinatorial species",
    "group number",
    "minimal order attaining",
    "conjecture",
    "finite rings",
    "isomorphism types",
    "primes",
    "asymptotics",
    "cyclic groups",
    "dihedral groups",
    "quaternion groups",
    "symmetric groups",
    "MAPLE",
    "MATHEMATICA",
    "Magma",
    "GAP"
  ],
  "authors": [
    "Lekraj Beedassy",
    "Nicolae Boicu",
    "J. H. Conway",
    "Heiko Dietrich",
    "E. A. O'Brien",
    "Daniel Forgues",
    "Muniru A Asiru",
    "N. J. A. Sloane",
    "Jorge R. F. F. Lopes",
    "Mitch Harris",
    "R. J. Mathar",
    "Michael Somos",
    "John Cannon",
    "Harvey P. Dale",
    "Derek Holt",
    "David Applegate"
  ],
  "cross_references": [
    "A046057",
    "A027623",
    "A350638",
    "A143928",
    "A350115",
    "A349495",
    "A350245",
    "A350422",
  

In [5]:
# Fetch a single OEIS entry and show its flattened text (the cell's output)
url = "https://oeis.org/"+'A053644'
print(url)
response = requests.get(url, timeout=30)  # do not hang the kernel on a stalled connection
response.raise_for_status()  # fail here on an HTTP error instead of parsing an error page
soup = BeautifulSoup(response.content, "html.parser")

sequence = soup.find(class_='sequence')
if sequence is None:
    # Clearer than the AttributeError that calling .get_text() on None would raise
    raise ValueError(f"No element with class 'sequence' found at {url}")
sequence.get_text(strip=True)

https://oeis.org/A053644


"A053644Most significant bit of n, msb(n); largest power of 2 less than or equal to n; write n in binary and change all but the first digit to zero.1170, 1, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64(list;graph;refs;listen;history;text;internal format)OFFSET0,3COMMENTSExcept for the initial term, 2^n appears 2^n times. -Lekraj Beedassy, May 26 2005a(n) is the smallest k such that row k in triangleA265705contains n. -Reinhard Zumkeller, Dec 17 2015a(n) is the sum of totient function over powers of 2 <= n. -Anthony Browne, Jun 17 2016Given positive n, reverse the bits of n and divide by 2^floor(log_2 n). Numerators are inA030101. Ignoring the initial 0, denominators are in this sequence. -Alonso del Arte, Feb 11 2020LINKSReinhard Zumkeller,Table of n, a(n) f

"A053644Most significant bit of n, msb(n); largest power of 2 less than or equal to n; write n in binary and change all but the first digit to zero.1170, 1, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64(list;graph;refs;listen;history;text;internal format)OFFSET0,3COMMENTSExcept for the initial term, 2^n appears 2^n times. -Lekraj Beedassy, May 26 2005a(n) is the smallest k such that row k in triangleA265705contains n. -Reinhard Zumkeller, Dec 17 2015a(n) is the sum of totient function over powers of 2 <= n. -Anthony Browne, Jun 17 2016Given positive n, reverse the bits of n and divide by 2^floor(log_2 n). Numerators are inA030101. Ignoring the initial 0, denominators are in this sequence. -Alonso del Arte, Feb 11 2020LINKSReinhard Zumkeller,Table of n, a(n) for n = 0..10000N. J. A. Sloane,TransformsRalf Stephan,Some divide-and-conquer sequences ...Ralf Stephan,Table of generating functionsFORMULAa(n) = a(floor(n / 2)) * 2.a(n) = 2^A000523(n).From n >= 1 onward,A053644(n) =A062383(n)/2.a(0) = 0, a(1) = 1 and a(n+1) = a(n)*floor(n/a(n)). -Benoit Cloitre, Aug 17 2002G.f.: 1/(1 - x) * (x + Sum_{k >= 1} 2^(k - 1)*x^2^k). -Ralf Stephan, Apr 18 2003a(n) = (A003817(n) + 1)/2 =A091940(n) + 1. -Reinhard Zumkeller, Feb 15 2004a(n) = Sum_{k = 1..n} (floor(2^k/k) - floor((2^k - 1)/k))*A000010(k). -Anthony Browne, Jun 17 2016a(2^m+k) = 2^m, m >= 0, 0 <= k < 2^m. -Yosu Yurramendi, Aug 07 2016MAPLEa:= n-> 2^ilog2(n):seq(a(n), n=0..80);  #Alois P. Heinz, Dec 20 2016MATHEMATICAA053644[n_] := 2^(Length[ IntegerDigits[n, 2]] - 1);A053644[0] = 0; Table[A053644[n], {n, 0, 74}] (*Jean-François Alcover, Dec 01 2011 *)nv[n_] := Module[{c = 2^n}, Table[c, {c}]]; Join[{0}, Flatten[Array[nv, 7, 0]]] (*Harvey P. 
Dale, Jul 17 2012 *)PROG(Haskell)a053644 n = if n <= 1 then n else 2 * a053644 (div n 2)--Reinhard Zumkeller, Aug 28 2014a053644_list = 0 : concat (iterate (\\zs -> map (* 2) (zs ++ zs)) [1])--Reinhard Zumkeller, Dec 08 2012, Oct 21 2011, Oct 17 2010(PARI) a(n)=my(k=1); while(k<=n, k<<=1); k>>1 \\\\Charles R Greathouse IV, May 27 2011(PARI) a(n) = if(!n, 0, 2^exponent(n)) \\\\Iain Fox, Dec 10 2018(Python)def a(n): return 0 if n==0 else 2**(len(bin(n)[2:]) - 1) #Indranil Ghosh, May 25 2017(Magma) [0] cat [2^Ilog2(n): n in [1..90]]; //Vincenzo Librandi, Dec 11 2018(Scala) (0 to 127).map(Integer.highestOneBit(_)) //Alonso del Arte, Feb 26 2020(Python)defA053644(n): return 1<<n.bit_length()-1 if n else 0 #Chai Wah Wu, Jul 27 2022CROSSREFSSeeA000035for least significant bit(n).MASKTRANS transform ofA055975(prepended with 0), MASKTRANSi transform ofA048678.Bisection ofA065267,A065279,A065291,A072376.First differences ofA063915. Cf.A076877,A073121.This is Guy Steele's sequence GS(5, 5) (seeA135416).Equals for n >= 1 the first right hand column ofA160464. -Johannes W. Meijer, May 24 2009Diagonal ofA088370. -Alois P. Heinz, Oct 28 2011Cf.A265705,A000010.Sequence in context:A309195A367026A028397*A279170A292254A292942Adjacent sequences:A053641A053642A053643*A053645A053646A053647KEYWORDnonn,nice,easyAUTHORHenry Bottomley, Mar 22 2000STATUSapproved"