In [263]:
import re
import collections
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urlparse

class QuestionExtraction:
    """Scrape numbered questions and their A-D options from an HTML page.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup.
    Every ``<div>`` whose class matches ``rvps<digits>`` is searched for a
    ``<table>``; the ``<p>`` tags of each table are then scanned for a
    question number paragraph (``"12."``) and option label paragraphs
    (``"A)"`` .. ``"D)"``).  The content of a question/option is taken from
    the first following ``<p>`` that holds an ``<img>`` or non-empty text.

    Typical use::

        questions = QuestionExtraction(url).questionList()
    """

    # A paragraph whose whole (stripped) text is a question number, e.g. "12.".
    _QUESTION_NO_RE = re.compile(r'^\d+\.$')
    # A paragraph whose whole (stripped) text is an option label, e.g. "B)".
    _OPTION_LABEL_RE = re.compile(r'^[A-D]\)$')
    # Class names of the <div> elements that are searched for tables.
    _QUESTION_DIV_RE = re.compile(r'rvps\d+')
    # Seconds to wait for the server so a dead host cannot hang the scrape.
    _REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Remember *url*; nothing is downloaded until a method is called."""
        self.url = url
        self.soup = None
        self.question_elements = None
        self.table_ele = []
        self.tr_ele = []
        self.td_ele = []
        self.questions_list = []
        self.domainURL = None
        self.libPath = None
        self.qids = []
        self.qcounter = None

    def urlParser(self):
        """Derive ``domainURL`` and ``libPath`` from ``self.url``.

        ``domainURL`` is ``scheme://netloc``.  ``libPath`` is ``domainURL``
        plus the first path segment and a trailing slash; it is the prefix
        put in front of every image ``src`` found on the page.  A URL
        without any path yields ``domainURL + '/'``.
        """
        urlInfo = urlparse(self.url)
        self.domainURL = urlInfo.scheme + '://' + urlInfo.netloc

        # path "/a/b.html" splits into ['', 'a', 'b.html']; an empty path
        # splits into [''] and therefore has no first segment at all.
        segments = urlInfo.path.split('/')
        first_segment = segments[1] if len(segments) > 1 else ''
        if first_segment:
            self.libPath = self.domainURL + '/' + first_segment + '/'
        else:
            self.libPath = self.domainURL + '/'

    def extractHtml(self):
        """Download ``self.url`` and return it parsed, or ``None`` on failure.

        ``None`` is returned when the request raises a
        ``requests.RequestException`` (connection error, timeout, ...) or
        when the response status is not 200.
        """
        try:
            response = requests.get(self.url, timeout=self._REQUEST_TIMEOUT)
        except requests.RequestException:
            return None

        if response.status_code != 200:
            return None
        return BeautifulSoup(response.text, 'html.parser')

    def findQuestionElements(self):
        """Fetch the page and return its ``<div class="rvps<N>">`` elements.

        The parsed page is stored in ``self.soup`` and the result in
        ``self.question_elements``.  An empty list is returned when the page
        could not be downloaded.
        """
        self.soup = self.extractHtml()
        if self.soup is None:
            self.question_elements = []
        else:
            self.question_elements = self.soup.find_all(
                'div', class_=self._QUESTION_DIV_RE)
        return self.question_elements

    def questionTableElements(self):
        """Return the first ``<table>`` of every question ``<div>``.

        Also refreshes ``domainURL``/``libPath``.  ``self.table_ele`` is
        rebuilt on every call so repeated calls do not accumulate tables,
        and divs that contain no table are skipped.
        """
        self.urlParser()
        self.table_ele = []

        for question_elem in self.findQuestionElements():
            question_table = question_elem.find('table')
            if question_table is not None:
                self.table_ele.append(question_table)
        return self.table_ele

    def questionTrElements(self, question_table):
        """Return all ``<tr>`` descendants of *question_table*."""
        return question_table.find_all('tr')

    def questionTrTdElements(self, question_tr):
        """Return all ``<td>`` descendants of *question_tr*."""
        return question_tr.find_all('td')

    def questionTableAllTdEelements(self, question_table):
        """Return all ``<td>`` descendants of *question_table*."""
        return question_table.find_all('td')

    def questionTableAllPEelements(self, question_table):
        """Return all ``<p>`` descendants of *question_table*."""
        return question_table.find_all('p')

    def _firstTextOrImg(self, ptags, findIndex):
        """Return the content that follows ``ptags[findIndex]``.

        Walks the paragraphs after *findIndex* and returns, for the first
        one that qualifies, either ``libPath + src`` of its ``<img>`` or its
        stripped text.  Returns ``''`` when no later paragraph has content.
        """
        if self.libPath is None:
            # Allow direct use without a prior questionTableElements() call.
            self.urlParser()

        for ptag in ptags[findIndex + 1:]:
            img = ptag.find('img')
            if img is not None:
                return self.libPath + img['src']

            text = ptag.text.strip()
            if text:
                return text
        return ''

    def extractQuestionTextORImg(self, qtable_ptags, findIndex):
        """Return the question image URL or text following *findIndex*.

        *qtable_ptags* is the list of ``<p>`` tags of one table and
        *findIndex* the position of the question number paragraph.
        Returns ``''`` when nothing follows.
        """
        return self._firstTextOrImg(qtable_ptags, findIndex)

    def extractOptionTextORImg(self, qtable_ptags, findIndex):
        """Return the option image URL or text following *findIndex*.

        *qtable_ptags* is the list of ``<p>`` tags of one table and
        *findIndex* the position of the option label paragraph.
        Returns ``''`` when nothing follows.
        """
        return self._firstTextOrImg(qtable_ptags, findIndex)

    def _parseTable(self, ptags):
        """Scan one table's paragraphs; return ``(qno, question, options)``.

        ``qno`` is the integer of the first question number paragraph, or
        ``None`` when the table has none (``question`` is then ``''``).
        ``options`` is a list of ``{'option_index', 'option'}`` dicts, one
        per option label paragraph, in document order.
        """
        qno = None
        question = ''
        options = []

        for index, ptag in enumerate(ptags):
            label = ptag.text.strip()

            # Only the first number in a table is taken as the question.
            if qno is None and self._QUESTION_NO_RE.match(label):
                qno = int(label.replace('.', ''))
                question = self.extractQuestionTextORImg(ptags, index)

            if self._OPTION_LABEL_RE.match(label):
                options.append({
                    'option_index': label.replace(')', ''),
                    'option': self.extractOptionTextORImg(ptags, index),
                })
        return qno, question, options

    def _mergeQuestion(self, index_by_qno, qno, question, qoptions):
        """Add question *qno* to ``questions_list`` or merge into its entry.

        *index_by_qno* maps a question number to its position in
        ``self.questions_list`` and is updated when a new entry is added.
        """
        position = index_by_qno.get(qno)
        if position is None:
            index_by_qno[qno] = len(self.questions_list)
            self.questions_list.append({
                'qno': qno,
                'question': question,
                'options': qoptions,
                'qtype': 'M' if qoptions else 'N',
            })
            return

        existing = self.questions_list[position]
        if len(qoptions) > len(existing['options']):
            # A later table carried more options: it replaces the old set.
            existing['options'] = qoptions
            existing['qtype'] = 'M'
        elif len(qoptions) == len(existing['options']):
            # NOTE(review): equally sized sets are concatenated, presumably
            # for questions whose options are split over two tables; if the
            # same table is seen twice this duplicates options -- confirm.
            existing['options'].extend(qoptions)

    def questionList(self):
        """Download the page and return the list of extracted questions.

        Each entry is a dict with the keys ``qno`` (int), ``question``
        (image URL or text), ``options`` (list of ``{'option_index',
        'option'}`` dicts) and ``qtype`` (``'M'`` when the question has
        options, otherwise ``'N'``).

        A table without a question number is treated as belonging to the
        most recently seen question; such tables are skipped while no
        question has been seen yet.  The result is rebuilt from scratch on
        every call and is also stored in ``self.questions_list``.  An empty
        list is returned when the page cannot be downloaded.
        """
        question_tables = self.questionTableElements()

        self.questions_list = []
        self.qids = []
        self.qcounter = 0

        index_by_qno = {}
        qno = None
        question = ''

        for question_table in question_tables:
            ptags = self.questionTableAllPEelements(question_table)
            found_qno, found_question, qoptions = self._parseTable(ptags)

            if found_qno is not None:
                qno, question = found_qno, found_question
            elif qno is None:
                # Nothing to attach this table to yet.
                continue

            self._mergeQuestion(index_by_qno, qno, question, qoptions)

        return self.questions_list
            
        
    

# URL of the HTML content
url = 'https://vsa-digital.etutor.co/SR_SANKALP_QP_GT-513-01-2023ONLI/SR_SANKALP_QP_GT-513-01-2023ONLI.html'

# Guarded so that importing this module does not trigger a network request;
# when run as a script or notebook cell the names below are still globals.
if __name__ == "__main__":
    quesObj = QuestionExtraction(url)
    qresult = quesObj.questionList()
    print(qresult)

#print(len(quesObj.questionList()))
#qids=[item['qno'] for item in qresult]
#print(qids)

# for q in qresult:
#     print(q)
#     print()

#dupes = [x for key in enumerate(qresult) if x in a[:n]]
#[q['qno'] for key,q in enumerate(qresult) if q['qno'] in [item['qno'] for item in qresult[:key]]]

# a = [[1], [2], [3], [1], [5], [3]]
# for key,q in enumerate(a):
#     print(key,a[:key])
    

[{'qno': 1, 'question': 'https://vsa-digital.etutor.co/SR_SANKALP_QP_GT-513-01-2023ONLI/lib/NewItem%20298.jpg', 'options': [{'option_index': 'A', 'option': 'https://vsa-digital.etutor.co/SR_SANKALP_QP_GT-513-01-2023ONLI/lib/NewItem%20297.png'}, {'option_index': 'B', 'option': 'https://vsa-digital.etutor.co/SR_SANKALP_QP_GT-513-01-2023ONLI/lib/NewItem%20296.png'}, {'option_index': 'C', 'option': 'https://vsa-digital.etutor.co/SR_SANKALP_QP_GT-513-01-2023ONLI/lib/NewItem%20295.png'}, {'option_index': 'D', 'option': 'https://vsa-digital.etutor.co/SR_SANKALP_QP_GT-513-01-2023ONLI/lib/NewItem%20294.jpg'}], 'qtype': 'M'}, {'qno': 2, 'question': 'https://vsa-digital.etutor.co/SR_SANKALP_QP_GT-513-01-2023ONLI/lib/NewItem%20293.jpg', 'options': [{'option_index': 'A', 'option': 'https://vsa-digital.etutor.co/SR_SANKALP_QP_GT-513-01-2023ONLI/lib/NewItem%20292.jpg'}, {'option_index': 'B', 'option': 'https://vsa-digital.etutor.co/SR_SANKALP_QP_GT-513-01-2023ONLI/lib/NewItem%20291.jpg'}, {'option_in