### About
#### Retrieve StackExchange data using the API and pre-process it with the Gemini LLM.

### Import necessary libraries and packages

In [127]:
import ast
import os
import subprocess
import time
import webbrowser
from urllib.parse import parse_qs, urlencode

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm_notebook, tqdm

### Get authentication code from redirect_uri

In [None]:

# Stack Apps OAuth settings. The client id is taken from the environment when it is
# set; otherwise replace the placeholder by hand.
client_id = os.environ.get("STACKAPPS_CLIENT_ID", "stackapp_clien_id")

redirect_uri = "https://stackexchange.com/oauth/login_success"
scope = "no_expiry"

# urlencode percent-escapes the values, so the redirect URI (which itself contains
# "://" and "/") is passed as a single well-formed query parameter.
auth_url = "https://stackoverflow.com/oauth?" + urlencode({
    "client_id": client_id,
    "scope": scope,
    "redirect_uri": redirect_uri,
})

# webbrowser selects the platform's browser, so no per-OS command is needed.
# If no browser can be launched, show the URL so it can be opened manually.
if not webbrowser.open(auth_url):
    print("Open this URL in a browser to authorise the app:", auth_url)




In [None]:

# The client secret is taken from the environment when it is set; otherwise replace
# the placeholder by hand.
client_secret = os.environ.get("STACKAPPS_CLIENT_SECRET", "stackapp client_secret")
redirect_uri = "https://stackexchange.com/oauth/login_success"

# Paste the value of the `code` query parameter from the URL the browser was
# redirected to after the app was authorised.
code = "authentication code"

# Exchange authorization code for access token
token_url = "https://stackoverflow.com/oauth/access_token"
data = {
    'client_id': client_id,
    'client_secret': client_secret,
    'code': code,
    'redirect_uri': redirect_uri,
}

response = requests.post(token_url, data=data)
# Stop here on an HTTP error instead of trying to parse an error body as a token.
response.raise_for_status()

# The body is form-encoded ("access_token=...&expires=..."), so parse it as a query
# string and keep only the token value.
access_token = parse_qs(response.text)["access_token"][0]

# NOTE: this prints a secret; clear the cell output before sharing the notebook.
print("Access Token:", access_token)


### Class to retrieve stackexchange data

In [98]:
class StackExchangeScrapper():
    """Download questions and their accepted answers from the Stack Exchange API.

    After `fetch_specified_results` has run, `qna_df` holds one row per question.
    Question fields are stored as strings under a ``q_`` prefix; accepted-answer
    fields use an ``a_`` prefix, with the answer body reduced to plain text in
    ``a_answer``.
    """

    # Question attributes copied into every record, each under a "q_" prefix.
    _QUESTION_FIELDS = ('title', 'link', 'tags', 'question_id', 'is_answered', 'accepted_answer_id',
                        'view_count', 'answer_count', 'score', 'last_activity_date', 'creation_date')

    def __init__(self,access_token: object, app_key: object):
        """Store the OAuth access token and the application key sent with every request."""
        self.access_token = access_token
        self.app_key = app_key
        # Start with an empty frame so the attribute has the same type before and after a fetch.
        self.qna_df = pd.DataFrame()

    def __request_json(self, url, params):
        """GET `url` with `params` plus the credentials.

        Returns the decoded JSON body, or None (after printing the status code)
        when the response status is not 200.
        """
        response = requests.get(
            url,
            params={**params, 'access_token': self.access_token, 'key': self.app_key},
        )
        if response.status_code != 200:
            print(f"Failed to fetch data: {response.status_code}")
            return None

        data = response.json()
        # The API can ask the client to wait `backoff` seconds before calling the same
        # method again; honour it so a long crawl is not throttled.
        if data.get('backoff'):
            time.sleep(data['backoff'])
        return data

    def __get_total_question(self, tag, site):
        """Return how many questions on `site` carry `tag` (0 if the request fails)."""
        url = "https://api.stackexchange.com/2.3/questions"
        data = self.__request_json(url, {"tagged": tag, "site": site, "filter": "total"})
        return data.get('total', 0) if data else 0

    def __get_question_answer(self, tag, site, page, pagesize):
        """Fetch one page of questions, highest voted first, with their accepted answers.

        Returns `(records, has_more)`. `records` contains one dict per question that
        has an accepted answer which could be retrieved. `has_more` is False when
        the request failed or the API has no further pages.
        """
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "order": "desc",
            "sort": "votes",
            "tagged": tag,
            "site": site,
            "filter": "!-tsS42nh",  # Custom filter to exclude images
            "page": page,
            "pagesize": pagesize,
        }

        data = self.__request_json(url, params)
        if data is None:
            return [], False

        items = data.get('items', [])
        records = []
        for question in items:
            if not question.get('is_answered') or 'accepted_answer_id' not in question:
                continue
            answer_data = self.__get_accepted_answer(tag, site, question['accepted_answer_id'])
            if not answer_data:
                # The answer could not be retrieved; skip the question rather than
                # store a record without an answer.
                continue
            # Values are stored as strings (tags become the repr of a list).
            record = {f"q_{field}": f"{question[field]}" for field in self._QUESTION_FIELDS}
            record.update(answer_data)
            records.append(record)

        # A custom filter may omit `has_more`; in that case keep paging for as long
        # as pages still contain items.
        has_more = bool(data.get('has_more', bool(items)))
        return records, has_more

    # Function to get the accepted answer for a specific question
    def __get_accepted_answer(self, tag, site, answer_id):
        """Return score, creation date and plain-text body of answer `answer_id`.

        Returns None when the request fails or the response contains no items.
        """
        url = f"https://api.stackexchange.com/2.3/answers/{answer_id}"
        params = {
            "tagged": tag,
            "site": site,
            "filter": "!9_bDE(fI5",  # Custom filter to include answer body and is_accepted field
        }

        data = self.__request_json(url, params)
        if not data or not data.get('items'):
            return None

        answer = data['items'][0]
        return {
            'a_score': answer['score'],
            'a_creation_date': answer['creation_date'],
            'a_answer': self.__get_text_hyperlink(answer['body']),
        }

    def __get_text_hyperlink(self, html_body):
        """Convert an HTML answer body to plain text, rewriting links as "text (url)".

        Images, and links that wrap an image, are removed entirely.
        """
        soup = BeautifulSoup(html_body, 'html.parser')

        # Remove image links first: once an anchor has been replaced by plain text it
        # can no longer be recognised as wrapping an image.
        for link in soup.find_all('a'):
            if link.find('img'):
                link.decompose()

        for img in soup.find_all('img'):
            img.decompose()

        # Replace the remaining anchors with "text (link)"
        for a in soup.find_all('a', href=True):
            a.replace_with(f"{a.get_text()} ({a['href']})")

        return soup.get_text()

    def fetch_specified_results(self, tag, site, desired_count, pagesize=10):
        """Collect up to `desired_count` unique question/answer records into `qna_df`.

        Parameters
        ----------
        tag, site : passed to the API as the `tagged` and `site` parameters.
        desired_count : number of records wanted; lowered to the number of tagged
            questions when it exceeds that total.
        pagesize : questions requested per API call.

        Paging stops early when the API has no more pages or a request fails, so
        `qna_df` may hold fewer rows than requested. Returns None.
        """
        total_count = self.__get_total_question(tag, site)
        if desired_count > total_count:
            print(f"Warning: Desired count ({desired_count}) exceeds available questions ({total_count}). Adjusting to {total_count}.")
            desired_count = total_count

        # Keyed by question id so a question seen on two pages is stored once.
        unique_records = {}
        page = 1
        pbar = tqdm(total=desired_count, desc="Scrapping Data")
        try:
            while len(unique_records) < desired_count:
                qna_pairs, has_more = self.__get_question_answer(tag, site, page, pagesize)

                already_counted = min(len(unique_records), desired_count)
                for record in qna_pairs:
                    unique_records[record['q_question_id']] = record
                # Advance by newly added unique records only, never past the target.
                pbar.update(min(len(unique_records), desired_count) - already_counted)

                if not has_more:
                    break
                page += 1  # Move to the next page
            pbar.set_postfix({"status": "Completed"})
        finally:
            pbar.close()

        # Keep at most the desired number of results
        self.qna_df = pd.DataFrame(list(unique_records.values())[:desired_count])
    
    

    

In [99]:
# OAuth access token: read from the environment when set, so the real value never has
# to be written into the notebook; otherwise replace the placeholder by hand.
access_token = os.environ.get("STACKEXCHANGE_ACCESS_TOKEN", "acces_token")

# Application key registered on Stack Apps, resolved the same way.
app_key = os.environ.get("STACKAPPS_KEY", "app_key")

### Create class instance

In [100]:
# Scraper bound to the credentials defined in the previous cell.
qna = StackExchangeScrapper(access_token=access_token, app_key=app_key)

### Retrieve 500 QnA

In [103]:
# Fills qna.qna_df; the call itself returns nothing.
qna.fetch_specified_results(
    tag='neural_networks',
    site='stats',
    desired_count=500,
)

Scrapping Data:   0%|          | 1/500 [00:00<?, ?it/s]

### Check for duplicate questions

In [104]:
# Number of distinct question ids; equal to the row count when nothing is duplicated.
qna.qna_df.loc[:, 'q_question_id'].nunique()

500

#### There are no duplicate rows.

### Pre-process the retrieved data with the Google Gemini LLM

In [105]:
pip install -q -U google-generativeai

Note: you may need to restart the kernel to use updated packages.


In [107]:
import google.generativeai as genai
import os

# The API key comes from the GOOGLE_API_KEY environment variable; a missing variable
# raises KeyError here rather than failing later on the first model call.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

### Build prompt to generate summary

In [108]:
def build_prompt(query):
    """Return the summarisation prompt with `query` embedded as the text to summarise.

    Layout: the instruction sentence, a blank line, the text, and a trailing blank line.
    """
    instruction = (
        "Read the paragraph below and provide a clear, concise summary. "
        "The summary should capture the main ideas in fewer than 350 words."
    )
    return f"{instruction}\n\n{query}\n\n"


### Define model

In [109]:
# Gemini model handle used for every summary request below.
model = genai.GenerativeModel(model_name="models/gemini-1.0-pro")

### Sample summary

In [111]:
# Title of the question in row 20, used as the worked example.
qna.qna_df.loc[20, 'q_title']

'What is global max pooling layer and what is its advantage over maxpooling layer?'

In [112]:
# Plain-text accepted answer for the same row.
qna.qna_df.loc[20, 'a_answer']

'Global max pooling  =  ordinary max pooling layer with pool size equals to the size of the input (minus filter size + 1, to be precise). You can see that MaxPooling1D takes a pool_length argument, whereas GlobalMaxPooling1D does not.\nFor example, if the input of the max pooling layer  is $0,1,2,2,5,1,2$, global max pooling outputs $5$, whereas  ordinary max pooling layer with pool size equals to 3 outputs $2,2,5,5,5$ (assuming stride=1).\nThis can be seen in the code (https://github.com/fchollet/keras/blob/3d176e926f848c5aacd036d6095ab015a2f8cc83/keras/layers/pooling.py#L433):\nclass GlobalMaxPooling1D(_GlobalPooling1D):\n    """Global max pooling operation for temporal data.\n    # Input shape\n        3D tensor with shape: `(samples, steps, features)`.\n    # Output shape\n        2D tensor with shape: `(samples, features)`.\n    """\n\n    def call(self, x, mask=None):\n        return K.max(x, axis=1)\n\nIn some domains, such as natural language processing, it is common to use glo

In [113]:
# Summarise the example answer and print the model's text.
print(
    model.generate_content(
        build_prompt(qna.qna_df.loc[20, 'a_answer'])
    ).text
)

**Summary:**

Global max pooling is a variant of max pooling where the pooling operation is applied globally to the entire input, rather than locally to a subset of elements.

In contrast to ordinary max pooling, which uses a fixed window size to determine the maximum value, global max pooling extracts the maximum value from the entire input. This results in a significant reduction in dimensionality, as the output of global max pooling is a vector of the same size as the number of features in the input.

Global max pooling is commonly used in natural language processing, where it can effectively capture the most salient features from a sequence of words. In computer vision, however, non-global max pooling is typically preferred for tasks such as image classification, as it provides more localized information.

The implementation of global max pooling in Keras is straightforward, involving a single line of code that applies the maximum operation along a specific axis of the input tensor

### Get summary for answers

In [116]:
# Throttling settings: pause after every batch of requests.
SUMMARIES_PER_BATCH = 15
PAUSE_SECONDS = 60

answers = qna.qna_df['a_answer']
llm_answer_summary = []
# tqdm wraps the Series itself (not the enumerate object) so the bar knows the total.
for idx, answer in enumerate(tqdm(answers, desc="Summarising answers"), start=1):
    response = model.generate_content(build_prompt(answer))
    try:
        llm_answer_summary.append(response.text)
    except ValueError:
        # `.text` raises ValueError when the response carries no usable text (for
        # example a blocked response). Store None so the list stays aligned with the
        # DataFrame rows and the summaries collected so far are not lost.
        print(f"No summary returned for row {idx - 1}; storing None.")
        llm_answer_summary.append(None)

    remaining = len(answers) - idx
    # Pause after each full batch, but not after the very last answer.
    if idx % SUMMARIES_PER_BATCH == 0 and remaining:
        print(f"Pausing for 60 seconds...|Remaining: {remaining}")
        time.sleep(PAUSE_SECONDS)


0it [00:00, ?it/s]

Pausing for 60 seconds...|Remaining: 485
Pausing for 60 seconds...|Remaining: 470
Pausing for 60 seconds...|Remaining: 455
Pausing for 60 seconds...|Remaining: 440
Pausing for 60 seconds...|Remaining: 425
Pausing for 60 seconds...|Remaining: 410
Pausing for 60 seconds...|Remaining: 395
Pausing for 60 seconds...|Remaining: 380
Pausing for 60 seconds...|Remaining: 365
Pausing for 60 seconds...|Remaining: 350
Pausing for 60 seconds...|Remaining: 335
Pausing for 60 seconds...|Remaining: 320
Pausing for 60 seconds...|Remaining: 305
Pausing for 60 seconds...|Remaining: 290
Pausing for 60 seconds...|Remaining: 275
Pausing for 60 seconds...|Remaining: 260
Pausing for 60 seconds...|Remaining: 245
Pausing for 60 seconds...|Remaining: 230
Pausing for 60 seconds...|Remaining: 215
Pausing for 60 seconds...|Remaining: 200
Pausing for 60 seconds...|Remaining: 185
Pausing for 60 seconds...|Remaining: 170
Pausing for 60 seconds...|Remaining: 155
Pausing for 60 seconds...|Remaining: 140
Pausing for 60 s

In [117]:
# Attach the generated summaries as a new column, one per row.
qna.qna_df = qna.qna_df.assign(llm_answer_summary=llm_answer_summary)

### Actual answer

In [119]:
# Question title of the final row (index label 499).
qna.qna_df.tail(1).loc[499, 'q_title']

'On what tasks does neuroevolution outperform basic application of neural networks or genetic algorithms?'

In [120]:
# Original accepted answer of the final row.
qna.qna_df.tail(1).loc[499, 'a_answer']

"This has been researched for 20 years or so, and there are many papers claiming to outperform backpropagation. Xin Yao did a lot of work on this in the 1990s, and Kenneth Stanley created one of the currently most active frameworks, NEAT (NeuroEvolution of Augmenting Topologies (see http://www.cs.ucf.edu/~kstanley/neat.html (http://www.cs.ucf.edu/~kstanley/neat.html) and http://tech.groups.yahoo.com/group/neat/ (http://tech.groups.yahoo.com/group/neat/)).\nThere's a lot of published material on different neuroevolutionary techniques, but these references may be useful in getting a feel for progress over the years:\n\nAzzini, A., Tettamanzi, A. (2008) 'Evolving Neural Networks for\nStatic Single-Position Automated Trading', Journal of Artiﬁcial\nEvolution and Applications, Volume 2008, Article ID 184286\nHintz, K.J., Spofford, J.J. (1990) 'Evolving a Neural Network',\nProceedings, 5th IEEE International Symposium on Intelligent\nControl, pp. 479-484\nMiller, G.F., Todd, P.M., Hedge, S.U

### LLM summary

In [121]:
# Generated summary of the final row, for comparison with the answer above.
qna.qna_df.tail(1).loc[499, 'llm_answer_summary']

"**Summary:**\n\nNeuroevolution is a field of artificial intelligence that employs evolutionary algorithms to train neural networks. It has been studied for over two decades, with various techniques developed to surpass the popular backpropagation method.\n\nXin Yao and Kenneth Stanley have made significant contributions to the field in the 1990s, with Stanley's NEAT (NeuroEvolution of Augmenting Topologies) framework being a prominent tool today.\n\nExtensive research has been conducted on neuroevolution techniques, with published material available to trace its progress. Some key references include:\n\n* Azzini and Tettamanzi (2008): Evolving neural networks for automated trading\n* Hintz and Spofford (1990): Evolving a neural network\n* Miller et al. (1989): Using genetic algorithms to design neural networks\n* Montana (1995): Genetic algorithms for neural network weight selection\n* Yao (1993): Evolutionary artificial neural networks"

### Combine tags into a single pipe-separated (`|`) string

In [122]:
# Tags are stored as the string form of a list; parse the first row back into a list.
ast.literal_eval(qna.qna_df.loc[0, 'q_tags'])

['model-selection', 'neural-networks']

In [123]:
def join_tags(raw_tags):
    """Turn a stringified tag list such as "['a', 'b']" into "a|b".

    A value that does not parse as a list or tuple (for instance one that was
    already joined by an earlier run of this cell) is returned unchanged, so the
    cell can be re-run safely.
    """
    parsed = raw_tags
    if isinstance(raw_tags, str):
        try:
            parsed = ast.literal_eval(raw_tags)
        except (ValueError, SyntaxError):
            return raw_tags
    if isinstance(parsed, (list, tuple)):
        return "|".join(str(tag).strip() for tag in parsed)
    return raw_tags


qna.qna_df['q_tags'] = qna.qna_df['q_tags'].apply(join_tags)

### Rename columns

In [124]:
# Final column names: question title, joined tags, and the LLM summary as the answer.
qna.qna_df = qna.qna_df.rename(
    {
        'q_title': 'question',
        'q_tags': 'tags',
        'llm_answer_summary': 'answer',
    },
    axis='columns',
)

In [125]:
# Preview the first row with the renamed columns.
qna.qna_df.iloc[:1]

Unnamed: 0,question,q_link,tags,q_question_id,q_is_answered,q_accepted_answer_id,q_view_count,q_answer_count,q_score,q_last_activity_date,q_creation_date,a_score,a_creation_date,a_answer,answer
0,How to choose the number of hidden layers and ...,https://stats.stackexchange.com/questions/181/...,model-selection|neural-networks,181,True,1097,1145801,10,820,1661947755,1279584902,671,1280715630,"I realize this question has been answered, but...",**Network Configuration in Neural Networks**\n...


In [126]:
# Write the processed table to CSV without the index column.
qna.qna_df.to_csv(
    path_or_buf="Stackoverflow_data(neural_networks_stats)_pre_processed_Gemini_LLM.csv",
    index=False,
)