In [6]:
from stackapi import StackAPI
import pandas as pd
from datetime import datetime
import re

def initialize_api(site_name='stackoverflow'):
    """
    Initialise, retourne une instance de StackAPI pour le site donné.
    """
    return StackAPI(site_name)

def fetch_questions(api, **params):
    """
    Questions à partir de l'API Stack Exchange en utilisant les paramètres fournis.
    """
    return api.fetch('questions', **params)

def extract_question_data(questions):
    """
    Informations principales des questions et retourne une liste de dictionnaires.
    """
    data = []
    for item in questions['items']:
        question_data = {
            'date': datetime.fromtimestamp(item['creation_date']),
            'title': item['title'],
            'tags': ','.join(item['tags']),
            'score': item['score'],
            'body': item.get('body')
        }
        data.append(question_data)
    return data




In [7]:
def create_dataframe(data):
    # dataframe
    return pd.DataFrame(data)



In [8]:
def clean_text(text):
    if isinstance(text, str):
        # Supprime les balises HTML et leur contenu
        text = re.sub(r'<[^>]*>', '', text)  
        
        # Supprime les caractères spéciaux qui ne sont pas des lettres ou des espaces
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
    return text



In [10]:
def main():
    # Initialise l'API pour Stack Overflow
    SITE = initialize_api()

    # Paramètres de la requête
    params = {
        'tagged': 'python',
        'sort': 'votes',
        'min': 50,
        'pagesize': 50,
        'fromdate': int(datetime(2023, 1, 1).timestamp()),
        'todate': int(datetime(2023, 12, 31).timestamp())
    }

    # Récupére les questions
    questions = fetch_questions(SITE, **params)

    # Extrait les informations nécessaires
    data = extract_question_data(questions)

    # DataFrame
    df = create_dataframe(data)
    print(df.head())


    df['title'] = df['title'].apply(clean_text)
    df['body'] = df['body'].apply(clean_text)

    print(df)

    # premières lignes du DataFrame
    print(df.head())

if __name__ == "__main__":
    main()

                 date                                              title  \
0 2023-03-01 20:52:19  How do I solve &quot;error: externally-managed...   
1 2023-04-07 09:05:59  Error &quot;&#39;DataFrame&#39; object has no ...   
2 2023-10-02 05:02:00  Why did Flask start failing with &quot;ImportE...   
3 2023-10-26 08:22:24  AttributeError: module &#39;pkgutil&#39; has n...   
4 2023-03-31 13:58:04  OpenAI API error 429: &quot;You exceeded your ...   

                                              tags  score  body  
0                 python,error-handling,pip,debian    634  None  
1           python,pandas,dataframe,attributeerror    295  None  
2                     python,flask,pytest,werkzeug    186  None  
3                      python,python-3.x,numpy,pip    172  None  
4  python,prompt,openai-api,completion,chatgpt-api    159  None  
                  date                                              title  \
0  2023-03-01 20:52:19  How do I solve quoterror externallymanagedenvi