# Scrapping depuis :
- https://datahelpdesk.worldbank.org/knowledgebase/articles/898590-country-api-queries
-

In [None]:
import requests
import pandas as pd
import math

# Mapping des noms de pays à leurs codes ISO utilisés par l'API de la Banque mondiale
country_codes = {
    'Cuba': 'CUB',
    'Somaliland region': 'SOM',  # Note: Somaliland n'a peut-être pas de code ISO officiel
    'North Cyprus': 'CYP',  # Note: North Cyprus n'a pas de code ISO officiel
    'Somalia': 'SOM',
    'South Sudan': 'SSD',
    'Yemen': 'YEM',
    'Taiwan Province of China': 'TWN',  # Note: Utilisez TWN pour Taiwan
    'Venezuela': 'VEN',
    'Palestinian Territories': 'PSE',
    'Iran': 'IRN',
    'Hong Kong S.A.R. of China': 'HKG',
    'Kosovo': 'XKX',  # Note: Kosovo utilise XKX comme code ISO
    'Cyprus': 'CYP',
    'Malta': 'MLT'
}

# Fonction pour récupérer le PIB par habitant et calculer son logarithme
def get_log_gdp(country_name, year):
    if country_name not in country_codes:
        return None

    country_code = country_codes[country_name]
    url = f'http://api.worldbank.org/v2/country/{country_code}/indicator/NY.GDP.PCAP.CD?date={year}&format=json'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data and len(data) > 1 and data[1]:
            gdp_value = data[1][0].get('value', None)
            if gdp_value:
                return math.log(gdp_value)
    return None

# DataFrame avec les données initiales
data = {
    'Country name': [
        'Cuba', 'Somaliland region', 'Somaliland region', 'Somaliland region', 'North Cyprus',
        'Somaliland region', 'North Cyprus', 'North Cyprus', 'Somalia', 'South Sudan',
        'South Sudan', 'Somalia', 'North Cyprus', 'Somalia', 'South Sudan', 'North Cyprus',
        'South Sudan', 'Yemen', 'Taiwan Province of China', 'Venezuela', 'Palestinian Territories',
        'North Cyprus', 'Iran', 'Venezuela', 'Taiwan Province of China', 'Iran', 'North Cyprus',
        'Yemen', 'Palestinian Territories', 'Venezuela', 'Hong Kong S.A.R. of China', 'Iran',
        'Kosovo', 'Taiwan Province of China', 'Cyprus', 'Malta'
    ],
    'year': [
        2006, 2009, 2010, 2011, 2012, 2012, 2013, 2014, 2014, 2014, 2015, 2015, 2015, 2016, 2016,
        2016, 2017, 2018, 2018, 2018, 2018, 2018, 2018, 2019, 2019, 2019, 2019, 2019, 2019, 2020,
        2020, 2020, 2020, 2020, 2020,2020
    ],
    'Log GDP per capita': [None] * 36  # Ajustement de la longueur à 36
}


# Ajustement des longueurs pour correspondre
if len(data['Country name']) > len(data['year']):
    data['Country name'] = data['Country name'][:len(data['year'])]
elif len(data['year']) > len(data['Country name']):
    data['year'] = data['year'][:len(data['Country name'])]

if len(data['Country name']) > len(data['Log GDP per capita']):
    data['Country name'] = data['Country name'][:len(data['Log GDP per capita'])]
elif len(data['Log GDP per capita']) > len(data['Country name']):
    data['Log GDP per capita'] = data['Log GDP per capita'][:len(data['Country name'])]



# Création du DataFrame
df = pd.DataFrame(data)

# Compléter les données manquantes
for index, row in df.iterrows():
    if pd.isna(row['Log GDP per capita']):
        country = row['Country name']
        year = row['year']
        log_gdp = get_log_gdp(country, year)
        if log_gdp is not None:
            df.at[index, 'Log GDP per capita'] = log_gdp

print(df["Log GDP per capita"], ["Country name"],)


0      8.374909
1          None
2          None
3          None
4     10.271969
5          None
6     10.230181
7     10.209586
8      6.196831
9      7.127011
10     6.977074
11     6.229463
12    10.060852
13     6.248232
14         None
15    10.110719
16         None
17     6.553527
18         None
19         None
20      8.17817
21    10.286538
22     8.256024
23         None
24         None
25     8.094608
26     10.28943
27     6.542208
28      8.20436
29         None
30    10.738768
31     7.918053
32      8.36891
33         None
34    10.249961
35     10.29545
Name: Log GDP per capita, dtype: object ['Country name']


In [1]:
import scrapy

class CentreSpider(scrapy.Spider):
    name = "centre"
    start_urls = [
        'https://centrederechercheberbere.fr/accueil.html',
    ]

    def parse(self, response):
        for section in response.xpath('//section'):
            section_title = section.xpath('.//h1/text()').get(default='Unknown Section').strip()
            for item in section.xpath('.//div[@class="item"]'):
                title = item.xpath('.//h2/text()').get(default='').strip()
                date = item.xpath('.//span[@class="date"]/text()').get(default='').strip()
                content = item.xpath('.//p/text()').get(default='').strip()
                yield {
                    'Section': section_title,
                    'Title': title,
                    'Date': date,
                    'Content': content,
                }


ModuleNotFoundError: No module named 'scrapy'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Sélection des variables explicatives et de la variable cible
X = df.drop('Life Ladder', axis=1)
y = df['Life Ladder']

# Préparation des transformations pour les variables numériques et catégorielles
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine les transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)