In [5]:
# import les fichiers dans Google Colab
from google.colab import files
import pandas as pd
import numpy as np
import io

# File names the rest of the notebook reads from disk
# (test_preprocessing.py opens the raw data under RAW_DATA_FILENAME).
RAW_DATA_FILENAME = "nba_data_raw.csv"
PREPROCESSED_FILENAME = "nba_shots_preprocessed_optimise.csv"


def _restore_canonical_name(uploaded_files, canonical_name):
    """Copy a renamed upload back under ``canonical_name``.

    When the target name is already taken, Colab stores the new upload as
    "<stem> (1).<ext>" (visible in this cell's recorded output). Code that
    opens the canonical name would then keep reading the older file, so the
    freshly uploaded bytes are written back under the expected name.

    Args:
        uploaded_files: Mapping of stored file name -> file content (bytes),
            as returned by ``files.upload()``.
        canonical_name: File name the rest of the notebook expects.
    """
    stem, _, extension = canonical_name.rpartition(".")
    for stored_name, content in uploaded_files.items():
        was_renamed = (
            stored_name.startswith(f"{stem} (")
            and stored_name.endswith(f").{extension}")
        )
        if was_renamed:
            with open(canonical_name, "wb") as target:
                target.write(content)
            print(f'Fichier "{stored_name}" copié vers "{canonical_name}"')


print("Télécharger le fichier nba_data_raw.csv:")
uploaded = files.upload()

# Confirm that the file was uploaded
for fn in uploaded.keys():
    print(f'Fichier "{fn}" téléchargé, {len(uploaded[fn])} octets')
_restore_canonical_name(uploaded, RAW_DATA_FILENAME)

# Upload the already-preprocessed file in case it is needed
print("\nTélécharger le fichier nba_shots_preprocessed_optimise.csv (optionnel):")
uploaded2 = files.upload()
_restore_canonical_name(uploaded2, PREPROCESSED_FILENAME)


Télécharger le fichier nba_data_raw.csv:


Saving nba_data_raw.csv to nba_data_raw (1).csv
Fichier "nba_data_raw (1).csv" téléchargé, 178135 octets

Télécharger le fichier nba_shots_preprocessed_optimise.csv (optionnel):


Saving nba_shots_preprocessed_optimise.csv to nba_shots_preprocessed_optimise (1).csv


In [2]:
%%writefile preprocessing.py
import pandas as pd
import numpy as np

def load_raw_data(filepath):
    """Read the raw NBA data CSV into a DataFrame.

    Args:
        filepath: Location of the CSV; passed unchanged to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The parsed file, using pandas' default parsing options.
    """
    raw_shots = pd.read_csv(filepath)
    return raw_shots

def convert_time_to_seconds(df):
    """Add the total remaining time, expressed in seconds.

    Args:
        df: DataFrame with 'Minutes Remaining' and 'Seconds Remaining' columns.

    Returns:
        A new DataFrame (the input is left untouched) with an extra
        'Total_Seconds_Remaining' column equal to minutes * 60 + seconds.
    """
    minutes_as_seconds = df['Minutes Remaining'] * 60
    # assign() builds a new frame, so the caller's DataFrame is not modified.
    return df.assign(Total_Seconds_Remaining=minutes_as_seconds + df['Seconds Remaining'])

def convert_distances_to_meters(df):
    """Add the shot distance converted from feet to meters.

    Args:
        df: DataFrame with a 'Shot Distance' column (in feet, per the conversion used).

    Returns:
        A new DataFrame (the input is left untouched) with an extra
        'Shot_Distance_Meters' column equal to 'Shot Distance' * 0.3048.
    """
    meters_per_foot = 0.3048  # 1 foot = 0.3048 meter
    # assign() builds a new frame, so the caller's DataFrame is not modified.
    return df.assign(Shot_Distance_Meters=df['Shot Distance'] * meters_per_foot)

def encode_shot_types(df):
    """Encode the shot type label as a number: 3 for 3PT shots, otherwise 2.

    Args:
        df: DataFrame with a 'Shot Type' column of text labels
            (e.g. '2PT Field Goal', '3PT Field Goal').

    Returns:
        A copy of ``df`` with an added 'Shot_Type_Encoded' column. Entries whose
        'Shot Type' is not a string (for example a missing value read as NaN)
        are encoded as NaN instead of raising, so the null check in
        ``validate_processed_data`` can reject them.
    """
    def _points(label):
        # `'3PT' in label` is only defined for strings; a NaN coming from a
        # blank CSV cell would raise TypeError there.
        if not isinstance(label, str):
            return np.nan
        return 3 if '3PT' in label else 2

    df_copy = df.copy()
    df_copy['Shot_Type_Encoded'] = df_copy['Shot Type'].apply(_points)
    return df_copy

def combine_shot_zones(df):
    """Collapse the basic zone and zone area into one numeric zone code.

    Args:
        df: DataFrame with 'Shot Zone Basic' and 'Shot Zone Area' columns.

    Returns:
        A copy of ``df`` with a 'Shot_Zone_Combined' column holding the code
        looked up for "<Shot Zone Basic>_<Shot Zone Area>". Combinations that
        are not listed in the lookup table are encoded as 0.
    """
    # Lookup table keyed by "<Shot Zone Basic>_<Shot Zone Area>".
    codes_by_zone = {
        'Restricted Area_Center(C)': 1,  # Paint
        'In The Paint (Non-RA)_Center(C)': 2,  # In the paint
        'Mid-Range_Left Side(L)': 3,  # Mid-range
        'Mid-Range_Right Side(R)': 4,  # Mid-range
        'Mid-Range_Center(C)': 4,  # Mid-range
        'Mid-Range_Left Side Center(LC)': 3,  # Mid-range
        'Mid-Range_Right Side Center(RC)': 4,  # Mid-range
        'Above the Break 3_Center(C)': 5,  # Three
        'Above the Break 3_Left Side Center(LC)': 5,  # Three
        'Above the Break 3_Right Side Center(RC)': 5,  # Three
        'Left Corner 3_Left Side(L)': 6,  # Corner three
        'Right Corner 3_Right Side(R)': 6,  # Corner three
        'Backcourt_Back Court(BC)': 7    # Backcourt
    }

    # Build the lookup key first, then translate it; unknown keys become 0.
    zone_keys = df['Shot Zone Basic'] + '_' + df['Shot Zone Area']
    zone_codes = zone_keys.map(codes_by_zone).fillna(0)

    result = df.copy()
    result['Shot_Zone_Combined'] = zone_codes
    return result

def extract_coordinates(df):
    """Pass-through step for the X/Y coordinates.

    The coordinates are already present in the data, so nothing is computed.

    Args:
        df: The DataFrame being processed.

    Returns:
        The very same DataFrame object that was passed in (no copy is made).
    """
    unchanged = df
    return unchanged

def validate_processed_data(df):
    """Check that a preprocessed DataFrame is usable.

    The frame is valid when every required column is present and none of
    those columns contains a missing value. Note that 'Player ID' is not
    among the checked columns.

    Args:
        df: The preprocessed DataFrame to check.

    Returns:
        bool: True when the frame is valid, False otherwise.
    """
    required = [
        'X Location', 'Y Location', 'Total_Seconds_Remaining',
        'Shot_Type_Encoded', 'Shot_Distance_Meters', 'Shot_Zone_Combined',
        'Shot Made Flag'
    ]

    # A single absent column is enough to reject the frame.
    if any(name not in df.columns for name in required):
        return False

    # Reject the frame when any required column holds a missing value.
    has_missing_values = df[required].isnull().any().any()
    return not has_missing_values

def preprocess_data(raw_filepath, save_filepath=None):
    """Run the complete NBA data preprocessing pipeline.

    Loads the raw CSV, applies each transformation step in order, keeps only
    the columns needed downstream, validates the result and optionally saves it.

    Args:
        raw_filepath: Path of the raw CSV file to load.
        save_filepath: Optional path; when truthy, the processed frame is
            written there as CSV without the index.

    Returns:
        pandas.DataFrame: The processed data restricted to the output columns.

    Raises:
        ValueError: If the processed frame fails ``validate_processed_data``.
    """
    output_columns = [
        'Player ID', 'X Location', 'Y Location', 'Total_Seconds_Remaining',
        'Shot_Type_Encoded', 'Shot_Distance_Meters', 'Shot_Zone_Combined',
        'Shot Made Flag'
    ]
    # Transformation steps, applied in this order.
    steps = (
        convert_time_to_seconds,
        convert_distances_to_meters,
        encode_shot_types,
        combine_shot_zones,
        extract_coordinates,
    )

    shots = load_raw_data(raw_filepath)
    for step in steps:
        shots = step(shots)

    # Keep only the columns needed downstream.
    processed_df = shots[output_columns]

    if not validate_processed_data(processed_df):
        raise ValueError("Les données prétraitées sont invalides")

    # Persist the result only when a destination was given.
    if save_filepath:
        processed_df.to_csv(save_filepath, index=False)

    return processed_df


Writing preprocessing.py


In [3]:
%%writefile test_preprocessing.py
import pytest
import pandas as pd
import numpy as np
from preprocessing import (
    load_raw_data,
    convert_time_to_seconds,
    convert_distances_to_meters,
    encode_shot_types,
    combine_shot_zones,
    extract_coordinates,
    validate_processed_data,
    preprocess_data
)

# Paths to the data files.
# RAW_DATA_PATH is read by the tests that load the raw CSV; PROCESSED_DATA_PATH
# is not referenced by any test in this file.
RAW_DATA_PATH = 'nba_data_raw.csv'
PROCESSED_DATA_PATH = 'nba_shots_preprocessed_optimise.csv'

def test_load_raw_data():
    """The raw CSV loads as a non-empty DataFrame with the expected shot columns."""
    raw_shots = load_raw_data(RAW_DATA_PATH)

    assert isinstance(raw_shots, pd.DataFrame)
    assert len(raw_shots) >= 1
    for expected_column in ('Shot Type', 'Shot Distance'):
        assert expected_column in raw_shots.columns

def test_convert_time_to_seconds():
    """Minutes and seconds are combined into a total number of seconds."""
    clock = pd.DataFrame({
        'Minutes Remaining': [1, 2, 0],
        'Seconds Remaining': [30, 15, 45],
    })

    converted = convert_time_to_seconds(clock)

    assert 'Total_Seconds_Remaining' in converted.columns
    # 1:30 -> 90 s, 2:15 -> 135 s, 0:45 -> 45 s
    expected_seconds = [90, 135, 45]
    assert converted['Total_Seconds_Remaining'].tolist() == expected_seconds

def test_convert_distances_to_meters():
    """Distances are multiplied by 0.3048 to obtain meters."""
    distances_in_feet = [10, 20, 30]
    shots = pd.DataFrame({'Shot Distance': distances_in_feet})

    converted = convert_distances_to_meters(shots)

    assert 'Shot_Distance_Meters' in converted.columns
    expected_meters = [feet * 0.3048 for feet in distances_in_feet]
    np.testing.assert_almost_equal(converted['Shot_Distance_Meters'].tolist(), expected_meters)

def test_encode_shot_types():
    """2PT labels are encoded as 2 and 3PT labels as 3."""
    labels = ['2PT Field Goal', '3PT Field Goal', '2PT Field Goal']
    shots = pd.DataFrame({'Shot Type': labels})

    encoded = encode_shot_types(shots)

    assert 'Shot_Type_Encoded' in encoded.columns
    expected_codes = [2, 3, 2]
    assert encoded['Shot_Type_Encoded'].tolist() == expected_codes

def test_combine_shot_zones():
    """Zone/area pairs are translated to their combined zone code."""
    zones = pd.DataFrame({
        'Shot Zone Basic': ['Restricted Area', 'Mid-Range', 'Above the Break 3'],
        'Shot Zone Area': ['Center(C)', 'Left Side(L)', 'Right Side Center(RC)'],
    })

    combined = combine_shot_zones(zones)

    assert 'Shot_Zone_Combined' in combined.columns
    # Row 0: Restricted Area / Center(C) -> 1
    # Row 1: Mid-Range / Left Side(L) -> 3
    # Row 2: Above the Break 3 / Right Side Center(RC) -> 5
    expected_codes = [1, 3, 5]
    for row, expected_code in enumerate(expected_codes):
        assert combined['Shot_Zone_Combined'].iloc[row] == expected_code

def test_extract_coordinates():
    """The coordinate columns come back present and with unchanged values."""
    x_values = [100, -100, 0]
    y_values = [200, -200, 0]
    shots = pd.DataFrame({'X Location': x_values, 'Y Location': y_values})

    extracted = extract_coordinates(shots)

    assert 'X Location' in extracted.columns
    assert 'Y Location' in extracted.columns
    assert extracted['X Location'].tolist() == [100, -100, 0]
    assert extracted['Y Location'].tolist() == [200, -200, 0]

def test_validate_processed_data():
    """Validation accepts a complete frame and rejects incomplete ones.

    Covers both rejection paths of ``validate_processed_data``: a required
    column that is absent, and a required column containing a missing value.
    """
    valid_data = pd.DataFrame({
        'Player ID': [123, 456, 789],
        'X Location': [100, -100, 0],
        'Y Location': [200, -200, 0],
        'Total_Seconds_Remaining': [90, 135, 45],
        'Shot_Type_Encoded': [2, 3, 2],
        'Shot_Distance_Meters': [3.048, 6.096, 9.144],
        'Shot_Zone_Combined': [1, 3, 5],
        'Shot Made Flag': [1, 0, 1]
    })

    # Rejection path 1: a required column is missing entirely.
    missing_column_data = valid_data.drop(columns=['Shot_Type_Encoded'])

    # Rejection path 2: all columns exist but one required value is NaN.
    missing_value_data = valid_data.copy()
    missing_value_data.loc[1, 'Shot_Distance_Meters'] = np.nan

    assert validate_processed_data(valid_data)
    assert not validate_processed_data(missing_column_data)
    assert not validate_processed_data(missing_value_data)

def test_full_preprocessing_pipeline():
    """The full pipeline yields a valid, non-empty frame with every output column."""
    expected_columns = [
        'Player ID', 'X Location', 'Y Location', 'Total_Seconds_Remaining',
        'Shot_Type_Encoded', 'Shot_Distance_Meters', 'Shot_Zone_Combined',
        'Shot Made Flag'
    ]

    pipeline_output = preprocess_data(RAW_DATA_PATH)

    for column_name in expected_columns:
        assert column_name in pipeline_output.columns

    assert validate_processed_data(pipeline_output) == True
    assert len(pipeline_output) > 0


Writing test_preprocessing.py


In [4]:
# Installation de pytest et pytest-cov (si nécessaire)
!pip install -q pytest pytest-cov

# Exécution des tests avec la couverture de code
!python -m pytest test_preprocessing.py -v --cov=preprocessing --cov-report=term


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.0/244.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
platform linux -- Python 3.11.12, pytest-8.3.5, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content
plugins: cov-6.1.1, anyio-4.9.0, typeguard-4.4.2, langsmith-0.3.34
collected 8 items                                                              [0m

test_preprocessing.py::test_load_raw_data [32mPASSED[0m[32m                         [ 12%][0m
test_preprocessing.py::test_convert_time_to_seconds [32mPASSED[0m[32m               [ 25%][0m
test_preprocessing.py::test_convert_distances_to_meters [32mPASSED[0m[32m           [ 37%][0m
test_preprocessing.py::test_encode_shot_types [32mPASSED[0m[32m                     [ 50%][0m
test_preprocessing.py::test_combine_shot_zones [32mPASSED[0m[32m                    [ 62%][0m
te