In [None]:
import os
from pathlib import Path

import pandas as pd
from IPython.display import display

In [None]:
# Locate the raw X_test CSV under the current user's home directory.
input_path = Path.home() / "hi-paris-2025/data/raw/X_test.csv"
file_found = input_path.is_file()

print("Fichier X_test :", input_path)
print("Existe :", file_found)

# Fail fast so that later cells never try to read a missing file.
if not file_found:
    raise FileNotFoundError(f"Fichier introuvable : {input_path}")

# Report the on-disk size, converting bytes to units of 1024**3 bytes.
size_bytes = input_path.stat().st_size
size_gb = size_bytes / (1024**3)
print(f"Taille de X_test.csv : {size_gb:.2f} Go")

In [None]:
# Stream the CSV in fixed-size chunks so the whole file never has to be
# held in memory at once, accumulating per-column missing-value counts.
chunksize = 10_000

null_counts = None  # running per-column null totals; a Series after the first chunk
total_rows = 0      # running number of data rows read so far

for chunk_number, chunk in enumerate(pd.read_csv(input_path, chunksize=chunksize), start=1):
    rows_in_chunk = len(chunk)
    total_rows += rows_in_chunk

    # Per-column count of missing values in this chunk only.
    chunk_nulls = chunk.isna().sum()

    # Seed the accumulator with the first chunk, then add label-aligned counts;
    # fill_value=0 makes a label missing on one side contribute zero.
    null_counts = (
        chunk_nulls
        if null_counts is None
        else null_counts.add(chunk_nulls, fill_value=0)
    )

    print(f"Chunk {chunk_number} traité, lignes dans ce chunk : {rows_in_chunk}")

print("Analyse terminée.")
print("Nombre total de lignes lues :", total_rows)


In [None]:
# Build a per-column summary of missing values: one row per CSV column with
# its null count and the percentage of rows in which it is missing.
# `null_counts` and `total_rows` come from the chunked-read cell above;
# `display` is imported in the notebook's import cell.
# The table is built in a single non-mutating chain (no inplace=True), so
# re-running this cell always yields the same `results`.
results = (
    null_counts.to_frame(name="null_count")
    .reset_index()
    # reset_index() labels the former (unnamed) index "index"; expose it as "column".
    .rename(columns={"index": "column"})
    # Share of rows, in percent, where the column is missing.
    .assign(null_percentage=lambda frame: frame["null_count"] / total_rows * 100)
)

# Columns with the most missing values first.
results_sorted = results.sort_values(by="null_count", ascending=False)

print("Top 10 des colonnes avec le plus de valeurs nulles :")
display(results_sorted.head(10))


In [None]:
# Persist the sorted null statistics as a CSV under the processed-data folder.
output_stats_path = Path.home() / "hi-paris-2025/data/processed/X_test_null_stats.csv"

# Create the destination folder (and any missing parents) before writing.
stats_dir = output_stats_path.parent
stats_dir.mkdir(parents=True, exist_ok=True)

# index=False: the DataFrame index is not written to the file.
results_sorted.to_csv(output_stats_path, index=False)

# sep="\n" puts the message and the path on separate lines.
print("Statistiques de valeurs nulles sauvegardées dans :", output_stats_path, sep="\n")

In [None]:
# Inspect the column names of X_train_clean.csv without loading any data rows.
csv_path = Path.home() / "hi-paris-2025/data/processed/X_train_clean.csv"

# nrows=0 asks pandas for zero data rows, so only the header is materialised.
df_header = pd.read_csv(csv_path, nrows=0)
column_names = list(df_header.columns)

print("Nombre de colonnes :", len(column_names))
print("Noms des colonnes :")
print(column_names)


In [None]:
# Count the data rows of X_train_clean.csv by streaming it chunk by chunk,
# so the full file never has to be loaded at once.
csv_path = Path.home() / "hi-paris-2025/data/processed/X_train_clean.csv"
chunk_size = 100_000  # rows parsed per chunk

chunk_reader = pd.read_csv(csv_path, chunksize=chunk_size)
row_count = sum(len(chunk) for chunk in chunk_reader)

print(f"Nombre total de lignes : {row_count}")
