In [None]:
import os
import glob
from pathlib import Path
import tempfile

# Set of common data file extensions to look for (compared case-insensitively)
DATA_EXTENSIONS = {'.csv', '.json', '.xlsx', '.xls', '.parquet', '.txt', '.tsv',
                   '.xml', '.yaml', '.yml', '.h5', '.hdf5', '.pkl', '.ndjson'}


def list_data_files(directory='.', recursive=False, include_hidden=False):
    """Return a sorted list of data files in `directory`.

    A file counts as a data file when its (lower-cased) suffix is in
    DATA_EXTENSIONS.

    Parameters:
    - directory: path-like (str or Path). Defaults to current directory.
    - recursive: if True, search subdirectories recursively.
    - include_hidden: if False (default), skip files whose name, or any
      directory *below* `directory` on the way to them, starts with a dot.
      The components of `directory` itself are never inspected, so searching
      inside e.g. ``~/.cache/project`` still returns its visible files.

    Returns: list of string paths (absolute).

    Raises:
    - FileNotFoundError: `directory` does not exist.
    - NotADirectoryError: `directory` exists but is not a directory.
    """
    root = Path(directory)
    if not root.exists():
        raise FileNotFoundError(f"Directory not found: {directory}")
    if not root.is_dir():
        # iterdir() would raise this anyway; make recursive mode consistent
        raise NotADirectoryError(f"Not a directory: {directory}")

    # Both modes share the same filtering; only the walker differs
    candidates = root.rglob('*') if recursive else root.iterdir()

    files = []
    for path in candidates:
        if not path.is_file():
            continue
        if path.suffix.lower() not in DATA_EXTENSIONS:
            continue
        if not include_hidden:
            # Judge hiddenness relative to the search root only; looking at the
            # full path would discard everything when the root itself lives
            # under a dot-directory.
            relative_parts = path.relative_to(root).parts
            if any(part.startswith('.') for part in relative_parts):
                continue
        files.append(str(path.resolve()))

    return sorted(files)


# When this code runs as the main module (which includes a notebook cell),
# print the data files sitting directly in the working directory.
# Subdirectories and dot-files are not included in this listing.
if __name__ == '__main__':
    cwd = Path('.').resolve()
    print(f"Data files in {cwd} (non-recursive):")
    found = list_data_files(cwd, recursive=False, include_hidden=False)
    if found:
        for f in found:
            print("  ", f)
    else:
        # Say so explicitly rather than printing an empty listing
        print("  (none found)")


# ----------------
# Self-tests: exercise list_data_files inside a throwaway directory so that
# none of the user's own files are read or modified
# ----------------
with tempfile.TemporaryDirectory() as td:
    td_path = Path(td)
    sub = td_path / 'subdir'
    sub.mkdir()

    # Fixture layout: two visible data files, one hidden data file and one
    # non-data file at the top level, plus one data file in a subdirectory
    fixtures = {
        td_path / 'a.csv': 'col1,col2\n1,2',
        td_path / 'b.json': '{"x":1}',
        td_path / '.hidden.txt': 'secret',
        td_path / 'c.jpg': 'not a data file',
        sub / 'd.tsv': '1\t2',
    }
    for target, content in fixtures.items():
        target.write_text(content)

    # Non-recursive: only visible, top-level data files are expected
    nonrec = list_data_files(td_path, recursive=False, include_hidden=False)
    nonrec_names = {Path(item).name for item in nonrec}
    assert 'a.csv' in nonrec_names, "a.csv should be found (non-recursive)"
    assert 'b.json' in nonrec_names, "b.json should be found (non-recursive)"
    assert '.hidden.txt' not in nonrec_names, "hidden file should be excluded"
    assert 'd.tsv' not in nonrec_names, "subdir file should not be found in non-recursive mode"

    # Recursive: the subdirectory file appears, and nothing without a data
    # extension slips through
    rec = list_data_files(td_path, recursive=True, include_hidden=False)
    rec_names = {Path(item).name for item in rec}
    rec_suffixes = {Path(item).suffix.lower() for item in rec}
    assert 'd.tsv' in rec_names, "d.tsv should be found in recursive mode"
    assert rec_suffixes <= DATA_EXTENSIONS, "All returned files must have data extensions"

    # include_hidden=True: the dot-file is reported as well
    inc_hidden = list_data_files(td_path, recursive=False, include_hidden=True)
    inc_hidden_names = {Path(item).name for item in inc_hidden}
    assert '.hidden.txt' in inc_hidden_names, "Hidden file should be included when include_hidden=True"

    print("Self-tests passed.")


Data files in C:\Users\kakao\PycharmProjects\jupyter-llm\src (non-recursive):
   C:\Users\kakao\PycharmProjects\jupyter-llm\src\titanic.xls
Self-tests passed.

In [None]:
import pandas as pd
from pathlib import Path

# Try the expected local filename first, fallback to searching known data files
p = Path('titanic.xls')
if not p.exists():
    # Fall back to a recursive search using the list_data_files helper defined
    # in an earlier cell. Only the failures this best-effort search is expected
    # to hit are tolerated; anything else propagates instead of being hidden.
    search_error = None
    try:
        candidates = [Path(f) for f in list_data_files('.', recursive=True, include_hidden=False)]
    except (NameError, OSError) as exc:
        # NameError: the helper cell has not been run in this kernel.
        # OSError: the directory tree could not be read.
        search_error = exc
        candidates = []
    matches = [c for c in candidates if c.name.lower().startswith('titanic') and c.suffix.lower() in ('.xls', '.xlsx')]
    if not matches:
        # Chain the search failure (if any) so the traceback shows the real cause
        raise FileNotFoundError("titanic.xls not found in the current directory or subdirectories") from search_error
    # list_data_files returns a sorted list, so this pick is deterministic
    p = matches[0]

# Read the Excel file into a DataFrame named df (used by the following cells)
df = pd.read_excel(p)

# Self-tests: the load must yield a non-empty DataFrame
assert isinstance(df, pd.DataFrame), "Loaded object is not a pandas DataFrame"
assert df.shape[0] > 0, "DataFrame appears to be empty"
print(f"Loaded '{p}' into df with shape {df.shape}")


Loaded 'titanic.xls' into df with shape (1309, 14)

In [None]:
import pandas as pd

# Quick look at the loaded frame: first rows, schema, summary statistics and
# per-column missing-value counts. The summary objects are computed once and
# reused by the checks at the bottom.
print("df.head():")
print(df.head())

print("\ndf.info():")
df.info()  # writes its report directly to stdout

desc = df.describe(include='all')
print("\ndf.describe(include='all'):")
print(desc)

nulls = df.isnull().sum()
print("\ndf.isnull().sum():")
print(nulls)

# Self-tests: the frame and both summaries have the expected types
assert isinstance(df, pd.DataFrame), "df is not a pandas DataFrame"
assert df.shape[0] > 0, "DataFrame appears to be empty"
assert isinstance(desc, pd.DataFrame), "describe(...) did not return a DataFrame"
assert isinstance(nulls, pd.Series), "isnull().sum() did not return a Series"
print("\nSelf-tests passed.")

df.head():
   pclass  survived  ...   body                        home.dest
0       1         1  ...    NaN                     St Louis, MO
1       1         1  ...    NaN  Montreal, PQ / Chesterville, ON
2       1         0  ...    NaN  Montreal, PQ / Chesterville, ON
3       1         0  ...  135.0  Montreal, PQ / Chesterville, ON
4       1         0  ...    NaN  Montreal, PQ / Chesterville, ON

[5 rows x 14 columns]

df.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      2

In [None]:
# Data cleaning & preprocessing for Titanic df
import pandas as pd
import numpy as np

# All edits happen on a private copy; `df` is rebound only at the very end
df_clean = df.copy()

# 1) Embarked: replace missing values with the most frequent value
if 'embarked' in df_clean.columns and df_clean['embarked'].isnull().any():
    mode_emb = df_clean['embarked'].mode(dropna=True)
    # 'U' is a placeholder used only if mode() comes back empty
    embarked_fill = 'U' if mode_emb.empty else mode_emb[0]
    df_clean['embarked'] = df_clean['embarked'].fillna(embarked_fill)

# 2) FamilySize = sibsp + parch, plus an IsAlone flag (1 when FamilySize is 0)
if not {'sibsp', 'parch'}.issubset(df_clean.columns):
    raise KeyError("Expected 'sibsp' and 'parch' columns to create FamilySize")
df_clean['FamilySize'] = df_clean['sibsp'] + df_clean['parch']
df_clean['IsAlone'] = df_clean['FamilySize'].eq(0).astype(int)

# 3) Age: fill gaps with the median of the passenger's (sex, pclass) group
if 'age' in df_clean.columns:
    group_median = df_clean.groupby(['sex', 'pclass'])['age'].transform('median')
    df_clean['age'] = df_clean['age'].fillna(group_median)
    # A group whose median is NaN leaves gaps behind; use the overall median
    if df_clean['age'].isnull().any():
        overall_median = df_clean['age'].median()
        df_clean['age'] = df_clean['age'].fillna(overall_median)

# 4) Cabin: CabinKnown flag, deck letter (first character), one-hot deck columns
if 'cabin' not in df_clean.columns:
    # Keep the CabinKnown column present even without cabin data
    df_clean['CabinKnown'] = 0
else:
    df_clean['CabinKnown'] = df_clean['cabin'].notna().astype(int)
    # Missing cabins become 'U' first, then get the readable label 'Unknown'
    df_clean['CabinDeck'] = (
        df_clean['cabin']
        .fillna('U')
        .astype(str)
        .str.strip()
        .str[0]
        .replace({'U': 'Unknown'})
    )
    deck_dummies = pd.get_dummies(df_clean['CabinDeck'], prefix='Deck')
    df_clean = pd.concat([df_clean, deck_dummies], axis=1)

# 5) Sex: encode as sex_male (male = 1, female = 0) and remove the text column
if 'sex' in df_clean.columns:
    sex_codes = df_clean['sex'].astype(str).str.lower().map({'male': 1, 'female': 0})
    # Any value other than 'male'/'female' maps to NaN and is stored as 0
    df_clean['sex_male'] = sex_codes.fillna(0).astype(int)
    df_clean = df_clean.drop(columns=['sex'])

# 6) Embarked: swap the text column for Embarked_* dummy columns
if 'embarked' in df_clean.columns:
    embarked_dummies = pd.get_dummies(df_clean['embarked'], prefix='Embarked')
    df_clean = pd.concat([df_clean.drop(columns=['embarked']), embarked_dummies], axis=1)

# 7) Drop ticket and, if present under any capitalisation, PassengerId.
#    The name column is kept.
cols_to_drop = [c for c in ['ticket'] if c in df_clean.columns]
pid_matches = [c for c in df_clean.columns if c.lower() == 'passengerid']
if pid_matches:
    cols_to_drop.append(pid_matches[0])

if cols_to_drop:
    df_clean = df_clean.drop(columns=cols_to_drop)

# Rebind df so the following cells work on the cleaned frame
df = df_clean

# ----------------
# Self-tests / quick checks on the cleaned frame
# ----------------
# 1) Age column is present and fully populated
assert 'age' in df.columns, "'age' column missing after processing"
missing_ages = df['age'].isnull().sum()
assert missing_ages == 0, f"There are still missing ages: {missing_ages}"

# 2) Embarked was turned into dummy columns, or is not present at all
embarked_cols = [c for c in df.columns if c.startswith('Embarked_')]
assert embarked_cols or 'embarked' not in df.columns, "Embarked not handled correctly"

# 3) sex_male exists and holds nothing but 0 and 1
assert 'sex_male' in df.columns, "sex_male column was not created"
observed_sex_codes = set(df['sex_male'].unique())
assert observed_sex_codes.issubset({0, 1}), "sex_male contains values other than 0/1"

# 4) FamilySize exists and equals sibsp + parch on every row
assert 'FamilySize' in df.columns, "FamilySize column missing"
expected_family_size = df['sibsp'] + df['parch']
assert df['FamilySize'].eq(expected_family_size).all(), "FamilySize does not equal sibsp + parch"

# 5) The ticket column is gone
assert 'ticket' not in df.columns, "ticket column was not dropped"

print('Cleaning complete. Resulting df shape:', df.shape)
print('\ndf.head():')
print(df.head())

# Ten columns with the most remaining nulls, as a final confirmation
remaining_nulls = df.isnull().sum().sort_values(ascending=False)
print('\nRemaining null counts (top 10):')
print(remaining_nulls.head(10))


Cleaning complete. Resulting df shape: (1309, 28)

df.head():
   pclass  survived  ... Embarked_Q  Embarked_S
0       1         1  ...      False        True
1       1         1  ...      False        True
2       1         0  ...      False        True
3       1         0  ...      False        True
4       1         0  ...      False        True

[5 rows x 28 columns]

Remaining null counts (top 10):
body         1188
cabin        1014
boat          823
home.dest     564
fare            1
name            0
survived        0
pclass          0
age             0
parch           0
dtype: int64

In [None]:
# Baseline model: train/test split, RandomForest baseline, evaluation + cross-validation
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

# --- Preconditions / quick checks ---
assert 'survived' in df.columns, "Expected target column 'survived' in df"

# Columns that must never be used as features because they appear to describe
# the outcome itself rather than something known beforehand.
# NOTE(review): `body` is populated for only 121 of the 1309 rows and `boat`
# for fewer than half. In the commonly published data dictionary for this
# dataset, `body` is a body identification number and `boat` a lifeboat id,
# i.e. both are recorded after the event and leak the target. Confirm against
# the dataset documentation before re-admitting either column.
LEAKAGE_COLS = {'body', 'boat'}

# Select numeric and boolean feature columns automatically, excluding the
# target and the leakage columns above. Without this exclusion the numeric
# `body` column would be picked up and inflate every reported metric.
feature_cols = [
    c for c in df.select_dtypes(include=[np.number, 'bool']).columns
    if c != 'survived' and c not in LEAKAGE_COLS
]

# If automatic selection yields too few features (e.g., only target), fall back to a small hand-picked set
if len(feature_cols) < 3:
    fallback = [c for c in ['pclass', 'age', 'fare', 'sex_male', 'FamilySize', 'IsAlone', 'CabinKnown'] if c in df.columns]
    if not fallback:
        raise RuntimeError('No reasonable numeric features found for modeling')
    feature_cols = fallback

# Prepare X and y
X = df[feature_cols].copy()
# convert any boolean columns to int so the model sees 0/1
bool_cols = X.select_dtypes(include='bool').columns.tolist()
if bool_cols:
    X[bool_cols] = X[bool_cols].astype(int)

y = df['survived']

# Train/test split (stratified by target so both parts keep the class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Build preprocessing + classifier pipeline. Imputation lives inside the
# pipeline so its statistics are learned from training rows only, both here
# and within each cross-validation fold.
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

clf = RandomForestClassifier(random_state=42, n_estimators=100)

pipe = Pipeline([
    ('preproc', preprocessor),
    ('clf', clf)
])

# Fit pipeline on training data
pipe.fit(X_train, y_train)

# Predictions on test set
y_pred = pipe.predict(X_test)

# Evaluation metrics on the test set (zero_division=0 avoids warnings when a
# class is never predicted)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
report = classification_report(y_test, y_pred, zero_division=0)

print(f"Baseline RandomForest (test set) — accuracy: {acc:.4f}, precision: {prec:.4f}, recall: {rec:.4f}")
print("Confusion matrix:\n", cm)
print("\nClassification report:\n", report)

# Cross-validation (5-fold stratified) to estimate baseline stability
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    pipe, X, y, cv=cv,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    return_train_score=False
)

# Summarize CV results
for metric in ['test_accuracy', 'test_precision', 'test_recall', 'test_f1']:
    vals = cv_results[metric]
    print(f"CV {metric}: mean={np.nanmean(vals):.4f}, std={np.nanstd(vals):.4f}, scores={vals}")

# ----------------
# Self-tests: cheap sanity checks on the split, preprocessing and metrics
# ----------------
# 1) The training split has rows, and features/labels line up
n_train_rows = X_train.shape[0]
assert n_train_rows > 0, "X_train is empty"
assert y_train.shape[0] == n_train_rows, "Mismatch between X_train and y_train sizes"

# 2) Imputing + scaling the training data leaves no NaN or infinite values
X_train_transformed = preprocessor.fit_transform(X_train)
assert np.isfinite(X_train_transformed).all(), "Preprocessed X_train contains non-finite values"

# 3) There is exactly one prediction per test row
assert len(y_pred) == len(X_test), "Prediction length does not match X_test"

# 4) Each headline metric lies in the closed interval [0, 1]
for metric_label, metric_value in (('Accuracy', acc), ('Precision', prec), ('Recall', rec)):
    assert 0.0 <= metric_value <= 1.0, f"{metric_label} out of [0,1]"

print('\nSelf-tests passed. Baseline training & evaluation complete.')


Baseline RandomForest (test set) — accuracy: 0.7977, precision: 0.7282, recall: 0.7500
Confusion matrix:
 [[134  28]
 [ 25  75]]

Classification report:
               precision    recall  f1-score   support

           0       0.84      0.83      0.83       162
           1       0.73      0.75      0.74       100

    accuracy                           0.80       262
   macro avg       0.79      0.79      0.79       262
weighted avg       0.80      0.80      0.80       262

CV test_accuracy: mean=0.7861, std=0.0319, scores=[0.72519084 0.81679389 0.79007634 0.80534351 0.79310345]
CV test_precision: mean=0.7302, std=0.0487, scores=[0.6372549  0.76530612 0.73684211 0.73786408 0.77380952]
CV test_recall: mean=0.7020, std=0.0471, scores=[0.65 0.75 0.7  0.76 0.65]
CV test_f1: mean=0.7149, std=0.0403, scores=[0.64356436 0.75757576 0.71794872 0.74876847 0.70652174]

Self-tests passed. Baseline training & evaluation complete.