# Data Mining Project 2 - Complete Analysis

This notebook contains a full end-to-end analysis for the `cmi_internet.csv` dataset:
- data loading and data quality checks
- exploratory analysis (missing values, class imbalance)
- preprocessing pipeline
- baseline classification models
- imbalance handling strategies
- final comparison and conclusions


In [None]:
# Core libraries
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

# Notebook-wide settings: silence every warning, apply the seaborn theme,
# and set the default figure size (done last, after the theme call).
warnings.simplefilter('ignore')
sns.set_theme(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 5)})


## 1. Load Dataset

We try the project-relative path first and fall back to an absolute local path (machine-specific; adjust it to your environment).


In [None]:
# Candidate dataset locations, checked in order: project-relative path first,
# then a machine-specific absolute fallback.
relative_path = Path('dm2_25_26_dataset_tabular/DM2_project/cmi_internet.csv')
absolute_fallback = Path('C:/Users/steve/Downloads/Data_Mining_project2/dm2_25_26_dataset_tabular/DM2_project/cmi_internet.csv')
candidate_paths = (relative_path, absolute_fallback)

# Pick the first location that exists on disk
data_path = next((path for path in candidate_paths if path.exists()), None)
if data_path is None:
    # Name every location that was tried so the failure is actionable
    tried = ', '.join(str(path) for path in candidate_paths)
    raise FileNotFoundError(f'Dataset file not found in expected locations: {tried}')

df = pd.read_csv(data_path)
# Untouched copy of the raw data, kept separate from the working frame
df_original = df.copy()

print(f'Data path: {data_path}')
print(f'Shape: {df.shape}')
df.head()


## 2. Data Overview and Quality Checks


In [None]:
# General information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None".
df.info()
display(df.describe(include='all').T.head(10))

# Identify target and remove obvious identifier from modeling
target_col = 'sii'
id_col = 'id' if 'id' in df.columns else None

# Fail early with a clear message instead of a KeyError in a later cell
if target_col not in df.columns:
    raise KeyError(f'Target column {target_col!r} not found in the dataset.')

print(f'Target column: {target_col}')
print(f'ID column: {id_col}')


In [None]:
# Missing value analysis: per-column count and percentage of missing entries
missing_count = df.isna().sum()
missing_pct = missing_count.div(len(df)).mul(100).sort_values(ascending=False)

n_columns_with_missing = (missing_count > 0).sum()
print('Number of columns with at least one missing value:', n_columns_with_missing)
print()
print('Top 20 columns by missing percentage:')

# Show the 20 columns with the highest share of missing values
top20_missing = missing_pct.head(20).to_frame('missing_%')
display(top20_missing)


In [None]:
# Target distribution (class imbalance check): absolute counts and shares per class
class_counts = df[target_col].value_counts().sort_index()
class_pct = (class_counts / class_counts.sum() * 100).round(2)

print('Target counts:', class_counts, '', 'Target percentages (%):', class_pct, sep='\n')

# Integer class labels shared by both panels
class_labels = class_counts.index.astype(int)

# Left panel: bar chart of counts; right panel: pie chart of shares
fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(14, 5))

sns.barplot(x=class_labels, y=class_counts.values, ax=ax_bar, palette='viridis')
ax_bar.set_title('Target Class Counts (sii)')
ax_bar.set_xlabel('Class')
ax_bar.set_ylabel('Count')

ax_pie.pie(class_counts.values, labels=class_labels, autopct='%1.1f%%', startangle=90)
ax_pie.set_title('Target Class Distribution (sii)')

plt.tight_layout()
plt.show()


In [None]:
# Horizontal bar chart of the 15 columns with the highest missing percentage
top_missing = missing_pct.head(15)

fig_missing, ax_missing = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_missing.values, y=top_missing.index, palette='magma', ax=ax_missing)
ax_missing.set_title('Top 15 Columns by Missing Percentage')
ax_missing.set_xlabel('Missing %')
ax_missing.set_ylabel('Column')
plt.tight_layout()
plt.show()


## 3. Train/Test Split and Preprocessing

We build a robust preprocessing pipeline:
- numeric features: median imputation + scaling
- categorical features: most frequent imputation + one-hot encoding


In [None]:
# Prepare features and target.
# Rows without a target label cannot be used for supervised learning, and a
# missing value would make both the integer cast and the stratified split
# fail, so such rows are removed first (a no-op when the target is complete).
df_labeled = df.dropna(subset=[target_col])
n_unlabeled = len(df) - len(df_labeled)
if n_unlabeled:
    print(f'Dropped {n_unlabeled} rows with missing target ({target_col}).')

X = df_labeled.drop(columns=[target_col] + ([id_col] if id_col else []), errors='ignore')
y = df_labeled[target_col].astype(int)

# Stratified split preserves class proportions in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Separate numeric and categorical columns (everything non-numeric is
# treated as categorical by the preprocessing pipeline)
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

print('Train shape:', X_train.shape)
print('Test shape:', X_test.shape)
print('Numeric features:', len(numeric_features))
print('Categorical features:', len(categorical_features))


In [None]:
# Preprocessing blocks

# Numeric columns: median imputation followed by standard scaling
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Non-numeric columns: most-frequent imputation followed by one-hot encoding;
# categories not seen during fitting are ignored at transform time
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Evaluation helper

def evaluate_pipeline(model_name, estimator, X_tr, y_tr, X_te, y_te):
    """Fit preprocessing + estimator on the training data and score on the test data.

    Parameters
    ----------
    model_name : str
        Label stored under the ``'model'`` key of the returned metrics.
    estimator : scikit-learn classifier
        Unfitted estimator used as the final pipeline step; it is fitted in place.
    X_tr, y_tr
        Training features and labels.
    X_te, y_te
        Test features and labels.

    Returns
    -------
    tuple
        ``(pipe, y_pred, metrics)``: the fitted pipeline, its predictions for
        ``X_te``, and a dict with the keys ``'model'``, ``'accuracy'``,
        ``'balanced_accuracy'``, ``'macro_f1'`` and ``'weighted_f1'``.
    """
    # Give every pipeline its own copy of the preprocessing template. Embedding
    # the shared `preprocessor` object directly would refit that single
    # instance on each call, so every previously returned pipeline would end up
    # with the preprocessing state of the most recent fit.
    pipe = Pipeline([
        ('preprocess', clone(preprocessor)),
        ('model', estimator)
    ])

    pipe.fit(X_tr, y_tr)
    y_pred = pipe.predict(X_te)

    metrics = {
        'model': model_name,
        'accuracy': accuracy_score(y_te, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_te, y_pred),
        'macro_f1': f1_score(y_te, y_pred, average='macro'),
        'weighted_f1': f1_score(y_te, y_pred, average='weighted')
    }

    return pipe, y_pred, metrics


## 4. Baseline Models

We compare simple and stronger baselines:
- `DummyClassifier` (reference baseline)
- `LogisticRegression`
- `DecisionTreeClassifier`
- `RandomForestClassifier`


In [None]:
# Unweighted reference models, keyed by the name shown in the results table
baseline_models = dict(
    dummy_prior=DummyClassifier(strategy='prior', random_state=42),
    logistic_regression=LogisticRegression(max_iter=1000),
    decision_tree=DecisionTreeClassifier(random_state=42, min_samples_leaf=10),
    random_forest=RandomForestClassifier(n_estimators=250, random_state=42, n_jobs=-1),
)

# Per-model metrics rows, fitted pipelines and test-set predictions
baseline_results, fitted_baselines, preds_baselines = [], {}, {}

for name, model in baseline_models.items():
    fitted_baselines[name], preds_baselines[name], metrics = evaluate_pipeline(
        name, model, X_train, y_train, X_test, y_test
    )
    baseline_results.append(metrics)

# Rank the baselines by macro F1, best first
baseline_df = pd.DataFrame(baseline_results).sort_values('macro_f1', ascending=False)
display(baseline_df)


In [None]:
# Per-class precision/recall/F1 for every baseline, in results-table order
report_separator = '=' * 90

for name in baseline_df['model']:
    report_text = classification_report(y_test, preds_baselines[name], digits=3)
    print(report_separator, f'Classification report: {name}', report_text, sep='\n')


In [None]:
# Side-by-side confusion matrices for a subset of the baseline models
selected_baselines = ['dummy_prior', 'decision_tree', 'random_forest']

# squeeze=False always yields a 2-D grid of axes, so a single selected model
# needs no special handling; the first (only) row holds one axis per model
fig, axes_grid = plt.subplots(1, len(selected_baselines), figsize=(18, 5), squeeze=False)
axes = list(axes_grid[0])

labels = sorted(y.unique())
for ax, model_name in zip(axes, selected_baselines):
    cm = confusion_matrix(y_test, preds_baselines[model_name], labels=labels)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=labels,
        yticklabels=labels,
    )
    ax.set_title(model_name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

plt.tight_layout()
plt.show()


## 5. Imbalance Handling Strategies

Since `sii` is clearly imbalanced, we test:
1. algorithm-level balancing using `class_weight`
2. data-level random oversampling (training set only)
3. data-level random undersampling (training set only)


In [None]:
# Weighted models (algorithm-level balancing): same estimators as the
# baselines, but with class weights enabled
weighted_models = dict(
    logreg_weighted=LogisticRegression(max_iter=1000, class_weight='balanced'),
    dtree_weighted=DecisionTreeClassifier(
        random_state=42, min_samples_leaf=10, class_weight='balanced'
    ),
    rf_weighted=RandomForestClassifier(
        n_estimators=250, random_state=42, n_jobs=-1, class_weight='balanced_subsample'
    ),
)

# Per-model metrics rows, fitted pipelines and test-set predictions
weighted_results, fitted_weighted, preds_weighted = [], {}, {}

for name, model in weighted_models.items():
    fitted_weighted[name], preds_weighted[name], metrics = evaluate_pipeline(
        name, model, X_train, y_train, X_test, y_test
    )
    weighted_results.append(metrics)

# Rank the weighted models by macro F1, best first
weighted_df = pd.DataFrame(weighted_results).sort_values('macro_f1', ascending=False)
display(weighted_df)


In [None]:
# Manual random over/under-sampling helpers (no external package required)
def random_oversample(X_in, y_in, random_state=42):
    """Balance classes by randomly duplicating rows of the smaller classes.

    Every class with fewer rows than the largest class is topped up with rows
    drawn with replacement from that same class until all classes have the
    size of the largest one. The combined data is shuffled before returning.

    Parameters
    ----------
    X_in : pandas.DataFrame
        Feature rows; not modified.
    y_in : pandas.Series
        Labels aligned positionally with ``X_in``.
    random_state : int, default 42
        Seed for both the row sampling and the final shuffle.

    Returns
    -------
    tuple
        ``(X_out, y_out)``: the balanced feature frame and the matching labels
        cast to ``int``.
    """
    rng = np.random.default_rng(random_state)
    data = X_in.copy()
    data['_target_'] = y_in.values

    # Nothing to balance; also avoids concatenating an empty list below
    if data.empty:
        return data.drop(columns=['_target_']), data['_target_'].astype(int)

    counts = data['_target_'].value_counts()
    max_count = counts.max()

    sampled_parts = []
    for cls, cls_count in counts.items():
        part = data[data['_target_'] == cls]
        if cls_count < max_count:
            # Sample row *positions* with replacement to match the majority
            # class size. Selecting by position (iloc) instead of by index
            # label keeps the number of added rows exact even when the input
            # index contains repeated labels.
            extra_pos = rng.choice(len(part), size=max_count - cls_count, replace=True)
            part = pd.concat([part, part.iloc[extra_pos]], axis=0)
        sampled_parts.append(part)

    # Shuffle so the classes are not stored in contiguous runs
    out = pd.concat(sampled_parts, axis=0).sample(frac=1.0, random_state=random_state)
    return out.drop(columns=['_target_']), out['_target_'].astype(int)


def random_undersample(X_in, y_in, random_state=42):
    """Balance classes by randomly discarding rows of the larger classes.

    From every class, as many rows as the smallest class contains are drawn
    without replacement; the result is shuffled before it is returned.

    Parameters
    ----------
    X_in : pandas.DataFrame
        Feature rows; not modified.
    y_in : pandas.Series
        Labels aligned positionally with ``X_in``.
    random_state : int, default 42
        Seed used for each per-class draw and for the final shuffle.

    Returns
    -------
    tuple
        ``(X_out, y_out)``: the balanced feature frame and the matching labels
        cast to ``int``.
    """
    labelled = X_in.copy()
    labelled['_target_'] = y_in.values

    class_sizes = labelled['_target_'].value_counts()
    smallest = class_sizes.min()

    # One equally sized random subset per class
    per_class = [
        labelled.loc[labelled['_target_'] == label].sample(
            n=smallest, random_state=random_state, replace=False
        )
        for label in class_sizes.index
    ]

    # Shuffle so the classes are not stored in contiguous runs
    balanced = pd.concat(per_class, axis=0).sample(frac=1.0, random_state=random_state)
    features = balanced.drop(columns=['_target_'])
    target = balanced['_target_'].astype(int)
    return features, target


# Resample the training split only; the test split is left as-is
X_train_over, y_train_over = random_oversample(X_train, y_train)
X_train_under, y_train_under = random_undersample(X_train, y_train)

# Class counts before and after each resampling strategy
train_distributions = [
    ('Original train distribution:', y_train),
    ('Oversampled train distribution:', y_train_over),
    ('Undersampled train distribution:', y_train_under),
]
for position, (dist_title, dist_target) in enumerate(train_distributions):
    if position > 0:
        print()
    print(dist_title)
    print(dist_target.value_counts().sort_index())


In [None]:
# Evaluate resampling with a stable model (Random Forest): one identically
# configured forest per resampled training set
resampled_train_sets = {
    'rf_oversampled': (X_train_over, y_train_over),
    'rf_undersampled': (X_train_under, y_train_under),
}
resampling_models = {
    run_name: (
        RandomForestClassifier(n_estimators=250, random_state=42, n_jobs=-1),
        X_resampled,
        y_resampled,
    )
    for run_name, (X_resampled, y_resampled) in resampled_train_sets.items()
}

# Per-model metrics rows, fitted pipelines and test-set predictions
resample_results, fitted_resampled, preds_resampled = [], {}, {}

for name, (model, X_tr, y_tr) in resampling_models.items():
    # Train on the resampled data, score on the untouched test split
    fitted_resampled[name], preds_resampled[name], metrics = evaluate_pipeline(
        name, model, X_tr, y_tr, X_test, y_test
    )
    resample_results.append(metrics)

resample_df = pd.DataFrame(resample_results).sort_values('macro_f1', ascending=False)
display(resample_df)


## 6. Global Comparison of All Experiments


In [None]:
# Merge all experiment tables into one ranking, best macro F1 first
experiment_tables = [baseline_df, weighted_df, resample_df]
all_results = (
    pd.concat(experiment_tables, ignore_index=True)
    .sort_values('macro_f1', ascending=False)
)

display(all_results)

# Visual comparison on key imbalance-aware metrics
plot_df = all_results.set_index('model')[['macro_f1', 'balanced_accuracy']]
comparison_ax = plot_df.plot(kind='bar', figsize=(14, 6), colormap='tab20')
comparison_ax.set_title('Model Comparison on Imbalance-Aware Metrics')
comparison_ax.set_ylabel('Score')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Inspect confusion matrix of the best model by macro_f1
# (all_results is sorted best-first, so the top row is the winner)
best_model_name = all_results['model'].iloc[0]

# Look the predictions up group by group; the resampled runs are the fallback
for prediction_group in (preds_baselines, preds_weighted):
    if best_model_name in prediction_group:
        best_preds = prediction_group[best_model_name]
        break
else:
    best_preds = preds_resampled[best_model_name]

labels = sorted(y.unique())
cm = confusion_matrix(y_test, best_preds, labels=labels)

fig_best, ax_best = plt.subplots(figsize=(7, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax_best,
)
ax_best.set_title(f'Best Model Confusion Matrix: {best_model_name}')
ax_best.set_xlabel('Predicted')
ax_best.set_ylabel('True')
plt.tight_layout()
plt.show()

print(f'Best model: {best_model_name}')
print(classification_report(y_test, best_preds, digits=3))


## 7. Feature Importance (Tree-Based Best Model)

If the best model is random-forest-based, we inspect the top transformed features.


In [None]:
# Retrieve the fitted pipeline of the best model from whichever experiment
# group produced it
if best_model_name in fitted_baselines:
    best_pipe = fitted_baselines[best_model_name]
elif best_model_name in fitted_weighted:
    best_pipe = fitted_weighted[best_model_name]
else:
    best_pipe = fitted_resampled[best_model_name]

preprocess_step = best_pipe.named_steps['preprocess']
model_step = best_pipe.named_steps['model']

# Decide by estimator type rather than by name: the baseline forest is
# registered as 'random_forest', which a substring test for 'rf' would miss.
if isinstance(model_step, RandomForestClassifier):
    # Names of the columns produced by the preprocessing step, paired with the
    # forest's importance score for each of them
    feature_names = preprocess_step.get_feature_names_out()
    importances = model_step.feature_importances_

    fi = pd.DataFrame({'feature': feature_names, 'importance': importances})
    fi = fi.sort_values('importance', ascending=False).head(20)

    display(fi)

    plt.figure(figsize=(12, 6))
    sns.barplot(data=fi, x='importance', y='feature', palette='crest')
    plt.title('Top 20 Feature Importances (Best RF Model)')
    plt.tight_layout()
    plt.show()
else:
    print('Best model is not random-forest-based; feature importances are skipped.')


## 8. Final Comments and Conclusions

Main takeaways from this analysis:
- the target `sii` is strongly imbalanced (class 0 dominates)
- high missingness affects several feature groups, so robust imputation is necessary
- plain accuracy can be misleading on imbalanced data
- metrics like `macro_f1` and `balanced_accuracy` provide a better comparison
- random oversampling on train data can improve minority-class sensitivity compared to naive baselines

This notebook is fully reproducible and can be extended with:
- hyperparameter tuning (e.g., `GridSearchCV`)
- more advanced resampling methods (SMOTE/ADASYN) if `imbalanced-learn` is installed
- threshold tuning or cost-sensitive learning depending on project goals
