In [6]:
#Import Libraries

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt
import seaborn as sns

# Load the full CSV into a DataFrame. low_memory=False asks pandas to
# parse the file in one pass instead of in internal chunks.
FILE_PATH = "clinvar_conflicting.csv"
df = pd.read_csv(FILE_PATH, low_memory=False)

# Emit the load banner, the (rows, columns) shape and a separator line.
print("--- Data Load ---", f"shape: {df.shape}", "-" * 30, sep="\n")

# --- Summary Statistics ---

print("--- Summary Statistics ---")
# Describe only the numeric columns; transposing yields one row per column
# with the statistics laid out across.
print(
    df.select_dtypes(include=[np.number]).describe().transpose(),
    "-" * 30,
    sep="\n",
)

# Target column: clnsigincl
# (Likely_Benign, Likely_Pathogenic, Pathogenic, Benign, Uncertain).
# The header is matched case-insensitively and ignoring surrounding whitespace.
target_candidates = [
    col for col in df.columns if col.strip().lower() == "clnsigincl"
]

# Fail fast when the target column is absent.
if not target_candidates:
    raise ValueError("Column 'CLNSIGINCL' NOT FOUND. Check column names.")

# The first matching header (in its raw spelling) is used from here on.
target_col = target_candidates[0]
print("--- Target Column Found ---")
print(f"Target column (raw): {target_col}")
print("-" * 30)

# Create 5 class target variable

def classify_label(value: object) -> str:
    """Map a raw clinical-significance annotation to one of five classes.

    Matching is case-insensitive and substring based, so compound
    annotations such as ``"16318:Pathogenic|16319:Pathogenic"`` are
    handled. The "likely" variants take precedence over the plain ones,
    and "likely_benign" is tested before "likely_pathogenic".

    Args:
        value: Raw cell value; may be missing (``None`` / ``NaN``).

    Returns:
        One of ``"Likely_Benign"``, ``"Likely_Pathogenic"``, ``"Benign"``,
        ``"Pathogenic"`` or ``"Uncertain"``. Missing values and anything
        not recognised (including "uncertain", "risk_factor", "other" and
        "conflicting_interpretations_of_pathogenicity") map to
        ``"Uncertain"``.
    """
    if pd.isna(value):
        return "Uncertain"

    text = str(value).lower()

    if "likely_benign" in text:
        return "Likely_Benign"
    if "likely_pathogenic" in text:
        return "Likely_Pathogenic"

    # Any other mention of "likely" (e.g. "likely benign" with a space) is
    # never promoted to a plain Benign/Pathogenic call.
    if "likely" in text:
        return "Uncertain"

    if "benign" in text:
        return "Benign"

    # "pathogenicity" (as in "Conflicting_interpretations_of_pathogenicity")
    # contains the substring "pathogenic" but is not a pathogenic call, so
    # it is removed before testing.
    if "pathogenic" in text.replace("pathogenicity", ""):
        return "Pathogenic"

    # "uncertain", "risk_factor", "other" and everything else land here.
    return "Uncertain"

# Derive the 5-class label for every row from the raw target column.
df["CLNSIG_5class"] = df[target_col].apply(classify_label)

print("--- 5-Class Target Created ---")
print("Target class distribution:")
# Per-class row counts, followed by the separator line.
print(df["CLNSIG_5class"].value_counts(), "-" * 30, sep="\n")

--- Data Load ---
shape: (65188, 46)
------------------------------
--- Summary Statistics ---
                      count          mean           std         min  \
POS                 65188.0  7.757594e+07  5.974051e+07  961.000000   
AF_ESP              65188.0  1.451052e-02  5.779541e-02    0.000000   
AF_EXAC             65188.0  1.449218e-02  5.954210e-02    0.000000   
AF_TGP              65188.0  1.526350e-02  5.952741e-02    0.000000   
ORIGIN              65188.0  1.342486e+00  5.688772e+00    0.000000   
SSR                   130.0  2.269231e+00  4.190777e+00    1.000000   
CLASS               65188.0  2.521016e-01  4.342226e-01    0.000000   
DISTANCE              108.0  8.257315e+02  1.069363e+03    1.000000   
STRAND              65174.0 -6.659097e-03  9.999855e-01   -1.000000   
MOTIF_POS               2.0  1.000000e+00  0.000000e+00    1.000000   
MOTIF_SCORE_CHANGE      2.0 -8.000000e-02  2.404163e-02   -0.097000   
LoFtool             60975.0  3.450584e-01  3.612384e-