# Exploratory Data Analysis for NBA Dataset
## Chase, Jack, Timothy, Adam, Neel, Eddie, and Harrison

In [None]:
#2. Setting display options for better user viewing experience
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)
sns.set(style="whitegrid")




# 3 Defining feature groups

# Numeric features grouped here for convenience when running visualizations and summary statistics.
numeric_features = [
    "num_feature_1",
    "num_feature_2",
    "num_feature_3",
    "num_feature_4",
]

# Categorical features collected here to streamline value-count checks and categorical plots.
categorical_features = [
    "cat_feature_1",
    "cat_feature_2",
    "cat_feature_3",
]

# Datetime features included here so conversions and time-based analysis can run cleanly.
datetime_features = [
    "date_feature_1",
]

# Target feature set here separately to keep downstream modeling steps consistent.
target_col = "target_feature_1"





# 4 Basic dataset overview 

# Shape and column info
print("Shape:", df.shape)
print("\nColumns:\n", df.columns.tolist())

print("\nData types:")
print(df.dtypes)

print("\nHead:")
print(df.head())

print("\nTail:")
print(df.tail())

print("\nInfo:")
print(df.info())

print("\nNumeric summary:")
print(df[numeric_features].describe().T)

print("\nCategorical summary (top levels and frequency):")
for col in categorical_features:
    if col in df.columns:
        print(f"\nColumn: {col}")
        print(df[col].value_counts(dropna=False).head(10))




# 5 Missing data analysis

# Count of missing per column
missing_counts = df.isna().sum().sort_values(ascending=False)
print("\nMissing values per column:")
print(missing_counts)

# Percentage missing per column
missing_pct = (df.isna().mean() * 100).sort_values(ascending=False)
print("\nMissing percentage per column:")
print(missing_pct)

# Visual missingness heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(df.isna(), cbar=False)
plt.title("Missingness Heatmap")
plt.tight_layout()
plt.show()




# 6 Univariate numeric distributions

for col in numeric_features:
    if col in df.columns:
        plt.figure(figsize=(10, 4))

        # Histogram and KDE
        plt.subplot(1, 2, 1)
        sns.histplot(df[col].dropna(), kde=True)
        plt.title(f"Histogram and KDE: {col}")

        # Boxplot
        plt.subplot(1, 2, 2)
        sns.boxplot(x=df[col])
        plt.title(f"Boxplot: {col}")

        plt.tight_layout()
        plt.show()




# 7 Univariate categorical distributions

for col in categorical_features:
    if col in df.columns:
        plt.figure(figsize=(8, 4))
        order = df[col].value_counts().index
        sns.countplot(data=df, x=col, order=order)
        plt.title(f"Countplot: {col}")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        plt.show()




# 8 Target distribution 

if target_col in df.columns:
    if target_col in numeric_features:
        plt.figure(figsize=(8, 4))
        sns.histplot(df[target_col].dropna(), kde=True)
        plt.title(f"Target Distribution (numeric): {target_col}")
        plt.tight_layout()
        plt.show()
    else:
        plt.figure(figsize=(6, 4))
        order = df[target_col].value_counts().index
        sns.countplot(data=df, x=target_col, order=order)
        plt.title(f"Target Distribution (categorical): {target_col}")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        plt.show()




# 9 Numeric correlations

numeric_for_corr = [c for c in numeric_features if c in df.columns]

if len(numeric_for_corr) >= 2:
    corr = df[numeric_for_corr].corr()
    print("\nCorrelation matrix:")
    print(corr)

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True)
    plt.title("Correlation Heatmap")
    plt.tight_layout()
    plt.show()




# 10 Pairwise relationships 
sample_df = df.sample(min(len(df), 1000), random_state=42)

if len(numeric_for_corr) >= 2:
    sns.pairplot(sample_df[numeric_for_corr + [target_col]] if target_col in sample_df.columns else sample_df[numeric_for_corr])
    plt.suptitle("Pairplot of Numeric Features", y=1.02)
    plt.show()




# 11 Relationship: numeric features vs target

if target_col in df.columns and target_col in numeric_features:
    for col in numeric_features:
        if col in df.columns and col != target_col:
            plt.figure(figsize=(6, 4))
            sns.scatterplot(data=df, x=col, y=target_col, alpha=0.6)
            plt.title(f"{target_col} vs {col}")
            plt.tight_layout()
            plt.show()

if target_col in df.columns and target_col not in numeric_features:
    # Boxplots and violin plots of numeric vs categorical target
    for col in numeric_features:
        if col in df.columns:
            plt.figure(figsize=(8, 4))
            sns.boxplot(data=df, x=target_col, y=col)
            plt.title(f"{col} by {target_col}")
            plt.xticks(rotation=45, ha="right")
            plt.tight_layout()
            plt.show()




# 12 Relationship: categorical features vs target

if target_col in df.columns and target_col in numeric_features:
    # Mean target by category
    for col in categorical_features:
        if col in df.columns:
            agg = df.groupby(col)[target_col].agg(["mean", "count"]).sort_values("mean", ascending=False)
            print(f"\nTarget summary by {col}:")
            print(agg.head(20))

            plt.figure(figsize=(10, 4))
            sns.barplot(x=agg.index, y=agg["mean"])
            plt.title(f"Mean {target_col} by {col}")
            plt.xticks(rotation=45, ha="right")
            plt.tight_layout()
            plt.show()

if target_col in df.columns and target_col not in numeric_features:
    # Cross-tabulation for categorical target
    for col in categorical_features:
        if col in df.columns and col != target_col:
            ctab = pd.crosstab(df[col], df[target_col], normalize="index")
            print(f"\nCrosstab (% within {col}) vs {target_col}:")
            print(ctab.head(20))

            plt.figure(figsize=(10, 6))
            sns.heatmap(ctab, annot=True, fmt=".2f", cmap="Blues")
            plt.title(f"{col} vs {target_col} (row %)")
            plt.tight_layout()
            plt.show()




# 13 Time series / datetime EDA

for col in datetime_features:
    if col in df.columns:
        # Plot of target over time if numeric
        if target_col in df.columns and target_col in numeric_features:
            ts_df = df[[col, target_col]].dropna().sort_values(col)
            plt.figure(figsize=(12, 4))
            plt.plot(ts_df[col], ts_df[target_col])
            plt.title(f"{target_col} over time ({col})")
            plt.xticks(rotation=45, ha="right")
            plt.tight_layout()
            plt.show()

        # Volume of records over time by month
        temp = df.set_index(col).resample("M").size()
        plt.figure(figsize=(10, 4))
        temp.plot()
        plt.title(f"Record count over time ({col})")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        plt.show()




# 14 Numeric outlier detection using IQR

for col in numeric_features:
    if col in df.columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        outliers = df[(df[col] < lower) | (df[col] > upper)][col]
        print(f"\nOutliers in {col}: {len(outliers)} observations")
        print(f"Lower bound: {lower:.3f}, Upper bound: {upper:.3f}")




# 15 Feature-level summary table (EDA report-style)

summary_rows = []

for col in df.columns:
    col_data = df[col]
    row = {
        "feature": col,
        "dtype": col_data.dtype,
        "n_missing": col_data.isna().sum(),
        "pct_missing": col_data.isna().mean() * 100,
        "n_unique": col_data.nunique(dropna=True),
    }
    if col in numeric_features:
        row.update({
            "mean": col_data.mean(),
            "std": col_data.std(),
            "min": col_data.min(),
            "q1": col_data.quantile(0.25),
            "median": col_data.median(),
            "q3": col_data.quantile(0.75),
            "max": col_data.max(),
        })
    summary_rows.append(row)

summary_df = pd.DataFrame(summary_rows)
summary_df = summary_df.sort_values("feature")
print("\nFeature summary table:")
print(summary_df)




# 16: Saving EDA summary to CSV
summary_df.to_csv("eda_feature_summary.csv", index=False)

In [2]:
# 1. Import statements and loading data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

df = pd.read_csv('DATA/nba_data_with_salaries.csv')


# 17: ydata_profile report for fun
profile = ProfileReport(df, title="EDA Report", explorative=True)
profile.to_notebook_iframe()

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'ipywidgets'