In [None]:
# exploratory_data_analysis.ipynb

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Load dataset (path is resolved relative to the current working directory)
data = pd.read_csv('../data/raw_data.csv')

# NOTE: the inspection results below are printed explicitly. When this code
# runs as a single cell (or as a plain script), the value of a bare expression
# such as `data.head()` is discarded unless it is the very last statement.

# Display the first few rows
print(data.head())

# Basic information about the dataset (info() writes its report to stdout itself)
data.info()

# Check for missing values: number of nulls per column
print(data.isnull().sum())

# Statistical summary of the dataset
print(data.describe())

# Bar chart counting the rows for each value of the target column 'Stage'.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='Stage', ax=ax)
ax.set_title("Distribution of Cancer Stages")
ax.set_xlabel("Cancer Stage")
ax.set_ylabel("Count")
plt.show()

# Correlation matrix
# Only numeric/boolean columns take part in the correlation. DataFrame.corr()
# raises on string-valued columns in pandas >= 2.0 (older versions silently
# dropped them), and 'Stage' is only label-encoded further down, so it may
# still hold non-numeric labels at this point.
plt.figure(figsize=(12, 10))
numeric_data = data.select_dtypes(include=["number", "bool"])
correlation_matrix = numeric_data.corr()
# annot=True writes each coefficient into its cell, formatted to 2 decimals.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.show()

# Pairwise scatter/distribution grid for a handful of features, with points
# coloured by the 'Stage' column.
selected_features = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'Stage',
]
pairplot_frame = data[selected_features]
sns.pairplot(data=pairplot_frame, hue="Stage", palette="Set1")
plt.show()

# Label-encode the 'Stage' column in place: each distinct label is replaced by
# an integer code. Assumes 'Stage' is categorical.
# The fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder()
stage_codes = label_encoder.fit_transform(data['Stage'])
data['Stage'] = stage_codes

# Histogram grid of the feature columns (15 bins each) under one shared title.
data.hist(figsize=(15, 10), bins=15)
plt.gcf().suptitle("Feature Distributions")
plt.show()
