# Import Required Libraries
Import the necessary libraries, including pandas, numpy, matplotlib, seaborn, and scikit-learn.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load the Dataset
Load the patient diagnosis dataset containing ICD-10 codes.

In [None]:
# Load the Dataset

# Location of the patient diagnosis CSV (records carrying ICD-10 codes)
file_path = 'patient_icd10.csv'  # Replace with your actual file path
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top rows as a quick sanity check that the file parsed as expected
df.head(n=5)

# Data Preprocessing
Clean the dataset by forward-filling missing values, one-hot encoding categorical variables, and standardizing numerical features (zero mean, unit variance).

In [None]:
# Data Preprocessing

# Prediction target. It is excluded from encoding and scaling below so that its
# labels keep their original form: standardizing it would turn class labels
# into continuous values, and one-hot encoding it would replace the column.
target_column = 'Outcome'  # Replace with the actual target column name

# Handle missing values
# Forward fill copies the previous row's value into each gap.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
# NOTE(review): gaps in the first row(s) have nothing to copy from and stay NaN.
df.ffill(inplace=True)

# Encode categorical variables
# One-hot encode every text column except the target; drop_first=True removes
# one level per column so the dummy columns are not perfectly collinear.
categorical_columns = (
    df.select_dtypes(include=['object']).columns.drop(target_column, errors='ignore')
)
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Normalize numerical features
# Standardize every numeric feature (target excluded) to zero mean, unit variance.
numerical_columns = (
    df.select_dtypes(include=[np.number]).columns.drop(target_column, errors='ignore')
)
scaler = StandardScaler()
if len(numerical_columns) > 0:
    # StandardScaler rejects an input with zero feature columns, so only
    # scale when there is something to scale.
    df_encoded[numerical_columns] = scaler.fit_transform(df_encoded[numerical_columns])

# Display the first few rows of the preprocessed dataset
df_encoded.head()

# Exploratory Data Analysis
Perform exploratory data analysis to understand the distribution of ICD-10 codes and other relevant features.

In [None]:
# Exploratory Data Analysis

# Distribution of ICD-10 codes (most frequent code first)
plt.figure(figsize=(12, 6))
sns.countplot(y='ICD-10 Code', data=df, order=df['ICD-10 Code'].value_counts().index)
plt.title('Distribution of ICD-10 Codes')
plt.xlabel('Count')
plt.ylabel('ICD-10 Code')
plt.show()

# Numeric columns to plot; the histogram and pairplot are skipped when there
# are none, because both raise on an empty column selection.
numeric_feature_count = len(numerical_columns)

# Distribution of numerical features
if numeric_feature_count > 0:
    # Size the subplot grid from the number of columns: a fixed 2x4 layout
    # raises ValueError as soon as there are more than 8 numeric columns.
    # Up to 8 columns this still yields the 2x4 grid and the (15, 6) figure.
    hist_cols = 4
    hist_rows = max(2, (numeric_feature_count + hist_cols - 1) // hist_cols)
    df[numerical_columns].hist(
        bins=15,
        figsize=(15, 3 * hist_rows),
        layout=(hist_rows, hist_cols),
    )
    plt.suptitle('Distribution of Numerical Features')
    plt.show()

# Correlation heatmap (computed on the encoded frame so dummy columns are included)
plt.figure(figsize=(12, 8))
correlation_matrix = df_encoded.corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Pairplot of numerical features
if numeric_feature_count > 0:
    sns.pairplot(df[numerical_columns])
    # y > 1 lifts the title above the top row of subplots
    plt.suptitle('Pairplot of Numerical Features', y=1.02)
    plt.show()

# Summary statistics
summary_stats = df.describe()
summary_stats

# Data Visualization
Create visualizations to illustrate the patterns and trends in the diagnosis data using matplotlib and seaborn.

In [None]:
# Data Visualization

# Bar plot of the top 10 most frequent ICD-10 codes
top_10_icd10 = df['ICD-10 Code'].value_counts().head(10)
plt.figure(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the codes
# are also assigned to `hue`; dodge=False keeps one full-width bar per code.
top_codes_ax = sns.barplot(
    x=top_10_icd10.values,
    y=top_10_icd10.index,
    hue=top_10_icd10.index,
    palette='viridis',
    dodge=False,
)
# The hue legend would only repeat the y-axis labels; remove it if one was drawn.
if top_codes_ax.get_legend() is not None:
    top_codes_ax.get_legend().remove()
plt.title('Top 10 Most Frequent ICD-10 Codes')
plt.xlabel('Count')
plt.ylabel('ICD-10 Code')
plt.show()

# Box plot of numerical features grouped by a specific ICD-10 code
specific_icd10 = 'A00'  # Replace with the ICD-10 code of interest
icd10_subset = df[df['ICD-10 Code'] == specific_icd10]
if icd10_subset.empty:
    # The placeholder code may not occur in the data; say so instead of
    # drawing an empty plot.
    print(f'No records found for ICD-10 code {specific_icd10}; skipping box plot.')
else:
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='ICD-10 Code', y='Age', data=icd10_subset)
    plt.title(f'Box Plot of Age for ICD-10 Code {specific_icd10}')
    plt.xlabel('ICD-10 Code')
    plt.ylabel('Age')
    plt.show()

# Line plot of the number of diagnoses over time
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    # Count records per calendar month
    diagnoses_over_time = df.groupby(df['Date'].dt.to_period('M')).size()
    plt.figure(figsize=(12, 6))
    diagnoses_over_time.plot(kind='line')
    plt.title('Number of Diagnoses Over Time')
    plt.xlabel('Date')
    plt.ylabel('Number of Diagnoses')
    plt.show()

# Heatmap of the frequency of ICD-10 codes by age group
age_bins = [0, 18, 35, 50, 65, 80, 100]
# include_lowest=True keeps age 0 in the first bin; with the default
# right-closed bins, (0, 18] would exclude it and leave those rows out of
# the table. Ages above 100 still fall outside every bin.
df['Age Group'] = pd.cut(df['Age'], bins=age_bins, include_lowest=True)
icd10_age_group = pd.crosstab(df['ICD-10 Code'], df['Age Group'])
plt.figure(figsize=(12, 8))
sns.heatmap(icd10_age_group, cmap='YlGnBu', annot=True, fmt='d')
plt.title('Frequency of ICD-10 Codes by Age Group')
plt.xlabel('Age Group')
plt.ylabel('ICD-10 Code')
plt.show()

# Scatter plot of two numerical features colored by ICD-10 code
x_feature = 'Feature1'  # Replace with a numerical column of interest
y_feature = 'Feature2'  # Replace with a numerical column of interest
# Guarded like the 'Date' plot above: the placeholder column names may not
# exist in the dataset.
if {x_feature, y_feature}.issubset(df.columns):
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_feature, y=y_feature, hue='ICD-10 Code', data=df, palette='tab10')
    plt.title(f'Scatter Plot of {x_feature} vs {y_feature} Colored by ICD-10 Code')
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.legend(loc='best')
    plt.show()
else:
    print(f'Columns {x_feature!r} and/or {y_feature!r} not found; skipping scatter plot.')

# Model Training and Evaluation
Train a logistic regression model to predict patient outcomes from the encoded features (including ICD-10 codes) and evaluate its performance.

In [None]:
# Model Training and Evaluation

# Define the target variable and features
target_column = 'Outcome'  # Replace with the actual target column name

# Take the labels from the un-encoded frame. In df_encoded the target may have
# been standardized (numeric target) or replaced by dummy columns (text
# target), neither of which is usable as a classification label.
y = df[target_column]

# Drop every trace of the target from the features: the column itself and any
# one-hot columns derived from it ("<target>_<value>"), which would leak the label.
target_dummy_names = {f'{target_column}_{value}' for value in y.dropna().unique()}
target_feature_columns = [
    column for column in df_encoded.columns
    if column == target_column or column in target_dummy_names
]
X = df_encoded.drop(columns=target_feature_columns)

# Split the dataset into training and testing sets
# NOTE(review): numeric features were scaled on the full dataset before this
# split, so test-set statistics leak into training; consider fitting the
# scaler on X_train only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
# max_iter is raised from the default of 100, which frequently stops before
# convergence on wide one-hot encoded feature matrices.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict the outcomes on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model performance
# Fix the label order to the classes the model was trained on, so matrix rows
# and columns line up with the tick labels drawn below.
class_labels = log_reg.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
class_report = classification_report(y_test, y_pred)

# Display the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Print the classification report
print('Classification Report:')
print(class_report)