In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Data
train_data = pd.read_csv('training.csv')

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
train_data.info()

# Summary statistics of the numeric columns
print(train_data.describe())

# Number of missing values in each column
print(train_data.isnull().sum())


# Pairwise correlations between the non-text columns (the identifier is excluded)
corr_df = train_data.drop(['patient_id'],axis=1).select_dtypes(exclude='object').corr()

# Keep only rows, then only columns, in which every coefficient exceeds 0.005
# in absolute value (cells at or below the threshold become NaN and the
# row/column containing them is dropped).
corr_df = corr_df[abs(corr_df)>0.005].dropna(how='any',axis=0)
corr_df = corr_df[abs(corr_df)>0.005].dropna(how='any',axis=1)

plt.figure(figsize=(10,5))
# Hide the upper triangle (including the diagonal): it mirrors the lower one.
# The mask was previously computed but never handed to the heatmap.
mask = np.triu(np.ones_like(corr_df, dtype=bool))
sns.heatmap(corr_df, mask=mask)

# Distribution of Target Variable
# Count plot of the rows in each DiagPeriodL90D class.
plt.figure(figsize=(8, 6))
sns.countplot(x='DiagPeriodL90D', data=train_data)
plt.title('Distribution of Diagnosis Period Less Than 90 Days')
plt.xlabel('Diagnosis Period Less Than 90 Days')
plt.ylabel('Count')
plt.show()

# Explore Categorical Variables
# One count plot per listed column, laid out on a 2x2 grid of subplots.
categorical_variables = ['patient_race', 'payer_type', 'patient_state', 'patient_gender']
plt.figure(figsize=(15, 8))
# enumerate starts at 1 because plt.subplot positions are 1-based
for i, variable in enumerate(categorical_variables, 1):
    plt.subplot(2, 2, i)
    sns.countplot(x=variable, data=train_data)
    plt.title(f'Distribution of {variable}')
    plt.xlabel(variable)
    plt.ylabel('Count')
plt.tight_layout()
plt.show()


# Correlation Analysis
# The frame contains text columns (e.g. patient_race, payer_type), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the
# correlation is computed over the numeric columns only.
correlation_matrix = train_data.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

# Explore the first novel treatment type
# (count plot of a categorical column; x tick labels are rotated so they do not overlap)
plt.figure(figsize=(12, 6))
sns.countplot(x='metastatic_first_novel_treatment_type', data=train_data)
plt.title('Distribution of First Novel Treatment Types')
plt.xlabel('First Novel Treatment Type')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Geographic Analysis
# Count of rows per patient_state value.
plt.figure(figsize=(18, 6))
sns.countplot(x='patient_state', data=train_data)
plt.title('Distribution of Patients Across States')
plt.xlabel('State')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()



# Feature Engineering (if needed)
# Outlier Detection and Handling
# Box plot of patient_age for each DiagPeriodL90D class; this only visualises
# potential outliers, nothing is removed or modified here.
plt.figure(figsize=(10, 6))
sns.boxplot(x='DiagPeriodL90D', y='patient_age', data=train_data)
plt.title('Outliers in Patient Age by Diagnosis Period Less Than 90 Days')
plt.xlabel('Diagnosis Period Less Than 90 Days')
plt.ylabel('Patient Age')
plt.show()


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Reload the raw training data so this cell starts from an unmodified frame
train_data = pd.read_csv("training.csv")

# Drop unnecessary columns
train_data = train_data.drop(['patient_id'], axis=1)

# One-hot encode the listed categorical columns.
# NOTE(review): no imputation step appears in this cell, so missing values in
# the remaining columns reach the model unchanged.
train_data = pd.get_dummies(train_data, columns=['patient_race', 'payer_type', 'patient_state', 'patient_gender',
                                                 'breast_cancer_diagnosis_code', 'breast_cancer_diagnosis_desc',
                                                 'metastatic_cancer_diagnosis_code', 'metastatic_first_novel_treatment',
                                                 'metastatic_first_novel_treatment_type', 'Region', 'Division'])

# Split the data into features (X) and target variable (y)
X = train_data.drop(columns=["DiagPeriodL90D"])
y = train_data["DiagPeriodL90D"]

# Split the data into training and validation sets (80/20, fixed seed).
# NOTE(review): X_valid / y_valid are not used later in this cell.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a classification model (XGBoost)
model = xgb.XGBClassifier(random_state=42)

# Train the XGBoost model
model.fit(X_train, y_train)

# Get feature importances (one score per column of X_train)
feature_importance = model.feature_importances_

# Create a DataFrame with feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot the top N important features
top_n_features = 10
plt.figure(figsize=(10, 5))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(top_n_features))
plt.title(f'Top {top_n_features} Important Features')
plt.show()

# Extract the top N important features
top_features = feature_importance_df.head(top_n_features)['Feature'].tolist()

# Select only the top features for correlation analysis
selected_features = X_train[top_features]

# Calculate correlations among the selected features
corr_df = selected_features.corr()

# Visualize the correlations using a heatmap; the mask hides the upper
# triangle (diagonal included) because it mirrors the lower one
plt.figure(figsize=(10, 5))
mask = np.triu(np.ones_like(corr_df, dtype=bool))
sns.heatmap(corr_df, annot=True, fmt=".2f", cmap="coolwarm", mask=mask)
plt.title('Correlation Heatmap of Top Important Features')
plt.show()


# **Univariate Feature Selection**:

SelectKBest: Select the top k features based on univariate statistical tests.
SelectPercentile: Select the top features based on a percentage of the highest scores.


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import pandas as pd

# Load the raw training data
train_data = pd.read_csv("training.csv")

# Drop the patient_id column
train_data = train_data.drop(['patient_id'], axis=1)

# Handle missing values
# Separate numerical and categorical features.
# Only float64 columns are treated as numerical here; integer columns are left untouched.
numerical_features = train_data.select_dtypes(include=['float64']).columns
categorical_features = train_data.select_dtypes(include=['object']).columns

# Impute missing values for numerical features with the column mean
numerical_imputer = SimpleImputer(strategy='mean')
train_data[numerical_features] = numerical_imputer.fit_transform(train_data[numerical_features])

# Impute missing values for categorical features with the most frequent value
categorical_imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_features] = categorical_imputer.fit_transform(train_data[categorical_features])

# NOTE(review): both imputers are fitted on the full frame before the
# train/validation split below, so validation rows influence the fill values.

# One-hot encode categorical variables
train_data = pd.get_dummies(train_data, columns=categorical_features)

# Split the data into features (X) and target variable (y)
X = train_data.drop(columns=["DiagPeriodL90D"])
y = train_data["DiagPeriodL90D"]

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# SelectKBest with ANOVA F-statistic as the score function (for classification problems)
k = 10
selector = SelectKBest(score_func=f_classif, k=k)
X_train_selected = selector.fit_transform(X_train, y_train)

# Get the indices of the selected features
selected_feature_indices = selector.get_support(indices=True)

# Get the names of the selected features
selected_feature_names = X_train.columns[selected_feature_indices]

# Create a DataFrame with selected features.
# The frame is built from the bare array, so it gets a fresh 0..n-1 index
# rather than X_train's original row labels.
selected_features_df = pd.DataFrame(X_train_selected, columns=selected_feature_names)

# Print the names of selected features
print("Selected Features:")
print(selected_feature_names)


# **Recursive Feature Elimination (RFE)**:

RFE recursively removes the least important features based on a model's feature weights.


In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# SimpleImputer is used below; importing it here keeps the cell runnable on
# its own instead of relying on an import made by an earlier cell.
from sklearn.impute import SimpleImputer
import pandas as pd

# Load the raw training data
train_data = pd.read_csv("training.csv")

# Drop the patient_id column
train_data = train_data.drop(['patient_id'], axis=1)

# Handle missing values
# Separate numerical (float64) and categorical (object) features
numerical_features = train_data.select_dtypes(include=['float64']).columns
categorical_features = train_data.select_dtypes(include=['object']).columns

# Impute missing values for numerical features with the column mean
numerical_imputer = SimpleImputer(strategy='mean')
train_data[numerical_features] = numerical_imputer.fit_transform(train_data[numerical_features])

# Impute missing values for categorical features with the most frequent value
categorical_imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_features] = categorical_imputer.fit_transform(train_data[categorical_features])

# One-hot encode categorical variables
train_data = pd.get_dummies(train_data, columns=categorical_features)

# Split the data into features (X) and target variable (y)
X = train_data.drop(columns=["DiagPeriodL90D"])
y = train_data["DiagPeriodL90D"]

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Base classifier whose feature importances drive the elimination.
# Seeded (like the other random-forest cells in this notebook) so the
# selected feature set is the same on every run.
base_classifier = RandomForestClassifier(random_state=42)

# Create the RFE model and select 10 features
n_features_to_select = 10
rfe = RFE(estimator=base_classifier, n_features_to_select=n_features_to_select)
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Get the names of the selected features (rfe.support_ is a boolean mask over the columns)
selected_feature_names = X_train.columns[rfe.support_]

# Create a DataFrame with selected features
selected_features_df = pd.DataFrame(X_train_rfe, columns=selected_feature_names)

# Print the names of selected features
print("Selected Features:")
print(selected_feature_names)


# **LASSO Regression**:

L1 regularization (LASSO) can be used to encourage sparsity in feature weights, effectively performing feature selection.


In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# SimpleImputer is used below; importing it here keeps the cell runnable on
# its own instead of relying on an import made by an earlier cell.
from sklearn.impute import SimpleImputer
import pandas as pd

# Load the raw training data
train_data = pd.read_csv("training.csv")

# Drop the patient_id column
train_data = train_data.drop(['patient_id'], axis=1)

# Handle missing values
# Separate numerical (float64) and categorical (object) features
numerical_features = train_data.select_dtypes(include=['float64']).columns
categorical_features = train_data.select_dtypes(include=['object']).columns

# Impute missing values for numerical features with the column mean
numerical_imputer = SimpleImputer(strategy='mean')
train_data[numerical_features] = numerical_imputer.fit_transform(train_data[numerical_features])

# Impute missing values for categorical features with the most frequent value
categorical_imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_features] = categorical_imputer.fit_transform(train_data[categorical_features])

# One-hot encode categorical variables
train_data = pd.get_dummies(train_data, columns=categorical_features)

# Split the data into features (X) and target variable (y)
X = train_data.drop(columns=["DiagPeriodL90D"])
y = train_data["DiagPeriodL90D"]

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for regularization); the scaler is
# fitted on the training split only and then applied to the validation split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# Create the Lasso model
lasso = Lasso(alpha=0.04)  # Adjust the alpha parameter based on the desired level of regularization

# Fit the model to the training data
lasso.fit(X_train_scaled, y_train)

# Get the coefficients of the features (same column order as X)
feature_coefficients = pd.Series(lasso.coef_, index=X.columns)

# Get the names of non-zero coefficient features (selected features)
selected_feature_names = feature_coefficients[feature_coefficients != 0].index

# Print the names of selected features
print("Selected Features:")
print(selected_feature_names)


# Tree-based Methods:

Decision trees and tree-based models can be analyzed for feature importance.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# SimpleImputer is used below; importing it here keeps the cell runnable on
# its own instead of relying on an import made by an earlier cell.
from sklearn.impute import SimpleImputer
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw training data
train_data = pd.read_csv("training.csv")

# Drop the patient_id column
train_data = train_data.drop(['patient_id'], axis=1)

# Handle missing values
# Separate numerical (float64) and categorical (object) features
numerical_features = train_data.select_dtypes(include=['float64']).columns
categorical_features = train_data.select_dtypes(include=['object']).columns

# Impute missing values for numerical features with the column mean
numerical_imputer = SimpleImputer(strategy='mean')
train_data[numerical_features] = numerical_imputer.fit_transform(train_data[numerical_features])

# Impute missing values for categorical features with the most frequent value
categorical_imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_features] = categorical_imputer.fit_transform(train_data[categorical_features])

# One-hot encode categorical variables
train_data = pd.get_dummies(train_data, columns=categorical_features)

# Split the data into features (X) and target variable (y)
X = train_data.drop(columns=["DiagPeriodL90D"])
y = train_data["DiagPeriodL90D"]

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
rf_classifier.fit(X_train, y_train)

# Get feature importances (one score per column of X_train)
feature_importances = rf_classifier.feature_importances_

# Get the names of features
feature_names = X_train.columns

# Create a DataFrame with feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importances (every feature gets a bar, hence the very tall figure)
plt.figure(figsize=(50, 60))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Importance Score')
plt.title('Feature Importances')
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Data
df = pd.read_csv('training.csv')
bmi_column = df['bmi']

# 1. Missing Values
missing_values = bmi_column.isnull().sum()
print(f"Number of missing values in BMI column: {missing_values}")

# 2. Distribution (missing values are dropped before plotting)
plt.figure(figsize=(10, 6))
sns.histplot(bmi_column.dropna(), bins=30, kde=True)
plt.title('BMI Distribution')
plt.show()

# 3. Summary Statistics
summary_stats = bmi_column.describe()
print("\nSummary Statistics:")
print(summary_stats)

# 4. Correlation with Other Features
# Only numeric columns are correlated: DataFrame.corr() raises on text
# columns in pandas >= 2.0, and this frame is the raw CSV.
correlation_matrix = df.select_dtypes(include='number').corr()
bmi_correlation = correlation_matrix['bmi'].sort_values(ascending=False)
print("\nCorrelation with BMI:")
print(bmi_correlation)

# 5. BMI Categories
def categorize_bmi(bmi):
    """Map a BMI value to its weight-category label.

    Parameters
    ----------
    bmi : float or None
        Body-mass index. May be missing (None / NaN).

    Returns
    -------
    str or None
        'Underweight' (< 18.5), 'Normal Weight' (18.5 to < 25),
        'Overweight' (25 to < 30) or 'Obese' (>= 30); None when the BMI is
        missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # BMI would fall through to the final branch and be labelled 'Obese'.
    if pd.isna(bmi):
        return None
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 25:
        return 'Normal Weight'
    elif 25 <= bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

# Label every row with its BMI category (element-wise call of categorize_bmi)
df['bmi_category'] = df['bmi'].apply(categorize_bmi)

# Last expression of the cell: displays the new column
df['bmi_category']

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd

# Load the Data
df = pd.read_csv('training.csv')

# Correlate numeric columns only: DataFrame.corr() raises on text columns in
# pandas >= 2.0, and this frame is the raw CSV.
correlation_matrix = df.select_dtypes(include='number').corr()

# Get the top 10 columns with the highest absolute correlation with 'bmi'.
# 'bmi' itself is removed by label rather than by assuming it sorts first.
top_columns = correlation_matrix['bmi'].drop('bmi').abs().sort_values(ascending=False).head(10).index

# Plot the heatmap for the top 10 columns
plt.figure(figsize=(15, 15))
sns.heatmap(correlation_matrix[top_columns].loc[top_columns], annot=True, cmap='coolwarm', fmt='.2f')
plt.show()

print(df['bmi'].isnull().sum())

print("Top Col: ", top_columns)

# X holds the predictor columns and y the BMI to be modelled
X = df[top_columns]
y = df['bmi']

# Identify rows with missing 'bmi' values
missing_bmi_mask = y.isnull()

# Training data: only the rows where 'bmi' is known
X_train, y_train = X[~missing_bmi_mask], y[~missing_bmi_mask]

# Identify numerical columns
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

# Predictors may themselves contain gaps; fill them with the column mean
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # You can choose another strategy
])

# Use ColumnTransformer to apply the numerical transformer to the numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
    ])

# Build the model pipeline; the regressor is seeded so the imputed values
# (and the CSV written below) are reproducible across runs
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(random_state=42))])

# Train the model on the rows where 'bmi' is not missing
model.fit(X_train, y_train)

# 'bmi_filled' holds predictions for the rows whose 'bmi' is missing and NaN
# everywhere else. The column is created up front so it exists even when
# nothing is missing, and predict() is skipped in that case because it
# rejects an empty input.
df['bmi_filled'] = float('nan')
if missing_bmi_mask.any():
    predicted_bmi = model.predict(X[missing_bmi_mask])
    df.loc[missing_bmi_mask, 'bmi_filled'] = predicted_bmi
else:
    predicted_bmi = []

# Print the original 'bmi' and the new 'bmi_filled' columns
print(df[['bmi', 'bmi_filled']])

# 'merged_bmi' takes the prediction where one exists and the observed 'bmi'
# otherwise (vectorised equivalent of the former row-by-row apply)
df['merged_bmi'] = df['bmi_filled'].fillna(df['bmi'])

df.to_csv('training_bmi.csv', index=False)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import pandas as pd
import matplotlib.pyplot as plt

# Load the preprocessed training data
train_data = pd.read_csv("training_Preprocessed.csv")

# Drop the patient_id column (in place, so train_data itself is modified)
train_data.drop(columns=['patient_id'], inplace=True)

# Separate features (X) and target variable (y)
X = train_data.drop(columns=["DiagPeriodL90D"])
y = train_data["DiagPeriodL90D"]

# Handle missing values
# Only float64 columns are treated as numerical and only object columns as categorical
numerical_features = X.select_dtypes(include=['float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Impute missing values for numerical features with the column mean
numerical_imputer = SimpleImputer(strategy='mean')
X[numerical_features] = numerical_imputer.fit_transform(X[numerical_features])

# Impute missing values for categorical features with the most frequent value
# NOTE(review): assumes the preprocessed file still has at least one object
# column; verify, since this line is given whatever the selection returned.
categorical_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = categorical_imputer.fit_transform(X[categorical_features])

# One-hot encode categorical variables
X = pd.get_dummies(X, columns=categorical_features)

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
rf_classifier.fit(X_train, y_train)

# Get feature importances (one score per column of X_train)
feature_importances = rf_classifier.feature_importances_

# Get the names of features
feature_names = X_train.columns

# Create a DataFrame with feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importances
plt.figure(figsize=(10, 106))  # Adjust figure size as needed
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Importance Score')
plt.title('Feature Importances')
plt.show()

In [None]:
# List the column names of the training features.
# Relies on X_train left in the kernel by a previously executed cell.

print(X_train.columns.to_list())


In [None]:
# Count missing values per column of train_data.
# Relies on train_data left in the kernel by a previously executed cell.
null_values = train_data.isnull().sum()
print("Null values in each column:")
print(null_values)


# **LASSO Regression**:

L1 regularization (LASSO) can be used to encourage sparsity in feature weights, effectively performing feature selection.


In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# SimpleImputer is used below; importing it here keeps the cell runnable on
# its own instead of relying on an import made by an earlier cell.
from sklearn.impute import SimpleImputer
import pandas as pd

# Load the cleaned, encoded dataset
train_data = pd.read_csv("cleaned_dataset_encoded.csv")

# Drop the patient_id column
train_data = train_data.drop(['patient_id'], axis=1)

# Handle missing values
# Separate numerical (float64) and categorical (object) features
numerical_features = train_data.select_dtypes(include=['float64']).columns
categorical_features = train_data.select_dtypes(include=['object']).columns

# Impute missing values for numerical features with the column mean.
# The imputers reject an empty column selection, so each step is skipped when
# the dataset has no column of that kind (an already-encoded file may contain
# no object columns at all).
if len(numerical_features) > 0:
    numerical_imputer = SimpleImputer(strategy='mean')
    train_data[numerical_features] = numerical_imputer.fit_transform(train_data[numerical_features])

# Impute missing values for categorical features with the most frequent value
if len(categorical_features) > 0:
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    train_data[categorical_features] = categorical_imputer.fit_transform(train_data[categorical_features])

# One-hot encode categorical variables
train_data = pd.get_dummies(train_data, columns=categorical_features)

# Split the data into features (X) and target variable (y)
X = train_data.drop(columns=["DiagPeriodL90D"])
y = train_data["DiagPeriodL90D"]

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for regularization); the scaler is
# fitted on the training split only and then applied to the validation split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# Create the Lasso model
lasso = Lasso(alpha=0.04)  # Adjust the alpha parameter based on the desired level of regularization

# Fit the model to the training data
lasso.fit(X_train_scaled, y_train)

# Get the coefficients of the features (same column order as X)
feature_coefficients = pd.Series(lasso.coef_, index=X.columns)

# Get the names of non-zero coefficient features (selected features)
selected_feature_names = feature_coefficients[feature_coefficients != 0].index

# Print the names of selected features
print("Selected Features:")
print(selected_feature_names)


In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the rounded dataset.
# NOTE(review): the cell that produces this file writes it under
# "training//"; confirm the working directory makes this path resolve.
train_data = pd.read_csv("final_rounded.csv")
# train_data_rounded = train_data.round(2)

# Split the data into features (X) and target variable (y)
X = train_data.drop(columns=["DiagPeriodL90D"])
y = train_data["DiagPeriodL90D"]

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Base classifier whose feature importances drive the elimination.
# Seeded (like the other random-forest cells in this notebook) so the
# selected feature set is the same on every run.
base_classifier = RandomForestClassifier(random_state=42)

# Create the RFE model and select 10 features
n_features_to_select = 10
rfe = RFE(estimator=base_classifier, n_features_to_select=n_features_to_select)
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Get the names of the selected features (rfe.support_ is a boolean mask over the columns)
selected_feature_names = X_train.columns[rfe.support_]

# Create a DataFrame with selected features
selected_features_df = pd.DataFrame(X_train_rfe, columns=selected_feature_names)

# Print the names of selected features
print("Selected Features:")
print(selected_feature_names)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd

# Load the Data
df = pd.read_csv('cleaned_dataset_encoded.csv')

# NOTE(review): assumes every column of the encoded file is numeric — confirm,
# since corr() is called on the whole frame.
correlation_matrix = df.corr()

# Get the top 10 columns with the highest absolute correlation with the target
# 'DiagPeriodL90D'. head(11) followed by index[1:] drops the first entry, which
# is expected to be the target's correlation with itself.
top_columns = correlation_matrix['DiagPeriodL90D'].abs().sort_values(ascending=False).head(11).index[1:]

# Plot the heatmap for the top 10 columns
plt.figure(figsize=(15, 15))
sns.heatmap(correlation_matrix[top_columns].loc[top_columns], annot=True, cmap='coolwarm', fmt='.2f')
plt.show()

# Number of rows with a missing target
print(df['DiagPeriodL90D'].isnull().sum())

print("Top Col: ", top_columns)



In [None]:
import pandas as pd

In [None]:
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that directory exists; the other cells use paths relative to the notebook.
df = pd.read_csv("/home/prital/Hackathon/cleaned_dataset.csv")

In [None]:
# Bin edges: ten-year bands up to 90, then an open-ended last band so that
# ages of 100 and above still land in '90+' instead of becoming NaN.
age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, float('inf')]  #bin edges
age_labels = ['0-10', '11-20', '21-30', '31-40', '41-50','51-60','61-70', '71-80','81-90','90+']

# Right-closed intervals are what the labels describe: (0, 10] -> '0-10',
# (10, 20] -> '11-20', and so on. (With right=False, age 10 was labelled
# '11-20' and age 20 '21-30'.) include_lowest keeps age 0 in the first band.
df['age_group'] = pd.cut(df['patient_age'], bins=age_bins, labels=age_labels, include_lowest=True)

print(df[['patient_age', 'age_group']].head(20))

In [None]:
# Summary statistics of df, shown as the cell's output
df.describe()

In [None]:
import pandas as pd


# Left-closed bins (right=False below): [0, 18.5), [18.5, 25), [25, 30), [30, inf)
bmi_bins = [0, 18.5, 25, 30, float('inf')]  
bmi_labels = ['Underweight', 'Normal Weight', 'Overweight', 'Obese']  

# Rows whose 'bmi_filled' is missing get a missing category
df['bmi_category'] = pd.cut(df['bmi_filled'], bins=bmi_bins, labels=bmi_labels, right=False)

print(df[['bmi_filled', 'bmi_category']].head())


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# One-dimensional K-Means on the 'bmi_filled' column.
# NOTE(review): assumes 'bmi_filled' has no missing values in this file — confirm.
bmi_data = df[['bmi_filled']]

num_clusters = 3

kmeans = KMeans(n_clusters=num_clusters, random_state=42)

kmeans.fit(bmi_data)

cluster_centers = kmeans.cluster_centers_
cluster_labels = kmeans.labels_


# Store each row's cluster label alongside the data
df['bmi_cluster'] = cluster_labels

# Visualize
# All points are drawn on the line y = 0 and coloured by cluster; the fitted
# centres are overlaid as red crosses.
plt.scatter(df['bmi_filled'], [0] * len(df), c=df['bmi_cluster'], cmap='viridis')
plt.scatter(cluster_centers, [0] * num_clusters, marker='x', color='red', label='Cluster Centers')
plt.xlabel('BMI')
plt.ylabel('Cluster')
plt.title('K-Means Clustering of BMI')
plt.legend()
plt.show()

print(df[['bmi_filled', 'bmi_cluster']].head())


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# One-dimensional K-Means on the 'patient_age' column.
# Note: num_clusters, kmeans, cluster_centers and cluster_labels are
# overwritten here, replacing the values left by any earlier clustering cell.
age_data = df[['patient_age']]

num_clusters = 3

kmeans = KMeans(n_clusters=num_clusters, random_state=42)

kmeans.fit(age_data)

cluster_centers = kmeans.cluster_centers_
cluster_labels = kmeans.labels_

# Store each row's cluster label alongside the data
df['age_cluster'] = cluster_labels

# Visualize 
# All points are drawn on the line y = 0 and coloured by cluster; the fitted
# centres are overlaid as red crosses.
plt.scatter(df['patient_age'], [0] * len(df), c=df['age_cluster'], cmap='viridis')
plt.scatter(cluster_centers, [0] * num_clusters, marker='x', color='red', label='Cluster Centers')
plt.xlabel('Patient Age')
plt.ylabel('Cluster')
plt.title('K-Means Clustering of Patient Age')
plt.legend()
plt.show()

print(df[['patient_age', 'age_cluster']].head())


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


# One-dimensional K-Means on the 'patient_income' column.
# NOTE(review): assumes 'patient_income' has no missing values in this file — confirm.
income_data = df[['patient_income']]


num_clusters = 3

kmeans = KMeans(n_clusters=num_clusters, random_state=42)

kmeans.fit(income_data)

cluster_centers = kmeans.cluster_centers_
cluster_labels = kmeans.labels_

# Store each row's cluster label alongside the data
df['income_cluster'] = cluster_labels

# Visualize
# All points are drawn on the line y = 0 and coloured by cluster; the fitted
# centres are overlaid as red crosses.
plt.scatter(df['patient_income'], [0] * len(df), c=df['income_cluster'], cmap='viridis')
plt.scatter(cluster_centers, [0] * num_clusters, marker='x', color='red', label='Cluster Centers')
plt.xlabel('Patient Income')
plt.ylabel('Cluster')
plt.title('K-Means Clustering of Patient Income')
plt.legend()
plt.show()

print(df[['patient_income', 'income_cluster']].head())


In [None]:

# Keep only the age-cluster label and the target for the cluster-vs-target analysis
cluster_diagnosis_df = df[['age_cluster', 'DiagPeriodL90D']]


In [None]:
# Contingency table: rows are age clusters, columns are DiagPeriodL90D values,
# cells are row counts (combinations that never occur are filled with 0)
cancer_cases_in_clusters = cluster_diagnosis_df.groupby(['age_cluster', 'DiagPeriodL90D']).size().unstack(fill_value=0)
print(cancer_cases_in_clusters)


In [None]:
# Stacked bars: one bar per age cluster, one segment per DiagPeriodL90D value
cancer_cases_in_clusters.plot(kind='bar', stacked=True)
plt.title('Distribution of Cancer Cases in Clusters')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.show()


In [None]:
# Number of rows in each age cluster, indexed by cluster label
cluster_counts = cluster_diagnosis_df['age_cluster'].value_counts()
# Turn each cluster's counts into proportions of that cluster's size.
# div(..., axis=0) aligns on the cluster label. value_counts() is ordered by
# frequency, not by label, so a positional division would pair counts with the
# wrong cluster, and `series[:, None]` indexing is not supported by current pandas.
cancer_proportion_in_clusters = cancer_cases_in_clusters.div(cluster_counts, axis=0)
print(cancer_proportion_in_clusters)


In [None]:
# Stacked bars of the per-cluster proportions computed in the previous cell
cancer_proportion_in_clusters.plot(kind='bar', stacked=True)
plt.title('Proportion of Cancer Cases in Clusters')
plt.xlabel('Cluster')
plt.ylabel('Proportion')
plt.show()


In [None]:
from scipy.stats import chi2_contingency

# Chi-square test on the cluster-by-target count table; only the statistic and
# the p-value are kept, the remaining two return values are discarded
chi2, p, _, _ = chi2_contingency(cancer_cases_in_clusters)
print(f'Chi-Square Value: {chi2}, p-value: {p}')


In [None]:
# Keras

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tensorflow import keras
from tensorflow.keras import layers

# Load the modelling dataset
df = pd.read_csv('final2.csv')

target_variable = 'DiagPeriodL90D'

# 80/20 train/test split with a fixed seed
train, test = train_test_split(df, test_size=0.2, random_state=42)

#  predictors: every column except the target
predictors = list(df.columns[df.columns != target_variable])

X_train = train[predictors].values
y_train = train[target_variable].values
X_test = test[predictors].values
y_test = test[target_variable].values

# Standardise features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keras model: one hidden layer of 64 ReLU units and a single sigmoid output
model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train (20% of the training split is held out for validation during fitting)
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# predictions: sigmoid outputs thresholded at 0.5 to get class labels
predictions = model.predict(X_test)
predictions_binary = (predictions > 0.5).astype(int)

# Evaluation
accuracy = accuracy_score(y_test, predictions_binary)
print(f'Accuracy: {accuracy}')

# classification report
print(classification_report(y_test, predictions_binary))


In [None]:
import pandas as pd 

df = pd.read_csv("training//training.csv")
# head is a method: it must be called to get the first rows. Printing the
# bare attribute prints the bound method's repr instead of a preview.
print(df.head())

In [None]:
# count of missing values in each column
# The two options below remove pandas' display truncation so every column is
# listed; they are global settings and stay in effect for later cells.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
missing_values = df.isnull().sum()
print(missing_values)

In [None]:
import pandas as pd

df = pd.read_csv("training//final2.csv")

# Round every numeric column to two decimals
df_rounded = df.round(2)

# Save the rounded values for inspection / later modelling.
# index=False keeps the row index out of the file; otherwise it is written as
# an extra unnamed column that is read back as a spurious feature.
df_rounded.to_csv("training//final_rounded.csv", index=False)
print(df_rounded)


In [None]:
import nltk
# Fetch the NLTK resources named below (stop-word list, WordNet, punkt tokenizer)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
# checking if there is any row with non malignant neoplasm
# (reads the file written by the NLP preprocessing cell, so that cell must
# have been run at least once beforehand)

df = pd.read_csv("training//training_bmi_nlp.csv")

# na=False makes rows with a missing 'processed_text' count as "does not
# contain"; without it str.contains yields NaN for them and the `~` negation
# fails on the resulting object-dtype Series.
rows_without_malignant_neoplasm = ~df['processed_text'].str.contains('malignant neoplasm', case=False, na=False)

print(df[rows_without_malignant_neoplasm])

In [None]:
# NLP pipeline for description preprocessing

import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# df stays untouched until the final columns are attached;
# all intermediate NLP columns are built on the copy `dataf`
df = pd.read_csv('training//training_bmi.csv')
dataf=df.copy()

# mapping dictionary for specific replacements
# (abbreviations used in the diagnosis descriptions -> full words; keys are case-sensitive)
correction_dict = {
    'Malig': 'Malignant',
    'neoplm': 'neoplasm',
    'unsp': 'unspecified',
    'ovrlp': 'overlap'
}

# Text Cleaning
def clean_text(text):
    """Normalise one diagnosis description.

    Expands the abbreviations listed in ``correction_dict`` (whole,
    whitespace-delimited words only), removes every character that is neither
    an ASCII letter nor whitespace, and lower-cases the result.

    Parameters
    ----------
    text : str
        Raw description. Non-string input (e.g. the NaN pandas uses for a
        missing value) is treated as empty text.

    Returns
    -------
    str
        The cleaned description; ``''`` for missing input.
    """
    # A missing description arrives as a float NaN, which has no .split()
    if not isinstance(text, str):
        return ''

    # Spell correction for whole words only
    corrected_words = [correction_dict.get(word, word) for word in text.split()]
    text = ' '.join(corrected_words)

    # Remove unnecessary characters, symbols, or special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert the text to lowercase
    return text.lower()

# Clean every diagnosis description
dataf['cleaned_text'] = dataf['breast_cancer_diagnosis_desc'].apply(clean_text)

# Tokenization
def tokenize_text(text):
    """Return the result of NLTK's word_tokenize for `text`."""
    return word_tokenize(text)

dataf['tokenized_text'] = dataf['cleaned_text'].apply(tokenize_text)

# Stopword Removal
# A set is used so that the membership test in remove_stopwords is fast
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not in `stop_words`, in their original order."""
    return [word for word in tokens if word not in stop_words]

dataf['filtered_tokens'] = dataf['tokenized_text'].apply(remove_stopwords)

# Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Return a list with each token passed through the WordNet lemmatizer."""
    return [lemmatizer.lemmatize(word) for word in tokens]

dataf['lemmatized_tokens'] = dataf['filtered_tokens'].apply(lemmatize_tokens)
# Save the processed text in a new column (tokens re-joined with single spaces)
dataf['processed_text'] = dataf['lemmatized_tokens'].apply(lambda x: ' '.join(x))

# Define the words to remove
words_to_remove = ['malignant', 'neoplasm', 'site', 'breast', 'female']

# Match the listed words only as whole words. A bare 'a|b|c' alternation also
# deletes the same letters from inside longer words. re.escape keeps the
# pattern literal should a word containing regex metacharacters be added.
removal_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'

# Create a new column with the specified words removed, then collapse the
# whitespace runs left behind. The whitespace pattern is a raw string: '\s' in
# a normal string literal is an invalid escape sequence that newer Python
# versions warn about.
dataf['imp_desc'] = (
    dataf['processed_text']
    .replace(removal_pattern, '', regex=True)
    .replace(r'\s+', ' ', regex=True)
)

# Remove rows with empty strings in 'imp_desc'
dataf = dataf[dataf['imp_desc'].str.strip() != '']

# Function to remove repeated words in a block and maintain order
def remove_repeated_words(text):
    """Return `text` with each whitespace-separated word kept only at its first occurrence.

    Word order is preserved and the result is joined with single spaces.
    """
    # dict keys are unique and keep insertion order, giving an ordered de-duplication.
    first_occurrences = dict.fromkeys(text.split())
    return ' '.join(first_occurrences)

# De-duplicate the words of every 'imp_desc' entry
dataf['affected_site'] = dataf['imp_desc'].map(remove_repeated_words)

# Copy the two derived columns back onto the originally loaded frame.
# Assignment aligns on the index, so rows that were filtered out of `dataf`
# end up with NaN in `df`.
for derived_column in ('processed_text', 'affected_site'):
    df[derived_column] = dataf[derived_column]

# DataFrame with the new column
print(dataf[['breast_cancer_diagnosis_desc', 'affected_site']])
df.to_csv('training//training_bmi_nlp.csv', index=False)

In [None]:
"""The features that have low p-values in the ANOVA test are the ones that show significant 
differences across different categories of the patient_race column. These features are likely to be informative 
for predicting or imputing values in the patient_race column
"""


import pandas as pd
from scipy.stats import f_oneway

# Load the table whose numerical features are tested against patient_race.
df = pd.read_csv('training//training_bmi.csv')
# Columns submitted to the one-way ANOVA below: patient zip3/age plus the
# population, age-band, marital, income, housing, education, employment and
# race-share columns (grouping inferred from the column names).
numerical_columns = [
    'patient_zip3', 'patient_age', 'population', 'density', 'age_median', 'age_under_10',
    'age_10_to_19', 'age_20s', 'age_30s', 'age_40s', 'age_50s', 'age_60s', 'age_70s', 'age_over_80',
    'male', 'female', 'married', 'divorced', 'never_married', 'widowed', 'family_size',
    'family_dual_income', 'income_household_median', 'income_household_under_5',
    'income_household_5_to_10', 'income_household_10_to_15', 'income_household_15_to_20',
    'income_household_20_to_25', 'income_household_25_to_35', 'income_household_35_to_50',
    'income_household_50_to_75', 'income_household_75_to_100', 'income_household_100_to_150',
    'income_household_150_over', 'income_household_six_figure', 'income_individual_median',
    'home_ownership', 'housing_units', 'home_value', 'rent_median', 'rent_burden',
    'education_less_highschool', 'education_highschool', 'education_some_college',
    'education_bachelors', 'education_graduate', 'education_college_or_above',
    'education_stem_degree', 'labor_force_participation', 'unemployment_rate',
    'self_employed', 'farmer', 'race_white', 'race_black', 'race_asian', 'race_native',
    'race_pacific', 'race_other', 'race_multiple', 'hispanic'
]

# Collect one record per feature and build the results frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
anova_records = []

for feature in numerical_columns:
    # Filtering out NaN values (in either the feature or the race label)
    non_nan_data = df[[feature, 'patient_race']].dropna()

    # One array of feature values per patient_race category
    group_by_race = non_nan_data.groupby('patient_race')[feature]
    race_groups = [group.values for _, group in group_by_race]

    # ANOVA test; f_oneway needs at least two groups to compare, so features
    # that are left with fewer after dropping NaNs are skipped.
    if len(race_groups) < 2:
        continue
    anova_result = f_oneway(*race_groups)

    # results
    anova_records.append({
        'Feature': feature,
        'F-statistic': anova_result.statistic,
        'P-value': anova_result.pvalue
    })

# Explicit columns keep the frame well-formed even when no feature was tested.
anova_results = pd.DataFrame(anova_records, columns=['Feature', 'F-statistic', 'P-value'])

# Most significant features first
print(anova_results.sort_values(by='P-value'))



In [None]:
# Patient_race null value filling 

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


df = pd.read_csv("training//training.csv")

# Feature Selection
numerical_features = [
    'race_white', 'race_black', 'race_asian', 'race_native',
    'race_pacific', 'race_other', 'race_multiple', 'hispanic', 'patient_zip3'
]

categorical_feature = 'patient_race'


# DataFrame with rows containing null 'patient_race'
df_predict = df[df[categorical_feature].isnull()]

# Only rows with complete features are scored. The training rows are filtered
# the same way below, and a model fitted without NaNs may reject (or handle
# arbitrarily) NaN inputs at predict time. Rows with missing features keep a
# null 'patient_race_filled'.
predictable_rows = df_predict.dropna(subset=numerical_features)

# Dropping rows where the target variable or any feature has null values
training_data = df.dropna(subset=[categorical_feature] + numerical_features)

print("Before Prediction:")
print("Total nulls: ", df['patient_race'].isnull().sum())
print("count")
print(df['patient_race'].value_counts())

# 'patient_race_filled' starts as a copy of 'patient_race'. It is created
# unconditionally so that the column and the output file exist even when there
# is nothing to impute.
df['patient_race_filled'] = df['patient_race']

if not predictable_rows.empty:
    X = training_data[numerical_features]
    y = training_data[categorical_feature]

    model = RandomForestClassifier(random_state=42)

    model.fit(X, y)

    # Code to find accuracy
    # X_train, X_test, y_train, y_test = train_test_split(training_data[numerical_features], training_data[categorical_feature], test_size=0.2, shuffle=True)
    # model = RandomForestClassifier(random_state=42)
    # model.fit(X_train, y_train)
    # y_pred = model.predict(X_test)
    # accuracy = accuracy_score(y_test, y_pred)
    # print(f"Accuracy: {accuracy}")

    # Prediction
    # Replacing null values in 'patient_race_filled' with the predicted values
    df.loc[predictable_rows.index, 'patient_race_filled'] = model.predict(
        predictable_rows[numerical_features]
    )

print("\nAfter Prediction:")
print("Total nulls: ", df['patient_race_filled'].isnull().sum())
print("count")
print(df['patient_race_filled'].value_counts())

# NOTE(review): this save includes the index (an extra unnamed first column on
# re-read), unlike the other saves in this notebook. Left unchanged because the
# readers of this file are not visible here — confirm before adding index=False.
df.to_csv("training//training_race_filled.csv")


In [None]:
from sklearn.preprocessing import LabelEncoder


import pandas as pd

data = pd.read_csv("training//cleaned_dataset.csv")

# Isolate the column to one-hot encode (kept as a one-column frame)
df = data.loc[:, ['patient_gender']]

# Build the indicator columns; they are named with the 'patient' prefix
df_encoded = pd.get_dummies(df, columns=['patient_gender'], prefix=['patient'])

# Place the indicator columns to the right of the original columns
data_encoded = pd.concat([data, df_encoded], axis='columns')

print(data_encoded)

# Write the widened table without the index
data_encoded.to_csv("training//cleaned_dataset_encoded.csv", index=False)


In [None]:
import pandas as pd


# NOTE: this cell reads and overwrites the same file, so running it twice adds
# the indicator columns a second time.
data = pd.read_csv("training//cleaned_dataset_encoded.csv")

# Isolate the column to one-hot encode (kept as a one-column frame)
df = data.loc[:, ['patient_race_filled']]

# Build the indicator columns; they are named with the 'race' prefix
df_encoded = pd.get_dummies(df, columns=['patient_race_filled'], prefix=['race'])

# Place the indicator columns to the right of the original columns
data_encoded = pd.concat([data, df_encoded], axis='columns')

print(data_encoded)

# Write the widened table back without the index
data_encoded.to_csv("training//cleaned_dataset_encoded.csv", index=False)


In [None]:
import pandas as pd

# NOTE: this cell reads and overwrites the same file, so running it twice adds
# the indicator columns a second time.
data = pd.read_csv("training//cleaned_dataset_encoded.csv")

# Isolate the column to one-hot encode (kept as a one-column frame)
df = data.loc[:, ['payer_type']]

# Build the indicator columns; they are named with the 'payer' prefix
df_encoded = pd.get_dummies(df, columns=['payer_type'], prefix=['payer'])

# Place the indicator columns to the right of the original columns
data_encoded = pd.concat([data, df_encoded], axis='columns')

print(data_encoded)

# Write the widened table back without the index
data_encoded.to_csv("training//cleaned_dataset_encoded.csv", index=False)

In [None]:
# Trying TFIDF vectorizer for affected_site column encoding but not used for final data

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

data = pd.read_csv("training//cleaned_dataset.csv")

# Fill missing sites by assignment instead of a chained
# `data['affected_site'].fillna(..., inplace=True)`. The chained inplace call
# operates on a temporary Series and is not guaranteed to update `data` (or a
# Series taken from it beforehand) once pandas Copy-on-Write is active, which
# would leave NaN documents that TfidfVectorizer cannot process.
data['affected_site'] = data['affected_site'].fillna('unspecified')
df = data['affected_site']

print(df)

tfidf_vectorizer = TfidfVectorizer()

# Sparse TF-IDF matrix with one row per description
df_encoded = tfidf_vectorizer.fit_transform(df)
print(df_encoded)


In [None]:
# Frequency encoding diagnosis codes

data = pd.read_csv("training//final.csv")

# Filling missing values.
# Assigned back instead of a chained `fillna(..., inplace=True)`: the chained
# form acts on a temporary Series and silently stops updating `data` under
# pandas Copy-on-Write, which would leave NaN codes (and NaN encodings).
data['breast_cancer_diagnosis_code'] = data['breast_cancer_diagnosis_code'].fillna('0')

# column for frequency encoding
df = data['breast_cancer_diagnosis_code']

# Relative frequency (share of rows) of every code
frequency_encoding = df.value_counts(normalize=True).to_dict()

# Replace each code by its relative frequency
df_encoded = df.map(frequency_encoding)

data['breast_cancer_diagnosis_code_encoded'] = df_encoded

data.to_csv("training//final1.csv", index=False)
print(data)


In [None]:
# Frequency encoding diagnosis codes

data = pd.read_csv("training//final1.csv")

# Fill missing values.
# Assigned back instead of a chained `fillna(..., inplace=True)`: the chained
# form acts on a temporary Series and silently stops updating `data` under
# pandas Copy-on-Write, which would leave NaN codes (and NaN encodings).
data['metastatic_cancer_diagnosis_code'] = data['metastatic_cancer_diagnosis_code'].fillna('0')

# column for frequency encoding
df = data['metastatic_cancer_diagnosis_code']

# Relative frequency (share of rows) of every code
frequency_encoding = df.value_counts(normalize=True).to_dict()

# Replace each code by its relative frequency
df_encoded = df.map(frequency_encoding)

data['metastatic_cancer_diagnosis_code_encoded'] = df_encoded

data.to_csv("training//final2.csv", index=False)
print(data)


In [None]:
# Feature Selection

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

train_data = pd.read_csv("training//final_rounded.csv")

# Drop stray index columns ("Unnamed: 0", ...) that appear when a CSV was saved
# with its index. They are row counters, not features, and would otherwise
# compete for selection below.
train_data = train_data.loc[:, ~train_data.columns.str.startswith("Unnamed")]

# Split the data into features (X) and target variable (y)
X = train_data.drop(columns=["DiagPeriodL90D"])
y = train_data["DiagPeriodL90D"]

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a base classifier.
# Seeded so the selected feature set is reproducible between runs (the split
# above is already seeded with the same value).
base_classifier = RandomForestClassifier(random_state=42)

# Create the RFE model and select 10 features
n_features_to_select = 10
rfe = RFE(estimator=base_classifier, n_features_to_select=n_features_to_select)
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Get the names of the selected features
selected_feature_names = X_train.columns[rfe.support_]

# Create a DataFrame with selected features.
# Keep X_train's index so the rows stay aligned with y_train.
selected_features_df = pd.DataFrame(
    X_train_rfe, columns=selected_feature_names, index=X_train.index
)

# Print the names of selected features
print("Selected Features:")
print(selected_feature_names)