# Packages

In [None]:
# Import necessary libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

# Import Chi-squared contingency test for statistical analysis
from scipy.stats import chi2_contingency

# Preprocessing
from sklearn.preprocessing import MinMaxScaler


# Import classifiers (XGBoost, LightGBM and scikit-learn models)
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB


from sklearn.model_selection import GridSearchCV  # Import GridSearchCV


# Model Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score 



# Import os for directory and file operations
import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Load Data

In [None]:
# Read the competition's training and test tables into DataFrames
train_path = '/kaggle/input/playground-series-s3e22/train.csv'
test_path = '/kaggle/input/playground-series-s3e22/test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Show at most 29 columns and 29 rows when a DataFrame is displayed
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 29)

# Data Analysis

In [None]:
# Preview the first rows of the training set
train_df.head()

In [None]:
# Preview the first rows of the test set
test_df.head()

In [None]:
# Print the column dtypes and non-null counts of the training set
train_df.info()

In [None]:
# Print the column dtypes and non-null counts of the test set
test_df.info()

In [None]:
# Display brief descriptive statistics for the 'outcome' column
# (the target; it is still stored as text labels at this point in the notebook)
print(train_df['outcome'].describe())

In [None]:
# Plot the class balance of the target as a pie chart
target_dist = train_df['outcome'].value_counts()

# Create the figure BEFORE drawing on it. Calling plt.figure() after plt.pie()
# opens a second, empty figure and leaves the pie at the default size.
plt.figure(figsize=(5, 10))

# One explode offset per class, so the call does not depend on the target
# having exactly three distinct values.
plt.pie(target_dist, shadow=True, explode=[.05] * len(target_dist), autopct='%.1f%%')

plt.title('Target distribution', size=18)
# Legend entries follow the order of the value counts; placed outside the axes
plt.legend(target_dist.index, loc='upper left', bbox_to_anchor=(1, 1), fontsize=12)

plt.show()

In [None]:
# Histograms of the training set's numeric columns; the first column of
# 'train_df' is left out of the selection
cols_to_display = train_df.columns[1:]
hist_kwargs = dict(figsize=(15, 10), color="blue", edgecolor="black")
train_df.loc[:, cols_to_display].hist(**hist_kwargs)
plt.suptitle("Numeric columns (train)", fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# Histograms of the test set's numeric columns; the first column of
# 'test_df' is left out of the selection
cols_to_display = test_df.columns[1:]
hist_kwargs = dict(figsize=(15, 10), color="blue", edgecolor="black")
test_df.loc[:, cols_to_display].hist(**hist_kwargs)
plt.suptitle("Numeric columns (test)", fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# Compare how each categorical feature is distributed in the training set
# versus the test set: one row of pie charts per feature.
sns.set_palette('rainbow')

# Object-dtype feature columns of 'train_df' (the target is excluded)
num = train_df.drop(columns='outcome').select_dtypes(include=['object']).columns

# Stack both datasets and tag every row with its origin in a 'Source' column
tagged_parts = [train_df[num].assign(Source = 'train'), test_df[num].assign(Source = 'test')]
df = pd.concat(tagged_parts, axis=0, ignore_index = True)

# Grid with one row per categorical column: train on the left, test on the right
n_features = len(num)
fig, axes = plt.subplots(n_features, 2, figsize = (12, n_features * 4.2))


def _draw_category_pie(ax, counts, title):
    """Draw an exploded pie of 'counts' on 'ax' with its legend outside the axes."""
    ax.pie(counts, shadow=True, explode=[.05]*len(counts), autopct='%.1f%%')
    # Each index entry is indexed with [0] to obtain the category label
    labels = [category[0] for category in counts.index]
    ax.legend(labels, loc='upper left', bbox_to_anchor=(1, 1), fontsize=12)
    ax.set(xlabel = '', ylabel = '')
    ax.set_title(title, fontsize = 9)


for i, col in enumerate(num):
    # Category frequencies of this column within each dataset
    train_dist = df.loc[df.Source == 'train', [col]].value_counts()
    test_dist = df.loc[df.Source == 'test', [col]].value_counts()

    _draw_category_pie(axes[i, 0], train_dist, f'Train {col}')
    _draw_category_pie(axes[i, 1], test_dist, f'Test {col}')


plt.suptitle(f"\nDistribution analysis - categorical features\n",fontsize = 15, y = 0.9, x = 0.57)
plt.show()

In [None]:
# Pairwise correlations between the numeric columns of 'train_df'
corr_matrix = train_df.select_dtypes(include=np.number).corr()
# Upper triangle of the matrix (diagonal included) used as the heatmap mask,
# which leaves the lower triangle visible
mask = np.triu(corr_matrix)

fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(
    corr_matrix,
    mask=mask,
    cmap='Blues',
    square=True,
    linewidths=1,
    linecolor='#fafafa',
    ax=ax,
)
ax.set_title('\nCorrelation matrix\n', fontsize=17)
plt.show()

In [None]:
# Numeric columns of the training set
numerical_cols = train_df.select_dtypes(include=np.number)
# Pairwise correlation table of those columns, displayed as the cell output
corr_table = numerical_cols.corr()
corr_table

In [None]:
# Perform the Chi-Square test for independence between each categorical
# feature of 'train_df' and the 'outcome' variable.

# Categorical (object-dtype) feature columns. The target itself is excluded:
# testing 'outcome' against 'outcome' says nothing about any feature.
categorical_cols = train_df.drop(columns='outcome').select_dtypes("object").columns

# Significance threshold: a p-value below it is reported as 'Passed'
threshold = 0.05

# (column, test result) pairs, in column order
results = []

for column in categorical_cols:
    # Cross-tabulate the feature's categories against the outcome classes
    contingency_table = pd.crosstab(train_df[column], train_df['outcome'])

    # Only the p-value (second returned item) is needed
    _, p, _, _ = chi2_contingency(contingency_table)

    results.append((column, 'Passed' if p < threshold else 'Failed'))

# Print the results as a two-column table, green for 'Passed' and red for 'Failed'
print(f'{"Column":<25} | Test result')
print('----------------------------------------')
for column, test_result in results:
    colour = '\033[32m' if test_result == 'Passed' else '\033[31m'
    print(f'{column:<25} |', colour + test_result + '\033[0m')

# Data Preprocessing

### Training data preprocessing

In [None]:
# Count the missing values in each column of the 'train_df' DataFrame and
# list the columns from most to fewest missing entries
train_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Names of the 'train_df' columns that contain at least one missing value,
# in DataFrame column order
columns = [name for name in train_df.columns if train_df[name].isnull().any()]

# Show each of those columns together with its distinct values
for i in columns:
    print(i)
    print(train_df[i].unique())

In [None]:
# Display the list of training columns that contain missing values
columns

In [None]:
# Fill in the missing values of each listed column with that column's mode
# (its most frequent value; the first one if several are tied).
# The filled column is assigned back to 'train_df' instead of calling
# fillna(..., inplace=True) on the 'train_df[i]' selection: that chained
# in-place form is deprecated in pandas and no longer updates the DataFrame
# once Copy-on-Write is enabled.
for i in columns:
    train_df[i] = train_df[i].fillna(train_df[i].mode().iloc[0])

# Display the remaining missing-value count per column
train_df.isnull().sum()

#### Encoding

In [None]:
# Convert the yes/no columns to 1 and 0
bool_col = ['surgery', 'surgical_lesion', 'cp_data']
yes_no_codes = {'yes': 1, 'no': 0}
for i in bool_col:
    # Assign the result back rather than using replace(..., inplace=True) on the
    # 'train_df[i]' selection, which is deprecated chained in-place mutation.
    # infer_objects() makes the conversion to a numeric dtype explicit instead
    # of relying on replace() silently downcasting the object column.
    train_df[i] = train_df[i].replace(yes_no_codes).infer_objects()

In [None]:
# Map the 'age' categories to numerical values
age_mapping = {'young': 1, 'adult': 2}
# Assigned back (no chained inplace=True); infer_objects() makes the switch to
# a numeric dtype explicit instead of relying on silent downcasting in replace()
train_df['age'] = train_df['age'].replace(age_mapping).infer_objects()

In [None]:
# Mapping dictionaries (category label -> numeric code) for the other
# categorical columns.

# Codes that count up from 0 in the listed order
temp = {label: code for code, label in enumerate(('cold', 'cool', 'normal', 'warm'))}
ppulse = {label: code for code, label in enumerate(('absent', 'reduced', 'normal', 'increased'))}
dist = {label: code for code, label in enumerate(('none', 'slight', 'moderate', 'severe'))}
tube = {label: code for code, label in enumerate(('none', 'slight', 'significant'))}
rect = {
    label: code
    for code, label in enumerate(('absent', 'decreased', 'normal', 'increased', 'serosanguious'))
}
look = {label: code for code, label in enumerate(('clear', 'cloudy', 'serosanguious'))}

# Codes that count up from 1 in the listed order
memb = {
    label: code
    for code, label in enumerate(
        ('normal_pink', 'pale_pink', 'bright_pink', 'bright_red', 'pale_cyanotic', 'dark_cyanotic'),
        start=1,
    )
}
pain = {
    label: code
    for code, label in enumerate(
        ('mild_pain', 'depressed', 'moderate', 'alert', 'severe_pain', 'extreme_pain', 'slight'),
        start=1,
    )
}
peri = {
    label: code
    for code, label in enumerate(
        ('normal', 'hypermotile', 'hypomotile', 'distend_small', 'absent'),
        start=1,
    )
}
abdm = {
    label: code
    for code, label in enumerate(
        ('firm', 'other', 'normal', 'distend_small', 'distend_large'),
        start=1,
    )
}

# Codes that are not consecutive integers, written out explicitly
time = {'less_3_sec': 1, '3': 3, 'more_3_sec': 5}
refl = {'none': 0, 'less_1_liter': 0.5, 'slight': 1, 'more_1_liter': 2}

In [None]:
# Map the 'outcome' target labels to integer class codes
outcome = {'died': 0, 'euthanized': 1, 'lived': 2}
# Assigned back (no chained inplace=True); infer_objects() makes the switch to
# a numeric dtype explicit instead of relying on silent downcasting in replace()
train_df['outcome'] = train_df['outcome'].replace(outcome).infer_objects()

In [None]:
# Gather every mapping dictionary into one 1-D object array:
# 'temp' first, followed by the dictionaries listed in 'objects'
objects = [ppulse, memb, time, pain, peri, dist, tube, refl, rect, abdm, look]
obj_arr = np.array([temp, *objects], dtype=object)
obj_arr

In [None]:
# Encode the columns listed in 'columns' of the 'train_df' DataFrame with the
# mapping dictionaries held in 'obj_arr'.
# NOTE(review): the pairing is positional — the k-th name in 'columns' (the
# columns that had missing values) is encoded with the k-th dictionary in
# 'obj_arr'. This presumes both follow the same column order; confirm against
# the data.
if len(columns) != len(obj_arr):
    # A length mismatch means at least one column would be skipped or encoded
    # with another column's mapping, so stop instead of mis-encoding silently.
    raise ValueError(
        f'{len(columns)} columns to encode but {len(obj_arr)} mapping dictionaries'
    )
for i, mapping in zip(columns, obj_arr):
    # Assign back rather than replace(..., inplace=True) on the 'train_df[i]'
    # selection (deprecated chained in-place mutation). infer_objects() makes
    # the conversion to a numeric dtype explicit.
    train_df[i] = train_df[i].replace(mapping).infer_objects()

### Test data preprocessing

#### Imputation

In [None]:
# Count the missing values in each column of the 'test_df' DataFrame and
# list the columns from most to fewest missing entries
test_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Names of the 'test_df' columns that contain at least one missing value,
# in DataFrame column order (this rebinds 'columns')
columns = [name for name in test_df.columns if test_df[name].isnull().any()]

# Show each of those columns together with its distinct values
for i in columns:
    print(i)
    print(test_df[i].unique())

In [None]:
# Display the list of test columns that contain missing values
columns

In [None]:
# Fill in the missing values of each listed column with that column's mode,
# computed on the test set itself (the first mode if several are tied).
# The filled column is assigned back to 'test_df' instead of calling
# fillna(..., inplace=True) on the 'test_df[i]' selection: that chained
# in-place form is deprecated in pandas and no longer updates the DataFrame
# once Copy-on-Write is enabled.
for i in columns:
    test_df[i] = test_df[i].fillna(test_df[i].mode().iloc[0])

# Display the remaining missing-value count per column
test_df.isnull().sum()

#### Encoding

In [None]:
# Convert the yes/no columns to 1 and 0
bool_col = ['surgery', 'surgical_lesion', 'cp_data']
yes_no_codes = {'yes': 1, 'no': 0}
for i in bool_col:
    # Assign the result back rather than using replace(..., inplace=True) on the
    # 'test_df[i]' selection, which is deprecated chained in-place mutation.
    # infer_objects() makes the conversion to a numeric dtype explicit instead
    # of relying on replace() silently downcasting the object column.
    test_df[i] = test_df[i].replace(yes_no_codes).infer_objects()

In [None]:
# Map the 'age' categories to numerical values
age_mapping = {'young': 1, 'adult': 2}
# Assigned back (no chained inplace=True); infer_objects() makes the switch to
# a numeric dtype explicit instead of relying on silent downcasting in replace()
test_df['age'] = test_df['age'].replace(age_mapping).infer_objects()

In [None]:
# Encode the columns listed in 'columns' of the 'test_df' DataFrame with the
# mapping dictionaries held in 'obj_arr'.
# NOTE(review): the pairing is positional — the k-th name in 'columns' (here
# the test columns that had missing values) is encoded with the k-th
# dictionary in 'obj_arr'. This presumes the test set has missing values in
# the same columns, in the same order, as the mappings; confirm against the
# data.
if len(columns) != len(obj_arr):
    # A length mismatch means at least one column would be skipped or encoded
    # with another column's mapping, so stop instead of mis-encoding silently.
    raise ValueError(
        f'{len(columns)} columns to encode but {len(obj_arr)} mapping dictionaries'
    )
for i, mapping in zip(columns, obj_arr):
    # Assign back rather than replace(..., inplace=True) on the 'test_df[i]'
    # selection (deprecated chained in-place mutation). infer_objects() makes
    # the conversion to a numeric dtype explicit.
    test_df[i] = test_df[i].replace(mapping).infer_objects()

# Modeling

In [None]:
# Separate the training data into the feature matrix and the target vector.
# Columns removed from the features ('surgical_lesion' is deliberately kept):
train_excluded_cols = ['id', 'hospital_number', 'outcome', 'rectal_exam_feces']
X_train = train_df.drop(columns=train_excluded_cols)
y_train = train_df['outcome']

In [None]:
# Build the test feature matrix by removing the same non-target columns that
# are removed from the training features ('surgical_lesion' is deliberately kept)
test_excluded_cols = ['id', 'hospital_number', 'rectal_exam_feces']
X_test = test_df.drop(columns=test_excluded_cols)

In [None]:
# Keyword arguments passed to each classifier's constructor, one dict per model

# XGBoost
best_hyperparams_xgb = dict(
    eta=0.2734096744203229,
    n_estimators=251,
    max_depth=1,
    reg_lambda=1.3536521735953297,
    subsample=0.9372043032806799,
    min_child_weight=5,
    colsample_bytree=0.32973413695986586,
    objective='multi:softmax',
)

# Multi-layer perceptron
best_hyperparams_mlp = dict(
    hidden_layer_sizes=(8,),
    max_iter=146,
    learning_rate_init=0.09732455260435911,
)

# Decision tree
best_hyperparams_dt = dict(
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
)

# LightGBM
best_hyperparams_lgbm = dict(
    n_estimators=146,
    learning_rate=0.09732455260435911,
    max_depth=8,
    num_leaves=973,
    reg_lambda=5.558974411222393,
    reg_alpha=5.94913795893992,
    subsample=0.057493821911338956,
    colsample_bytree=0.7716515051686431,
    min_child_samples=46,
    min_child_weight=7,
    objective='multiclass',
    metric='multi_logloss',
    boosting_type='gbdt',
)

# Support vector machine
best_hyperparams_svm = dict(C=1.0, kernel='rbf', gamma='scale')

# k-nearest neighbours
best_hyperparams_knn = dict(n_neighbors=5, weights='uniform')

# AdaBoost
best_hyperparams_adaboost = dict(n_estimators=100, learning_rate=1.0)

In [None]:
# Pair each classifier class with its hyperparameter dict, then instantiate
# them in that order
model_specs = (
    (XGBClassifier, best_hyperparams_xgb),
    (MLPClassifier, best_hyperparams_mlp),
    (DecisionTreeClassifier, best_hyperparams_dt),
    (LGBMClassifier, best_hyperparams_lgbm),
    (SVC, best_hyperparams_svm),
    (KNeighborsClassifier, best_hyperparams_knn),
    (AdaBoostClassifier, best_hyperparams_adaboost),
)
models = [estimator_cls(**params) for estimator_cls, params in model_specs]

In [None]:
# Fit every candidate classifier on the full training features and target
for model in models:
    model.fit(X_train, y_train)

In [None]:
# Report each fitted model's micro-averaged F1 score on the training data
# itself (the same rows the models were fitted on)
for model in models:
    train_predictions = model.predict(X_train)
    train_score = f1_score(y_train, train_predictions, average='micro')
    model_name = model.__class__.__name__
    print(f'{model_name} micro F1 training score: {train_score:.3f}')

In [None]:
# Create a fresh LightGBM classifier with the LightGBM hyperparameters;
# this is the model used for the final predictions
lgbm_classifier = LGBMClassifier(**best_hyperparams_lgbm)

In [None]:
# Fit the LightGBM classifier on the training data
# (no grid search is run here; the hyperparameters are fixed)
lgbm_classifier.fit(X_train, y_train)

# Making Predictions

In [None]:
# Make predictions on the test data with the fitted LightGBM classifier
y_pred = lgbm_classifier.predict(X_test)

In [None]:
# Wrap the predicted class codes in a single-column DataFrame named 'outcome'.
# The constructor already names the column, so no separate rename is needed.
# The former replace(outcome) step was dropped: at this point 'outcome' maps
# label -> code, so it never matched the integer predictions, and it relied
# on deprecated chained in-place mutation.
rdf = pd.DataFrame(y_pred, columns=['outcome'])

In [None]:
# Inverse target mapping: integer class code -> outcome label
# (rebinds 'outcome', which previously mapped label -> code)
outcome = dict(enumerate(('died', 'euthanized', 'lived')))

In [None]:
# Convert the predicted class codes back to outcome labels: round, cast to
# int, then look each code up in the code -> label 'outcome' dictionary
rdf['outcome'] = rdf['outcome'].round().astype(int).map(outcome)

In [None]:
# Build the submission table: the id of each test row next to its predicted
# outcome label.
# The ids are read from 'test_df' instead of a hard-coded
# np.arange(1235, 2059), so the table stays correct if the test file's ids or
# row count differ. The index is reset so the ids line up positionally with
# the rows of 'rdf'.
key = test_df[['id']].reset_index(drop=True)
submission = pd.concat([key, rdf], axis = 1)

# Show a reproducible random sample of 15 submission rows
submission.sample(15, random_state=42)

In [None]:
# Write the submission to the Kaggle working directory, with a header row and
# without the DataFrame index
submission_path = '/kaggle/working/submission.csv'
submission.to_csv(submission_path, header = True, index = False)
# Confirm that the file was written
print("The submission has been successfully saved.")