<a href="https://colab.research.google.com/github/hellojohnkim/mmai894/blob/main/A2NN_4_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from category_encoders import BinaryEncoder
import tensorflow as tf
from hyperopt import hp
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from hyperopt import STATUS_OK, fmin, tpe, Trials, atpe, space_eval
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder

pd.set_option('display.max_columns', None)

In [None]:
# Run this cell if the data-loading cell fails with a permission error.
# It mounts Google Drive at /content/drive, the root of the CSV paths used in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Dataset locations: the three CSV files sit side by side in one Drive folder.
_data_dir = '/content/drive/MyDrive/MMAI24_Chester/894_team/DrivenData_Competition/notebooks/Naevin'
training_set_values_file_path = _data_dir + '/training_set_values.csv'
training_set_labels_file_path = _data_dir + '/training_set_label.csv'
test_set_file_path = _data_dir + '/test_set.csv'

In [None]:
# Read the training features, training labels and test features, in that order.
features_df, labels_df, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        training_set_values_file_path,
        training_set_labels_file_path,
        test_set_file_path,
    )
)

In [None]:
# Class balance of the target: number of rows per status_group value.
labels_df['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [None]:
# Attach each row's label to its features by joining on the shared 'id' column
# (pandas' default inner join: ids present in only one frame are dropped).
data_df = features_df.merge(labels_df, on='id')

In [None]:
def calculate_well_age(row):
    """Return the well's age in years at the time it was recorded.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'construction_year' and a 'date_recorded' value that
        exposes a ``.year`` attribute.

    Returns
    -------
    The recorded year minus the construction year, or ``np.nan`` when the
    construction year is not greater than zero (including NaN).
    """
    built = row['construction_year']
    # Anything that is not a positive year counts as "unknown".
    if not built > 0:
        return np.nan
    return row['date_recorded'].year - built

# Parse the recording date once, then derive every date-based feature from it.
data_df['date_recorded'] = pd.to_datetime(data_df['date_recorded'])
recorded = data_df['date_recorded']
first_recorded = recorded.min()

data_df['year_recorded'] = recorded.dt.year
data_df['month_recorded'] = recorded.dt.month
data_df['day_recorded'] = recorded.dt.day
# Whole days elapsed since the earliest record in this frame.
data_df['days_since_recorded'] = (recorded - first_recorded).dt.days
# Row-wise because calculate_well_age needs both the date and construction_year.
data_df['well_age'] = data_df.apply(calculate_well_age, axis=1)

# The raw timestamp is no longer needed once the derived columns exist.
data_df = data_df.drop(columns='date_recorded')

def encode_cyclical_features(df, cols):
    """Add sine/cosine encodings for each column in ``cols``.

    For every column ``c`` two new columns, ``c_sin`` and ``c_cos``, are
    written into ``df``. The period used is the largest value of that column
    found in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    for name in cols:
        period = df[name].max()
        angle = 2 * np.pi * df[name] / period
        df[name + '_sin'] = np.sin(angle)
        df[name + '_cos'] = np.cos(angle)
    return df

cyclical_cols = ['year_recorded', 'month_recorded', 'day_recorded']

# Add the *_sin / *_cos columns for each date part.
data_df = encode_cyclical_features(data_df, cyclical_cols)
# Raw month/day are replaced by their encodings; raw year_recorded is kept.
data_df = data_df.drop(columns=['month_recorded', 'day_recorded'])

In [None]:
# Columns listed as mixed-type are cast to plain strings so each holds a single
# Python type. Note that astype(str) turns missing values into the literal
# string 'nan', so they become their own category.
mixed_type_columns = ['funder', 'installer', 'subvillage',
                      'public_meeting', 'scheme_management',
                      'scheme_name', 'permit']

for col in mixed_type_columns:
    data_df[col] = data_df[col].astype(str)

# 'id' is an identifier, not a predictor; 'status_group' is the target.
data_df.drop('id', axis = 1, inplace=True)
X = data_df.drop('status_group', axis=1)
y = data_df['status_group']

# Select every numeric dtype rather than only int64/float64. Narrower integer
# columns (e.g. `.dt.year` is int32 on pandas 2.x) would otherwise match neither
# list and be silently dropped by the ColumnTransformer below.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Numeric columns: fill gaps from the 5 nearest rows, then scale robustly.
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5, )),
    ('scaler', RobustScaler())
])

# Categorical columns: fill gaps with the constant 'missing', then binary-encode.
# The step is named 'onehot' but holds a BinaryEncoder; the name is kept because
# later cells look the step up by that key.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', BinaryEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit on the full training frame and transform it in one step.
X_transformed = preprocessor.fit_transform(X)

# Rebuild column names in the transformer's output order (numeric block first,
# then the encoded categorical block).
transformed_numeric_features = numeric_features
transformed_categorical_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)

all_transformed_features = list(transformed_numeric_features) + list(transformed_categorical_features)
X = pd.DataFrame(X_transformed, columns=all_transformed_features)

In [None]:
# Map the string class labels to integer codes. `le` is kept so predictions
# can be mapped back to the original labels later.
le = LabelEncoder()
y = pd.DataFrame(le.fit_transform(y))

In [None]:
#DO NOT RUN
# Hyperopt search space. Each key is one hyperparameter of the candidate network.
space = {
    # Width of the first Dense layer.
    'units_input': hp.choice('units_input', [32, 64, 128, 256, 512]),
    # Number of hidden Dense+Dropout pairs stacked after the first layer.
    'num_layers': hp.choice('num_layers', [4, 5, 6, 7, 8, 9, 10, 15, 30]),
    # Width shared by all hidden Dense layers.
    'units_hidden': hp.choice('units_hidden', [32, 64, 128, 256, 512]),
    # Dropout rate, sampled uniformly from [0.0, 0.5].
    'dropout': hp.uniform('dropout', 0.0, 0.5),
    # Learning rate, sampled log-uniformly between 1e-4 and 1e-2.
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-4), np.log(1e-2)),
    'batch_size': hp.choice('batch_size', [32, 64, 128]),
    # Upper bound on training epochs.
    'epochs': hp.choice('epochs', [10, 20, 30, 40, 50])
}

def objective(params):
    """Train one candidate network and report its score to hyperopt.

    Parameters
    ----------
    params : dict
        One sample from the search space. Keys read: 'units_input',
        'num_layers', 'units_hidden', 'dropout', 'learning_rate',
        'batch_size', 'epochs'.

    Returns
    -------
    dict
        ``{'loss': -best_validation_accuracy, 'status': STATUS_OK}``. The
        accuracy is negated because hyperopt minimises the loss.

    Notes
    -----
    Reads the module-level ``X`` and ``y``. Keras holds out the last 20% of
    rows (``validation_split=0.2``) for validation.
    """
    model = Sequential()
    model.add(Dense(params['units_input'], activation='relu', input_shape=(X.shape[1],)))

    for _ in range(params['num_layers']):
        model.add(Dense(params['units_hidden'], activation='relu'))
        model.add(Dropout(params['dropout']))

    model.add(Dense(len(np.unique(y)), activation='softmax'))

    # Build the optimizer explicitly: passing the string 'adam' would use
    # Keras' default learning rate and ignore the sampled value.
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['learning_rate'])
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Stop a trial early once validation loss has not improved for 6 epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=6, mode='min', verbose=False)
    history = model.fit(X, y, epochs=params['epochs'], batch_size=params['batch_size'], validation_split=0.2, callbacks=[early_stopping], verbose=False)

    # Score the trial by its best epoch on the held-out split.
    validation_accuracy = max(history.history['val_accuracy'])

    return {'loss': -validation_accuracy, 'status': STATUS_OK}

# Collects every evaluated configuration together with its loss.
trials = Trials()

# Tree-structured Parzen Estimator search: 150 evaluations of `objective`.
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=150,
    trials=trials,
    show_progressbar=True,
    verbose=True,
)

# space_eval turns fmin's result back into the actual hyperparameter values.
best_config = space_eval(space, best)
print("\nOverall Best hyperparameters:")
print(best_config)

100%|██████████| 150/150 [2:25:04<00:00, 58.03s/trial, best loss: -0.7865319848060608]

Overall Best hyperparameters:
{'batch_size': 128, 'dropout': 0.2675233085503283, 'epochs': 10, 'learning_rate': 0.00013539980858461663, 'num_layers': 6, 'units_hidden': 512, 'units_input': 512}


In [None]:
# Hyperparameters reported as "Overall Best" by the recorded search run.
bestv2 = dict(
    batch_size=128,
    dropout=0.2675233085503283,
    epochs=10,
    learning_rate=0.00013539980858461663,
    num_layers=6,
    units_hidden=512,
    units_input=512,
)


In [None]:
# An alternative hyperparameter set kept for reference.
bestv1 = dict(
    batch_size=32,
    dropout=0.18789225010444455,
    epochs=30,
    learning_rate=0.0012051040665326852,
    num_layers=5,
    units_hidden=64,
    units_input=512,
)

In [None]:
# Hand-picked parameter sets for trials 4 to 9 (two per team member).
# Every set keeps units_input at 512 and varies the remaining values.

# Tu
model_1_params = dict(
    batch_size=256, dropout=0.5, epochs=5, learning_rate=0.001,
    num_layers=3, units_hidden=256, units_input=512,
)

# Tu
model_2_params = dict(
    batch_size=224, dropout=0.4, epochs=6, learning_rate=0.0005,
    num_layers=4, units_hidden=384, units_input=512,
)

# Susan
model_3_params = dict(
    batch_size=192, dropout=0.35, epochs=7, learning_rate=0.0003,
    num_layers=5, units_hidden=448, units_input=512,
)

# Susan
model_4_params = dict(
    batch_size=160, dropout=0.32, epochs=8, learning_rate=0.0002,
    num_layers=6, units_hidden=480, units_input=512,
)

# Naevin
model_5_params = dict(
    batch_size=144, dropout=0.3, epochs=9, learning_rate=0.00018,
    num_layers=6, units_hidden=496, units_input=512,
)

# Naevin
model_6_params = dict(
    batch_size=136, dropout=0.28, epochs=10, learning_rate=0.00014,
    num_layers=6, units_hidden=504, units_input=512,
)


In [None]:
from tensorflow.keras.layers import Dense, BatchNormalization
# DEEP LEARNING MODEL
# Final network: one input Dense layer, `num_layers` Dense+Dropout pairs and a
# softmax output with one unit per distinct label.
best_params = dict(
    batch_size=128,
    dropout=0.2675233085503283,
    epochs=10,
    learning_rate=0.00013539980858461663,
    num_layers=6,
    units_hidden=512,
    units_input=512,
)

n_features = X.shape[1]
n_classes = len(np.unique(y))

network_layers = [Dense(best_params['units_input'], activation='relu', input_shape=(n_features,))]
for _ in range(best_params['num_layers']):
    network_layers.append(Dense(best_params['units_hidden'], activation='relu'))
    network_layers.append(Dropout(best_params['dropout']))
network_layers.append(Dense(n_classes, activation='softmax'))

model = Sequential(network_layers)

# NOTE(review): best_params['learning_rate'] is not passed to the optimizer;
# the string 'adam' selects Adam with Keras' default learning rate.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# validation_split=0.2 holds out the last 20% of rows; they are never trained on.
history = model.fit(
    X,
    y,
    epochs=best_params['epochs'],
    batch_size=best_params['batch_size'],
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate the model
# NOTE: X/y include the 80% of rows the model was fitted on, so this figure is
# largely training accuracy and overstates how well the model generalises.
loss, accuracy = model.evaluate(X, y)
print(f'Accuracy: {accuracy*100:.2f}%')

# Held-out estimate: accuracy on the validation_split rows after the final
# epoch, as recorded by Keras during fit(). These rows were never trained on.
val_accuracy = history.history['val_accuracy'][-1]
print(f'Validation accuracy: {val_accuracy*100:.2f}%')

Accuracy: 84.39%


In [None]:
# Predict the classes: one row of class probabilities per sample in X.
y_pred = model.predict(X)
# The most probable class index becomes the predicted label.
y_pred_classes = y_pred.argmax(axis=1)



In [None]:
from sklearn.metrics import confusion_matrix, f1_score, classification_report
import numpy as np
import matplotlib.pyplot as plt
# Confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y, y_pred_classes)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Weighted F1: per-class F1 averaged with weights proportional to class support.
f1 = f1_score(y_true=y, y_pred=y_pred_classes, average='weighted')
print(f"F1 Score: {f1:.2f}")

Confusion Matrix:
[[29174   328  2757]
 [ 2109  1660   548]
 [ 3337   193 19294]]
F1 Score: 0.84


In [None]:
# Per-class precision / recall / F1; classes appear as their integer codes.
report = classification_report(y, y_pred_classes)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87     32259
           1       0.76      0.38      0.51      4317
           2       0.85      0.85      0.85     22824

    accuracy                           0.84     59400
   macro avg       0.82      0.71      0.74     59400
weighted avg       0.84      0.84      0.84     59400



In [None]:
#test = pd.read_csv('test_set.csv')

In [None]:
def calculate_well_age(row):
    """Age of a well, in years, on the date it was recorded.

    ``row`` must provide 'construction_year' and a 'date_recorded' value with a
    ``.year`` attribute. Returns ``np.nan`` when the construction year is not
    greater than zero (including NaN); otherwise the recorded year minus the
    construction year.
    """
    year_built = row['construction_year']
    has_known_year = year_built > 0
    return row['date_recorded'].year - year_built if has_known_year else np.nan

# Parse the recording date and derive the date-based features for the test set.
test['date_recorded'] = pd.to_datetime(test['date_recorded'])
test['year_recorded'] = test['date_recorded'].dt.year
test['month_recorded'] = test['date_recorded'].dt.month
test['day_recorded'] = test['date_recorded'].dt.day

# Count days from the TRAINING set's earliest record, not the test set's own
# minimum. The model and the fitted scaler saw days_since_recorded measured from
# the training minimum, so a different zero point here would shift the feature.
train_first_recorded = pd.to_datetime(features_df['date_recorded']).min()
test['days_since_recorded'] = (test['date_recorded'] - train_first_recorded).dt.days
test['well_age'] = test.apply(calculate_well_age, axis=1)

# The raw timestamp is not a model input.
test.drop('date_recorded', axis=1, inplace=True)

def encode_cyclical_features(df, cols):
    """Write ``<col>_sin`` and ``<col>_cos`` columns into ``df`` for each column.

    The angle is ``2 * pi * value / max_value``, where ``max_value`` is the
    column's maximum within ``df``. ``df`` is modified in place and returned.
    """
    two_pi = 2 * np.pi
    for column in cols:
        values = df[column]
        radians = two_pi * values / values.max()
        df[column + '_sin'] = np.sin(radians)
        df[column + '_cos'] = np.cos(radians)
    return df

cyclical_cols = ['year_recorded', 'month_recorded', 'day_recorded']

# Add the *_sin / *_cos columns for each date part of the test set.
test = encode_cyclical_features(test, cyclical_cols)
# Raw month/day are replaced by their encodings; raw year_recorded is kept.
test = test.drop(columns=['month_recorded', 'day_recorded'])

In [None]:
# Cast the mixed-type columns to strings in the test frame.
mixed_type_columns = ['funder', 'installer', 'subvillage',
                      'public_meeting', 'scheme_management',
                      'scheme_name', 'permit']

test = test.astype({col: str for col in mixed_type_columns})

# 'id' is an identifier, not a model input.
test = test.drop(columns='id')

In [None]:
# Apply the already-fitted preprocessor to the test features (transform only, no refit).
X_transformed = preprocessor.transform(test)

# Column names in the transformer's output order: numeric block, then encoded categoricals.
transformed_numeric_features = numeric_features
fitted_encoder = preprocessor.named_transformers_['cat']['onehot']
transformed_categorical_features = fitted_encoder.get_feature_names_out(categorical_features)

all_transformed_features = [*transformed_numeric_features, *transformed_categorical_features]
# `test` now holds the model-ready matrix; the raw test columns are gone from this name.
test = pd.DataFrame(X_transformed, columns=all_transformed_features)

In [None]:
# Class probabilities from the softmax output layer for the transformed test rows.
predictions = model.predict(test, batch_size=128)



In [None]:
# Reload the raw test CSV: `test` was overwritten with the transformed feature
# matrix and its 'id' column was dropped, but the submission needs the ids.
test = pd.read_csv(test_set_file_path)

In [None]:
# Pick the most probable class index for each test row.
predicted_classes = np.argmax(predictions, axis=1)

In [None]:
# Map integer class codes back to the original status_group strings using the
# LabelEncoder fitted on the training labels.
predicted_classes = le.inverse_transform(predicted_classes)

In [None]:
# Submission frame: one row per test id with its predicted label.
# Assumes the reloaded CSV has the same row order as the frame that was predicted on.
final = pd.DataFrame({'id': test.id, 'status_group': predicted_classes})

In [None]:
# Save the submission to CSV without the index column.
# Edit the path below to point at your own folder before running, then submit the file.
final.to_csv('/content/drive/MyDrive/MMAI24_Chester/894_team/DrivenData_Competition/notebooks/Naevin/DeepLearning Submissions/SubmissionFormatDeep10.csv', index=False)

In [None]:
#final.to_csv('submission3.csv', index=False)

In [None]:
#final

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
...,...,...
14845,39307,non functional
14846,18990,functional
14847,28749,functional
14848,33492,functional
