In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Step 1: Preprocessing Function

def preprocess_input_data(df, treat_na=True, treat_outliers=True, train=True,
                          lower_quantile=0.01, upper_quantile=0.99):

    def drop_missing_columns(df, keep_threshold=0.3, drop_threshold=0.5):
        missing_percentage = df.isnull().mean()

        columns_to_drop = missing_percentage[missing_percentage >
                                             drop_threshold].index.tolist()
        df = df.drop(columns=columns_to_drop)

        for col in missing_percentage.index:
            if keep_threshold < missing_percentage[col] <= drop_threshold:
                user_input = input(f"Column '{col}' has {missing_percentage[col]:.2%} missing values. "
                                   f"Do you want to keep (K), drop (D), or treat (T) this column? ").strip().lower()
                if user_input == 'd':
                    df = df.drop(columns=[col])
                elif user_input == 't':
                    if df[col].dtype in ['float64', 'int64']:
                        df[col] = df[col].fillna(df[col].median())
                    else:
                        df[col] = df[col].fillna(df[col].mode()[0])

        return df

    # Step 1: Drop columns with excessive missing values
    df = drop_missing_columns(df)

    # Step 2: Identify numeric and categorical columns
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

    # Step 3: Normalize string categorical data
    for col in cat_cols:
        df[col] = df[col].str.lower()  # Convert to lowercase for consistency

    # Step 4: Handle missing values if specified
    if treat_na:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median(
            numeric_only=True))  # Fill numeric with median

        for col in cat_cols:
            if col in df.columns and not df[col].isnull().all():
                # Fill categorical with mode
                df[col] = df[col].fillna(df[col].mode()[0])

    # Step 5: Identify and map binary columns
    binary_columns = []

    for col in num_cols:
        if df[col].nunique() == 2:
            binary_columns.append(col)
            df[col] = df[col].map(lambda x: 1 if x ==
                                  1 else 0)  # Map to 1 and 0

    for col in cat_cols:
        if df[col].nunique() == 2:
            binary_columns.append(col)
            df[col] = df[col].map(lambda x: 1 if x ==
                                  df[col].unique()[1] else 0)

    # Step 6: Outlier treatment if specified
    if treat_outliers:
        def cap_outliers(series, lower_q, upper_q):
            lower_bound = series.quantile(lower_q)
            upper_bound = series.quantile(upper_q)
            # Cap the outliers
            return series.clip(lower=lower_bound, upper=upper_bound)

        for col in num_cols:
            if col not in binary_columns:  # Skip binary columns
                df[col] = cap_outliers(df[col], lower_quantile, upper_quantile)

    # Step 7: Scaling numeric data
    scaler = StandardScaler()  # Initialize the scaler
    df[num_cols] = scaler.fit_transform(df[num_cols])  # Scale numeric columns

    # Return processed DataFrame, scaler, and binary column names
    return df, scaler, binary_columns

In [27]:
# Step 2: Model Training Function

from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import NotFittedError
import joblib

def train_model(df_train_X, df_train_Y):

    def save_model(model, scaler, model_path='./model/model.pkl', scaler_path='./model/scaler.pkl'):
        model_dir = os.path.dirname(model_path)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
            print(f"Directory '{model_dir}' created.")

        joblib.dump(model, model_path)
        joblib.dump(scaler, scaler_path)
        print(f"Model saved to {model_path} \nScaler saved to {scaler_path}")

    # Preprocess training data
    df_train_clean, scaler, _ = preprocess_input_data(
        df_train_X, treat_na=True, treat_outliers=True, lower_quantile=0.05, upper_quantile=0.95
    )

    # Extract feature matrix and target labels
    X_train = df_train_clean.iloc[:, 1:]  # Exclude ID column
    # Ensure that 'target' column exists in df_train_Y
    y_train = df_train_Y['target']

    # Initialize and train logistic regression model
    model = LogisticRegression()

    try:
        model.fit(X_train, y_train)
        print("Model trained successfully!")

        # Save the model and scaler
        save_model(model, scaler)
        
    except Exception as e:
        print(f"An error occurred during model training: {e}")
        raise
        
    return model, scaler

In [28]:
def predictor(input_df, model, scaler):

    # Check if the input DataFrame is empty
    if input_df.empty:
        raise ValueError("Input DataFrame is empty.")

    # Ensure the ID column is present
    if 0 not in input_df.columns:
        raise ValueError(
            "Input DataFrame must contain an ID column at index 0.")

    # Extract the ID column
    IDs = input_df.iloc[:, 0]  # Assuming the first column is ID

    # Preprocess the input data
    input_clean, scaler, _ = preprocess_input_data(
        input_df, treat_na=True, treat_outliers=True, train=False)

    # Check if there are any features to predict
    if input_clean.shape[1] < 2:
        raise ValueError(
            "Input DataFrame must have at least one feature column for prediction.")

    # Make predictions using the trained model
    predictions = model.predict(
        input_clean.iloc[:, 1:])  # Exclude the ID column

    # Return a DataFrame with the ID and predicted values
    return pd.DataFrame({
        'ID': IDs,
        'Predicted': predictions
    })

In [29]:
df_train_X = pd.read_csv("./train/Train_60/X_train_Data_Input.csv")
df_train_Y = pd.read_csv("./train/Train_60/Y_train_Data_Target.csv")
df_test_X = pd.read_csv('./test/Test_20/X_Test_Data_Input.csv')
df_test_Y = pd.read_csv("./test/Test_20/Y_Test_Data_Target.csv")

In [30]:
# # Rename columns for df_train_X and df_test_X
# df_train_X.columns = ['ID'] + [str(i) for i in range(df_train_X.shape[1] - 1)]
# df_test_X.columns = ['ID'] + [str(i) for i in range(df_test_X.shape[1] - 1)]

# Rename columns for df_train_X and df_test_X to integers
df_train_X.columns = range(df_train_X.shape[1])  # 0, 1, 2, ...
df_test_X.columns = range(df_test_X.shape[1])

In [31]:
df_train_X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
0,ad1a67e4cbddc767a3456b0d94299b9e,2.0,2495,3726.0,0.678139,0.701403,-0.007468,0.43419,-0.015603,0.606265,,0,0,0,0,0.001351,0.00339,0.0,0,0.0,0,0,0
1,7246d2f76ac0c217ec25e72ea5f014cb,0.0,2495,3454.0,0.45258,0.701403,-0.007468,1.554998,-0.015574,0.329946,,0,0,0,0,0.001351,0.00339,0.0,0,0.0,0,0,0
2,22ba388e7dd14c13342c49e75fc29dda,2.0,2495,4543.0,-1.577453,-1.42954,-0.007469,-0.407939,-0.015607,-0.774979,,1,1,1,1,0.001351,0.00339,0.0,0,0.0,0,0,0
3,59f9b981472d97342587fb3e6392aeb1,0.0,211,59.0,,,,-0.407939,-0.015607,-0.774979,,0,0,0,0,,0.00339,0.0,0,1.0,0,0,0
4,f6317cf7ecf126859804eddff279aead,0.0,718,950.0,-2.028572,-1.855728,,-0.407939,-0.015607,-0.774979,,0,0,0,0,,0.00339,0.0,0,0.0,0,0,0


In [32]:
# # Preprocess training data
# df_train_clean, scaler, binary_columns = preprocess_input_data(
#     df_train_X, treat_na=True, treat_outliers=True, lower_quantile=0.05, upper_quantile=0.95
# )

# print("\nCleaned-Normalised-Training Data:")
# df_train_clean.head()

In [33]:
# # Preprocess test data using same scaler
# df_test_clean, _, _ = preprocess_input_data(
#     df_test_X, treat_na=True, treat_outliers=True, lower_quantile=0.05, upper_quantile=0.95
# )

# print("\nCleaned-Normalised-Test Data:")
# df_test_clean.head()

In [35]:
# Train the model
model, scaler = train_model(df_train_X, df_train_Y)

Model trained successfully!
Model saved to ./model/model.pkl 
Scaler saved to ./model/scaler.pkl


In [36]:
# Make predictions on the test data
predicted_df = predictor(df_test_X, model, scaler)
predicted_df.head()

Unnamed: 0,ID,Predicted
0,07cf2025382f6325b316e128b1b90999,0
1,eb972eb3a1f8d0d1a13f45e7c07d37d4,0
2,ee35e164b3ddc25a9f40243b81ad290d,0
3,28229ccd7bad7dd83324a4175a7e0531,0
4,2f94873da2c332d28f111742818e0fbb,0


In [37]:
predicted_df['Predicted'].value_counts()

Predicted
0    242756
1     18956
Name: count, dtype: int64

In [38]:
from sklearn import metrics

In [40]:
df_test_Y.head()

Unnamed: 0,ID,target
0,07cf2025382f6325b316e128b1b90999,0
1,eb972eb3a1f8d0d1a13f45e7c07d37d4,0
2,ee35e164b3ddc25a9f40243b81ad290d,0
3,28229ccd7bad7dd83324a4175a7e0531,0
4,2f94873da2c332d28f111742818e0fbb,0


In [41]:
# Classification Report on test data
report_test = metrics.classification_report(
    y_true=df_test_Y['target'], y_pred=predicted_df['Predicted'])
print(report_test)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98    237034
           1       0.85      0.66      0.74     24678

    accuracy                           0.96    261712
   macro avg       0.91      0.82      0.86    261712
weighted avg       0.95      0.96      0.95    261712



1. ** Input Validation**: Add checks to ensure that the input DataFrames(`df_train_Y` and `df_test_X`) have the correct structure and necessary columns.

2. ** User Interaction in `drop_missing_columns`** : Consider making user prompts optional or implement a logging mechanism for smoother user experience, especially for batch processing.

3. ** Error Logging**: Use the `logging` module instead of printing errors to facilitate easier debugging in larger applications.

4. ** Model Persistence**: Save the trained model using libraries like `joblib` or `pickle` to reuse it without needing to retrain.

5. ** Function Flexibility**: Parameterize additional aspects of preprocessing or model training to accommodate various datasets and use cases.

6. ** Enhance Documentation**: Ensure all functions are well-documented with clear descriptions of parameters and return values to improve usability.

7. ** Cross-Validation**: Implement cross-validation in the model training process to evaluate model performance more robustly.

8. ** Hyperparameter Tuning**: Consider adding a mechanism for hyperparameter tuning(e.g., using GridSearchCV) to optimize model performance.

9. ** Output Verification**: Include assertions or checks to verify the shape and content of outputs at various stages, ensuring consistency.

10. ** Testing**: Develop unit tests for each function to verify their functionality and correctness, enhancing reliability.

In [4]:
def cleanFeatures(df):

    rows, cols = df.shape[0], df.shape[1]
    
    new_col_names = [f"c{x}" for x in range(cols-1)]
    df.columns = [df.columns[0]] + new_col_names



    return df

In [None]:
cleanFeatures(df_train_X).head()


In [None]:
df_train_X = cleanFeatures(df_train_X)
df_train_X