In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import joblib
import logging

In [3]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [5]:
# Data Preprocessing Phase
pd.set_option('display.max_columns', None)
d1 = pd.read_csv("Toddler Autism dataset July 2018.csv")
print(d1.head())

print(d1.isna().sum())

# Convert Age_Mons to years
d1["Age_Mons"] = (d1["Age_Mons"]/12).astype(int)
print(d1.head())

# Drop Case_No and Qchat-10-Score
d1 = d1.drop(["Case_No", "Qchat-10-Score"], axis=1)
print(d1.head())


   Case_No  A1  A2  A3  A4  A5  A6  A7  A8  A9  A10  Age_Mons  Qchat-10-Score  \
0        1   0   0   0   0   0   0   1   1   0    1        28               3   
1        2   1   1   0   0   0   1   1   0   0    0        36               4   
2        3   1   0   0   0   0   0   1   1   0    1        36               4   
3        4   1   1   1   1   1   1   1   1   1    1        24              10   
4        5   1   1   0   1   1   1   1   1   1    1        20               9   

  Sex       Ethnicity Jaundice Family_mem_with_ASD Who completed the test  \
0   f  middle eastern      yes                  no          family member   
1   m  White European      yes                  no          family member   
2   m  middle eastern      yes                  no          family member   
3   m        Hispanic       no                  no          family member   
4   f  White European       no                 yes          family member   

  Class/ASD Traits   
0                No  
1     

In [6]:
# Encode categorical columns using LabelEncoder
categorical_cols = ['Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD', 'Who completed the test', 'Class/ASD Traits ']
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    d1[col] = le.fit_transform(d1[col]).astype(np.int8)
    le_dict[col] = le
    print(f"Encoded column {col}: {dict(zip(le.classes_, le.transform(le.classes_)))}")


Encoded column Sex: {'f': 0, 'm': 1}
Encoded column Ethnicity: {'Hispanic': 0, 'Latino': 1, 'Native Indian': 2, 'Others': 3, 'Pacifica': 4, 'White European': 5, 'asian': 6, 'black': 7, 'middle eastern': 8, 'mixed': 9, 'south asian': 10}
Encoded column Jaundice: {'no': 0, 'yes': 1}
Encoded column Family_mem_with_ASD: {'no': 0, 'yes': 1}
Encoded column Who completed the test: {'Health Care Professional': 0, 'Health care professional': 1, 'Others': 2, 'Self': 3, 'family member': 4}
Encoded column Class/ASD Traits : {'No': 0, 'Yes': 1}


In [7]:
# Save LabelEncoder mappings
joblib.dump(le_dict, 'autism_label_encoders.pkl')
logger.info("Saved LabelEncoder mappings to models/autism/label_encoders.pkl")

INFO:__main__:Saved LabelEncoder mappings to models/autism/label_encoders.pkl


In [8]:
# Split features and target
X = d1.drop("Class/ASD Traits ", axis=1)
y = d1["Class/ASD Traits "]
logger.info(f"Features: {X.columns.tolist()}")

# Initialize StandardScaler
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
logger.info("Applied StandardScaler to features")

INFO:__main__:Features: ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons', 'Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD', 'Who completed the test']
INFO:__main__:Applied StandardScaler to features


In [9]:
# Save scaler
scaler_filename = 'autism_scaler.pkl'
joblib.dump(sc, scaler_filename)
logger.info(f"Saved scaler to {scaler_filename}")


INFO:__main__:Saved scaler to autism_scaler.pkl


In [10]:
# Split the scaled data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=69, stratify=y
)
logger.info("Split data into 80% train and 20% test")

# Define training and evaluation function
def train_model(model, X_train, y_train, X_test, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix is: ")
    print(matrix)
    print("\n\nScore Table is: ")
    score_df = pd.DataFrame([[accuracy, precision, recall, f1, roc_auc]], 
                            columns=["accuracy", "precision", "recall", "f1", "roc_auc"])
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    print(f"{model_name} 5-Fold CV F1 Scores: {cv_scores}")
    print(f"{model_name} Mean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    
    return score_df

# Train and tune RandomForestClassifier
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf = RandomForestClassifier(random_state=69, class_weight='balanced')
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_
print(f"Best RandomForest parameters: {grid_search.best_params_}")

# Evaluate RandomForest
rf_result = train_model(best_rf, X_train, y_train, X_test, y_test, "RandomForest")
rf_result.index = ["RandomForest"]
print(rf_result)

INFO:__main__:Split data into 80% train and 20% test


Best RandomForest parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Confusion Matrix is: 
[[ 61   4]
 [  4 142]]


Score Table is: 
RandomForest 5-Fold CV F1 Scores: [0.9787234  0.97413793 0.96610169 0.95726496 0.98712446]
RandomForest Mean CV F1: 0.9727 (+/- 0.0206)
              accuracy  precision    recall        f1   roc_auc
RandomForest  0.962085   0.972603  0.972603  0.972603  0.993572


In [20]:
# Train and evaluate LogisticRegression as a backup
lr = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=69)
lr_result = train_model(lr, X_train, y_train, X_test, y_test, "LogisticRegression")
lr_result.index = ["LogisticRegression"]
print(lr_result)

# Combine results
results = pd.concat([rf_result, lr_result])
print("\nModel Comparison:\n")
print(results)

#Save the best model (RandomForest)
model_filename = 'autism_rf_model.pkl'
joblib.dump(best_rf, model_filename)
print(f"Model saved to {model_filename}")

# Save LogisticRegression as a backup
# backup_model_filename = 'models/autism/autism_lr_model.pkl'
# joblib.dump(lr, backup_model_filename)
# print(f"Saved LogisticRegression model to {backup_model_filename}")

# Feature importance (for RandomForest)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_rf.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:\n")
print(feature_importance)

Confusion Matrix is: 
[[ 65   0]
 [  0 146]]


Score Table is: 
LogisticRegression 5-Fold CV F1 Scores: [1. 1. 1. 1. 1.]
LogisticRegression Mean CV F1: 1.0000 (+/- 0.0000)
                    accuracy  precision  recall   f1  roc_auc
LogisticRegression       1.0        1.0     1.0  1.0      1.0

Model Comparison:

                    accuracy  precision    recall        f1   roc_auc
RandomForest        0.962085   0.972603  0.972603  0.972603  0.993572
LogisticRegression  1.000000   1.000000  1.000000  1.000000  1.000000
Model saved to autism_rf_model.pkl

Feature Importance:

                   Feature  Importance
8                       A9    0.188722
4                       A5    0.159678
5                       A6    0.110346
6                       A7    0.099607
0                       A1    0.092827
1                       A2    0.081727
3                       A4    0.067003
7                       A8    0.047977
12               Ethnicity    0.038654
2                       A3 

In [13]:
best_rf

In [None]:
# Load LabelEncoder mappings, scaler, and model
try:
    le_dict = joblib.load('autism_label_encoders.pkl')
    scaler = joblib.load('autism_scaler.pkl')
    best_rf = joblib.load('autism_rf_model.pkl')
    logger.info("Loaded LabelEncoder mappings, scaler, and RandomForest model")
except FileNotFoundError as e:
    logger.error(f"Error loading files: {str(e)}")
    raise e

# Define function to preprocess input data
def preprocess_input(input_data, le_dict, scaler):
    # Convert input to DataFrame if it's a dictionary
    if isinstance(input_data, dict):
        input_df = pd.DataFrame([input_data])
    else:
        input_df = input_data.copy()

    # Convert Age_Mons to years
    if 'Age_Mons' in input_df.columns:
        input_df['Age_Mons'] = (input_df['Age_Mons'] / 12).astype(int)

    # Encode categorical columns
    categorical_cols = ['Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD', 'Who completed the test']
    for col in categorical_cols:
        if col in input_df.columns:
            le = le_dict[col]
            # Handle unseen labels by mapping to a default
            input_df[col] = input_df[col].apply(lambda x: x if x in le.classes_ else le.classes_[0])
            input_df[col] = le.transform(input_df[col]).astype(np.int8)

    # Ensure all expected features are present
    expected_features = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 
                        'Age_Mons', 'Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD', 
                        'Who completed the test']
    for feature in expected_features:
        if feature not in input_df.columns:
            input_df[feature] = 0  # Default value for missing features

    # Reorder columns to match training data
    input_df = input_df[expected_features]

    # Scale the features
    input_scaled = scaler.transform(input_df)
    return input_scaled

# Define function to predict probabilities
def predict_autism_proba(input_data, model, le_dict, scaler):
    # Preprocess the input
    input_scaled = preprocess_input(input_data, le_dict, scaler)
    
    # Predict probabilities
    proba = model.predict_proba(input_scaled)
    
    # Create result DataFrame
    result = pd.DataFrame({
        'No Autism (Prob)': proba[:, 0],
        'Autism (Prob)': proba[:, 1]
    })
    return result

sample_input = {
    'A1': 0, 'A2': 0, 'A3': 0, 'A4': 0, 'A5': 0,
    'A6': 0, 'A7': 1, 'A8': 0, 'A9': 0, 'A10': 1,
    'Age_Mons': 36,
    'Sex': 'm',
    'Ethnicity': 'asian',
    'Jaundice': 'no',
    'Family_mem_with_ASD': 'no',
    'Who completed the test': 'family member'
}

# Predict probabilities for the sample input
try:
    predictions = predict_autism_proba(sample_input, best_rf, le_dict, scaler)
    logger.info("Prediction completed successfully")
    print("\nPrediction Probabilities:")
    print(predictions)
except Exception as e:
    logger.error(f"Error during prediction: {str(e)}")
    raise e

INFO:__main__:Loaded LabelEncoder mappings, scaler, and RandomForest model
INFO:__main__:Prediction completed successfully



Prediction Probabilities:
   No Autism (Prob)  Autism (Prob)
0           0.99971        0.00029


In [23]:
# Load LabelEncoder mappings, scaler, and model
try:
    le_dict = joblib.load('autism_label_encoders.pkl')
    scaler = joblib.load('autism_scaler.pkl')
    best_rf = joblib.load('autism_rf_model.pkl')
    logger.info("Loaded LabelEncoder mappings, scaler, and RandomForest model")
except FileNotFoundError as e:
    logger.error(f"Error loading files: {str(e)}")
    raise e

# Define function to preprocess input data
def preprocess_input(input_data, le_dict, scaler):
    # Convert input to DataFrame if it's a dictionary
    if isinstance(input_data, dict):
        input_df = pd.DataFrame([input_data])
    else:
        input_df = input_data.copy()

    # Convert Age_Mons to years
    if 'Age_Mons' in input_df.columns:
        input_df['Age_Mons'] = (input_df['Age_Mons'] / 12).astype(int)

    # Encode categorical columns
    categorical_cols = ['Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD', 'Who completed the test']
    for col in categorical_cols:
        if col in input_df.columns:
            le = le_dict[col]
            # Handle unseen labels by mapping to a default
            input_df[col] = input_df[col].apply(lambda x: x if x in le.classes_ else le.classes_[0])
            input_df[col] = le.transform(input_df[col]).astype(np.int8)

    # Ensure all expected features are present
    expected_features = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 
                        'Age_Mons', 'Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD', 
                        'Who completed the test']
    for feature in expected_features:
        if feature not in input_df.columns:
            input_df[feature] = 0  # Default value for missing features

    # Reorder columns to match training data
    input_df = input_df[expected_features]

    # Scale the features
    input_scaled = scaler.transform(input_df)
    return input_scaled

# Define function to predict probabilities
def predict_autism_proba(input_data, model, le_dict, scaler):
    # Preprocess the input
    input_scaled = preprocess_input(input_data, le_dict, scaler)
    
    # Predict probabilities
    proba = model.predict_proba(input_scaled)
    
    # Create result DataFrame
    result = pd.DataFrame({
        'No Autism (Prob)': proba[:, 0],
        'Autism (Prob)': proba[:, 1]
    })
    return result

sample_input = {
    'A1': 1, 'A2': 1, 'A3': 0, 'A4': 0, 'A5': 1, 
    'A6': 1, 'A7': 0, 'A8': 1, 'A9': 0, 'A10': 1,
    'Age_Mons': 24,  # Age in months
    'Sex': 'm',  # 'm' or 'f'
    'Ethnicity': 'White European',
    'Jaundice': 'yes',  # 'yes' or 'no'
    'Family_mem_with_ASD': 'no',  # 'yes' or 'no'
    'Who completed the test': 'Parent'
}
# Predict probabilities for the sample input
try:
    predictions = predict_autism_proba(sample_input, best_rf, le_dict, scaler)
    logger.info("Prediction completed successfully")
    print("\nPrediction Probabilities:")
    print(predictions)
except Exception as e:
    logger.error(f"Error during prediction: {str(e)}")
    raise e


INFO:__main__:Loaded LabelEncoder mappings, scaler, and RandomForest model
INFO:__main__:Prediction completed successfully



Prediction Probabilities:
   No Autism (Prob)  Autism (Prob)
0              0.03           0.97
