In [1]:
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV files below can be read.
drive.mount('/content/drive')

# Read the three CSV files in one pass; the unpacking order matches the
# order of the paths: generic dataset, training split, test split.
data, train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/diabetes+dataset.csv',
        '/content/drive/My Drive/train.csv',
        '/content/drive/My Drive/test.csv',
    )
)




Mounted at /content/drive


In [2]:
# Training target is the 'Response' column; the features are every other
# column except the row identifier 'id'.
y = train_data['Response']
X = train_data.drop(columns=['id', 'Response'])

# Keep the test ids aside (they are needed to build the submission files)
# and use the remaining test columns as features.
test_ids = test_data['id']
X_test = test_data.drop(columns=['id'])

# Preview the first rows of both raw frames, one after the other.
print(train_data.head(), test_data.head(), sep='\n')

   id  Gender  Age  Driving_License  Region_Code  Previously_Insured  \
0   0       0   30                1         28.0                   1   
1   1       0   26                1          6.0                   1   
2   2       0   40                1          0.0                   0   
3   3       0   25                1          8.0                   1   
4   4       1   26                1         28.0                   1   

   Vehicle_Age  Vehicle_Damage  Annual_Premium  Policy_Sales_Channel  Vintage  \
0            2               0         60954.0                 152.0      127   
1            2               0         24532.0                 152.0      216   
2            1               1          2630.0                  47.0      220   
3            2               0         44259.0                 152.0      223   
4            2               0         33615.0                 152.0      194   

   Response  
0         0  
1         0  
2         0  
3         0  
4         

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers to compare. Each entry is a dict holding a
# human-readable "name" and the (unfitted) scikit-learn estimator under
# "model"; tree-based estimators get a fixed random_state of 42.
models = [
    {"name": label, "model": estimator}
    for label, estimator in (
        ("Decision Tree Gini", DecisionTreeClassifier(criterion='gini', random_state=42)),
        ("Decision Tree Entropy", DecisionTreeClassifier(criterion='entropy', random_state=42)),
        ("Random Forest 10 Estimators", RandomForestClassifier(n_estimators=10, random_state=42)),
        ("Random Forest 50 Estimators", RandomForestClassifier(n_estimators=50, random_state=42)),
        ("KNN K=5", KNeighborsClassifier(n_neighbors=5)),
        ("KNN K=10", KNeighborsClassifier(n_neighbors=10)),
    )
]


In [7]:
import os
import random

from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Repeated hold-out evaluation.
# For each round: draw a random validation fraction and split seed, then for
# every entry in `models` draw one hyperparameter at random, fit on the
# training part, score on the validation part, predict on X_test and write a
# submission CSV. A per-round summary and one full summary are written as text.

# --- Configuration ---------------------------------------------------------
N_TESTS = 10                                 # number of random split rounds
SEED = 42                                    # seed for the `random` module
OUTPUT_DIR = '/content/drive/MyDrive/tests'  # submissions and summaries go here

# Seed the generator that drives split sizes, split seeds and hyperparameter
# picks, so re-running this cell reproduces the same sequence of experiments.
random.seed(SEED)

# DataFrame.to_csv and open() do not create missing folders.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Results storage
results = []          # one dict per (round, model)
summary_chunks = []   # text fragments, joined into `full_summary` at the end

# Perform N_TESTS rounds for each model
for test_num in range(1, N_TESTS + 1):
    print(f"Running Test #{test_num}")

    # Validation fraction drawn uniformly between 0.2 and 0.4
    test_size = random.uniform(0.2, 0.4)

    # Seed for this round's train/validation split (1..100 inclusive)
    random_state = random.randint(1, 100)

    # Split the dataset
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Concise summary for this round
    concise_summary = f"Test #{test_num}\nModel\tParameters\tValidation Accuracy\n"

    for m in models:
        # Work on a fresh copy so the estimators declared in `models` are not
        # mutated by set_params()/fit() and keep their declared parameters.
        model = clone(m["model"])

        # Randomly override one hyperparameter per model family.
        # NOTE(review): because of this override the label in m["name"] (and
        # therefore the submission file name) may not match the value actually
        # used, e.g. "Random Forest 10 Estimators" can run with 100 trees.
        # The recorded "Parameters" are authoritative.
        if m["name"].startswith("Random Forest"):
            model.set_params(n_estimators=random.choice([10, 50, 100]))
        elif m["name"].startswith("Decision Tree"):
            model.set_params(criterion=random.choice(["gini", "entropy"]))
        elif m["name"].startswith("KNN"):
            model.set_params(n_neighbors=random.choice([5, 10, 15]))

        # Train, then score on the held-out validation part
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_val, model.predict(X_val))

        # Predict on the test features and save them for submission
        predictions = model.predict(X_test)
        file_stub = m["name"].replace(" ", "_")
        submission_path = f'{OUTPUT_DIR}/{test_num}_{file_stub}_Test.csv'
        submission = pd.DataFrame({'id': test_ids, 'Response': predictions})
        submission.to_csv(submission_path, index=False)

        # Fetch the parameters once; reused by both summaries and `results`
        params = model.get_params()

        # Append one tab-separated line to the per-round summary
        concise_summary += f"{m['name']}\t{params}\t{accuracy}\n"

        # Collect the detailed record for the full summary
        summary_chunks.append(
            f"Test #{test_num} Full Summary\n"
            f"Model: {m['name']}\n"
            f"Parameters: {params}\n"
            f"Validation Accuracy: {accuracy:.2f}\n"
            f"Test Size: {round(test_size, 2)}\n"
            f"Random State: {random_state}\n"
            f"Submission File: {submission_path}\n\n"
        )

        # Store results
        results.append({
            "Test Number": test_num,
            "Model": m["name"],
            "Parameters": params,
            "Validation Accuracy": accuracy,
            "Test Size": round(test_size, 2),
            "Random State": random_state,
            "Submission File": submission_path
        })

    # Save this round's concise summary to a text file
    with open(f"{OUTPUT_DIR}/Test_{test_num}_Summary.txt", "w") as f:
        f.write(concise_summary)

# Join the fragments once and save the full summary to a text file
full_summary = "".join(summary_chunks)
with open(f"{OUTPUT_DIR}/Full_Summary.txt", "w") as f:
    f.write(full_summary)





Running Test #1
Running Test #2
Running Test #3
Running Test #4
Running Test #5
Running Test #6
Running Test #7
Running Test #8
Running Test #9
Running Test #10
