In [7]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report





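*   Hold-out evaluation: 80/20 train/test split with logistic regression
*   Train/validation/test split (70/15/15) and 5-fold cross-validation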


```
# This is formatted as code
```



In [8]:
# Synthetic binary-classification data: 5000 rows x 5 features
# (3 informative, 1 redundant), seeded so the notebook is reproducible.
feature_names = [f"feature_{i}" for i in range(1, 6)]

X, y = make_classification(
    n_samples=5000,
    n_features=5,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    random_state=42,
)

# Wrap the feature matrix in a DataFrame and attach the target as "label"
df = pd.DataFrame(X, columns=feature_names).assign(label=y)

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (5000, 6)


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,label
0,1.379235,1.200567,2.32026,-0.577565,1.168878,1
1,0.307273,-0.799876,0.725749,0.757629,0.64221,1
2,1.351406,0.786552,1.912284,-0.709965,0.772449,1
3,0.433977,-0.367866,-0.509308,1.538812,-0.987807,1
4,-1.293903,-1.970693,-1.16202,-0.236831,0.200332,0


In [9]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_names], df["label"], test_size=0.2, random_state=42
)

# Fit logistic regression on the training portion only
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Evaluation
print("Test Accuracy:", accuracy_score(y_test, y_pred))
# Concatenate rather than pass two arguments: print()'s default separator
# would insert a space right after the "\n", pushing the report's header row
# one column out of line with the rows beneath it.
print("\nClassification Report:\n" + classification_report(y_test, y_pred))


Test Accuracy: 0.905

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.90      0.91       518
           1       0.90      0.91      0.90       482

    accuracy                           0.91      1000
   macro avg       0.90      0.91      0.90      1000
weighted avg       0.91      0.91      0.91      1000



In [10]:
# Stage 1: keep 70% of the rows for training, set the other 30% aside
X_train, X_temp, y_train, y_temp = train_test_split(
    df[feature_names],
    df["label"],
    test_size=0.3,
    random_state=42,
)

# Stage 2: cut the 30% remainder in half -> validation and test sets
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Report the size of each partition
print("Shapes:")
print("Train:", X_train.shape)
print("Validation:", X_valid.shape)
print("Test:", X_test.shape)

# Fit on the training partition only (fit() returns the fitted estimator)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the fitted model on the validation partition
y_val_pred = model.predict(X_valid)
print("Validation Accuracy:", accuracy_score(y_valid, y_val_pred))


Shapes:
Train: (3500, 5)
Validation: (750, 5)
Test: (750, 5)
Validation Accuracy: 0.9


In [11]:
# 5-fold cross-validation of logistic regression over the whole dataset;
# folds are shuffled with a fixed seed so the scores are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression(max_iter=1000)

scores = cross_val_score(
    estimator=model,
    X=df[feature_names],
    y=df["label"],
    scoring="accuracy",
    cv=kf,
)

# Per-fold accuracy followed by the mean across folds
print("K-Fold Accuracies:", scores)
print("Average Accuracy:", scores.mean())


K-Fold Accuracies: [0.905 0.874 0.882 0.898 0.903]
Average Accuracy: 0.8924
