# BASELINE

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import sys
import os
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader,TensorDataset
from sklearn.model_selection import KFold

sys.path.append(os.path.abspath('../src'))

In [2]:
from preprocessor import normalize_data


In [3]:
df = pd.read_csv("./../data/train.csv")
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
y = df["label"]
X = df.drop(columns=["label"])

In [5]:
X_normalized = normalize_data(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_normalized,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

## Logistic Regression

In [6]:
log_reg = LogisticRegression(solver='saga',max_iter=1000)

log_reg.fit(X_train,y_train)

y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)

print(accuracy)

0.9128571428571428


## Random Forest

In [12]:
rf_model = RandomForestClassifier()

rf_model.fit(X_train,y_train)

rf_y_pred = rf_model.predict(X_test)


rf_accuracy = accuracy_score(y_test,rf_y_pred)

print(rf_accuracy)

0.9644047619047619


## 5 fold cross validation

In [8]:
rf_scores = cross_val_score(rf_model,X_train,y_train,cv=5)
lr_scores = cross_val_score(log_reg,X_train,y_train,cv=5)


print("rf scores :", rf_scores)
print("lf scores :", lr_scores)

rf scores : [0.95997024 0.9641369  0.95982143 0.959375   0.96577381]
lf scores : [0.91949405 0.91949405 0.90803571 0.91339286 0.91651786]


## Mean and Variance

In [9]:
mean_rf = np.mean(rf_scores)
std_rf = np.std(rf_scores)


mean_lr = np.mean(lr_scores)
std_lr = np.std(lr_scores)


print(f"Mean random forst: {mean_rf} Mean logistic regrerssion: {mean_lr}")
print(f"Standard Deviation random forest: {std_rf} Standard Deviation Logistic Regression: {std_lr}")

Mean random forst: 0.9618154761904762 Mean logistic regrerssion: 0.9153869047619047
Standard Deviation random forest: 0.002622765218674795 Standard Deviation Logistic Regression: 0.004313525831850424


## Neural Networks

In [6]:
X_train_tensor = torch.tensor(X_train.values,dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values,dtype=torch.long)

#checks shapes
print(f"X shape: {X_train_tensor.shape}")
print(f"y shape: {y_train_tensor.shape}")


X shape: torch.Size([33600, 784])
y shape: torch.Size([33600])


In [15]:
class MNISTNet(nn.Module):
    def __init__(self,input_size):
        super(MNISTNet,self).__init__()
        #layer 1 -> takes 9 inputs and outputs t a hidden layer(e.g 16 neurons)
        
        self.fc1 = nn.Linear(input_size,128)
        
        self.relu = nn.ReLU()
        
        self.dropout = nn.Dropout(p=0.3)
        self.fc2 = nn.Linear(128,64)
        
        self.fc3 = nn.Linear(64,128)
        
        self.fc4 = nn.Linear(128,10)

        
        
        
        #self.softmax = nn.Softmax()
        
        
    def forward(self,x):
        
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.fc4(x)
        #x = self.softmax(x)
        
        return x
    
    
        

In [8]:
model = MNISTNet(input_size=784)    

criterion = nn.CrossEntropyLoss()

#define the optimizer (Learning rate is a knob we can tune later!)
optimizer = optim.Adam(model.parameters(),lr=0.0001) 


# Weight decay
schedular = optim.lr_scheduler.StepLR(optimizer,step_size=40,gamma=0.8)


In [9]:
train_dataset = TensorDataset(X_train_tensor,y_train_tensor)


train_loader = DataLoader(dataset=train_dataset,batch_size=32,shuffle=True)

In [39]:
epochs = 100


for epoch in range(epochs):
    for images ,labels in train_loader:
        
        optimizer.zero_grad()
        
        
        outputs = model(images)
        
        
        loss = criterion(outputs,labels)
        
        
        loss.backward()
        
        
        optimizer.step()
    schedular.step()

    if (epoch + 1)  % 10 == 0:
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")


Epoch [10/100], Loss: 0.2513
Epoch [20/100], Loss: 0.2865
Epoch [30/100], Loss: 0.0555
Epoch [40/100], Loss: 0.0511
Epoch [50/100], Loss: 0.1128
Epoch [60/100], Loss: 0.0101
Epoch [70/100], Loss: 0.0056
Epoch [80/100], Loss: 0.0158
Epoch [90/100], Loss: 0.0014
Epoch [100/100], Loss: 0.0772


In [40]:
with torch.no_grad():
    
    
    raw_outputs = model(X_train_tensor)
    
    predictions = torch.argmax(raw_outputs,dim=1)
    
    
# print(predictions.shape)
# print(y_train_tensor.shape)
    
correct_maske = predictions == y_train_tensor

num_correct = correct_maske.sum().item()
print(f"Number of correct guesses: {num_correct}")


#optionally , compute accuracy
accuracy = num_correct /len(y_train_tensor)
print(f"accuracy: {accuracy:.2%}")


Number of correct guesses: 33430
accuracy: 99.49%


## 5 fold cross validation

In [41]:
kf = KFold(n_splits=5,shuffle=True,random_state=42)

fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_tensor)):
    
    model = MNISTNet(input_size=784)
    
    criterion = nn.CrossEntropyLoss()


    #define the optimizer (Learning rate is a knob we can tune later!)
    optimizer = optim.Adam(model.parameters(),lr=0.0001) 
    
    
    #2. Slice the tensors for this fold
    X_train_fold = X_train_tensor[train_idx]
    y_train_fold = y_train_tensor[train_idx]
    X_val_fold = X_train_tensor[val_idx]
    y_val_fold = y_train_tensor[val_idx]
    
    train_dataset = TensorDataset(X_train_fold,y_train_fold)
    
    train_loader  = DataLoader(dataset=train_dataset,batch_size=32,shuffle=True)
    
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.8)
    
    for epoch in range(50):
        for images,labels in train_loader:
    
            
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs,labels)
            loss.backward()
            optimizer.step()
        scheduler.step()
        
        current_lr = optimizer.param_groups[0]['lr']
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/100], Loss: {loss.item():.4f}, LR: {current_lr:.6f}")
        
        
        
    #validation phase (no tracking here)
    
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_fold)
        val_preds = torch.argmax(val_outputs,dim=1)
        
        
        correct = (val_preds == y_val_fold).sum().item()
        
        fold_acc = correct/len(y_val_fold)
        
        
        fold_results.append(fold_acc)
        print(f"Fold {fold+1}  Accuracy: {fold_acc:.4f}")
    
    
    
    
    
    

Epoch [50/100], Loss: 0.0417, LR: 0.000080
Fold 1  Accuracy: 0.9737
Epoch [50/100], Loss: 0.0472, LR: 0.000080
Fold 2  Accuracy: 0.9686
Epoch [50/100], Loss: 0.0209, LR: 0.000080
Fold 3  Accuracy: 0.9705
Epoch [50/100], Loss: 0.3071, LR: 0.000080
Fold 4  Accuracy: 0.9707
Epoch [50/100], Loss: 0.0050, LR: 0.000080
Fold 5  Accuracy: 0.9714


In [6]:
df_test = pd.read_csv("./../data/test.csv")
df_test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X_normalized_test = normalize_data(df_test)

### Random Forest Test

In [13]:
rf_y_test_predict = rf_model.predict(X_normalized_test)

In [16]:

submission = pd.DataFrame({
    "ImageId":range(1,len(rf_y_test_predict) + 1),
    "Label": rf_y_test_predict
})
submission.to_csv("./../data/submission.csv", index=False)


### Neural Networks test

In [31]:



X_test_tensor = torch.tensor(X_test.values,dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values,dtype=torch.long)
X_test_tensor_normalized = torch.tensor(X_normalized_test.values,dtype=torch.float32)

#checks shapes
print(f"X shape: {X_test_tensor.shape}")
print(f"y shape: {y_test_tensor.shape}")
print(f"X test shape: {X_test_tensor_normalized.shape}")


X shape: torch.Size([8400, 784])
y shape: torch.Size([8400])
X test shape: torch.Size([28000, 784])


In [32]:
with torch.no_grad():
    
    
    raw_outputs = model(X_test_tensor)
    
    predictions = torch.argmax(raw_outputs,dim=1)
    
    
# print(predictions.shape)
# print(y_train_tensor.shape)
    
correct_maske = predictions == y_test_tensor

num_correct = correct_maske.sum().item()
print(f"Number of correct guesses: {num_correct}")


#optionally , compute accuracy
accuracy = num_correct /len(y_test_tensor)
print(f"accuracy: {accuracy:.2%}")


Number of correct guesses: 8150
accuracy: 97.02%


In [None]:
with torch.no_grad():
    
    
    raw_outputs = model(X_test_tensor_normalized)
    
    predictions = torch.argmax(raw_outputs,dim=1)
    
    
# print(predictions.shape)
# print(y_train_tensor.shape)
    
pred_list = predictions.numpy()

# Create the DataFrame
# Kaggle's ImageId is 1-indexed
submission = pd.DataFrame({
    "ImageId": range(1, len(pred_list) + 1),
    "Label": pred_list
})

# Save to CSV
submission.to_csv("./../data/submission.csv", index=False)


## Feature Refinement

In [10]:
#variance tells us how much a pixel "spreads out" from its average value.
pixel_variances = torch.var(X_train_tensor,dim=0)

In [11]:
#Dimensionality Reduction
mask = pixel_variances > 0.001



In [16]:
kf = KFold(n_splits=5,shuffle=True,random_state=42)

fold_results = []

    
X_train_reduced = X_train_tensor[:,mask]

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_reduced)):
    
    model = MNISTNet(input_size=546)
    
    criterion = nn.CrossEntropyLoss()


    #define the optimizer (Learning rate is a knob we can tune later!)
    optimizer = optim.Adam(model.parameters(),lr=0.0001) 
    
    
    #2. Slice the tensors for this fold
    X_train_fold = X_train_reduced[train_idx]
    y_train_fold = y_train_tensor[train_idx]
    X_val_fold = X_train_reduced[val_idx]
    y_val_fold = y_train_tensor[val_idx]
    
    #X_val_reduced = X_val_fold[:,mask]


    
    #print(X_train_reduced.shape)
    
    train_dataset = TensorDataset(X_train_fold,y_train_fold)
    
    train_loader  = DataLoader(dataset=train_dataset,batch_size=32,shuffle=True)
    
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.8)
    
    
    for epoch in range(50):
        model.train()

        for images,labels in train_loader:
    
            
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs,labels)
            loss.backward()
            optimizer.step()
        scheduler.step()
        
        current_lr = optimizer.param_groups[0]['lr']
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/100], Loss: {loss.item():.4f}, LR: {current_lr:.6f}")
        
        
        
    #validation phase (no tracking here)
    
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_fold)
        val_preds = torch.argmax(val_outputs,dim=1)
        
        
        correct = (val_preds == y_val_fold).sum().item()
        
        fold_acc = correct/len(y_val_fold)
        
        
        fold_results.append(fold_acc)
        print(f"Fold {fold+1}  Accuracy: {fold_acc:.4f}")
    
    
    
    
    
    

Epoch [50/100], Loss: 0.0278, LR: 0.000080
Fold 1  Accuracy: 0.9750
Epoch [50/100], Loss: 0.0172, LR: 0.000080
Fold 2  Accuracy: 0.9695
Epoch [50/100], Loss: 0.0112, LR: 0.000080
Fold 3  Accuracy: 0.9766
Epoch [50/100], Loss: 0.0025, LR: 0.000080
Fold 4  Accuracy: 0.9743
Epoch [50/100], Loss: 0.1095, LR: 0.000080
Fold 5  Accuracy: 0.9751
