# BASELINE

In [28]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import sys
import os
import matplotlib.pyplot as plt
from pathlib import Path
from preprocessor import normalize_data
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader,TensorDataset

sys.path.append(os.path.abspath('../src'))

In [23]:
df = pd.read_csv("./../data/train.csv")
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
y = df["label"]
X = df.drop(columns=["label"])

In [25]:
X_normalized = normalize_data(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_normalized,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

## Logistic Regression

In [None]:
log_reg = LogisticRegression(multi_class='multinomial',solver='saga',max_iter=1000)

log_reg.fit(X_train,y_train)

y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)

print(accuracy)

0.9129761904761905


## Random Forest

In [17]:
rf_model = RandomForestClassifier()

rf_model.fit(X_train,y_train)

rf_y_pred = rf_model.predict(X_test)


rf_accuracy = accuracy_score(y_test,rf_y_pred)

print(rf_accuracy)

0.9638095238095238


## 5 fold cross validation

In [18]:
rf_scores = cross_val_score(rf_model,X_train,y_train,cv=5)
lr_scores = cross_val_score(log_reg,X_train,y_train,cv=5)


print("rf scores :", rf_scores)
print("lf scores :", lr_scores)

rf scores : [0.95907738 0.96458333 0.9610119  0.96011905 0.96369048]
lf scores : [0.91875    0.91964286 0.90892857 0.91309524 0.91636905]


## Mean and Variance

In [19]:
mean_rf = np.mean(rf_scores)
std_rf = np.std(rf_scores)


mean_lr = np.mean(lr_scores)
std_lr = np.std(lr_scores)


print(f"Mean random forst: {mean_rf} Mean logistic regrerssion: {mean_lr}")
print(f"Standard Deviation random forest: {std_rf} Standard Deviation Logistic Regression: {std_lr}")

Mean random forst: 0.9616964285714286 Mean logistic regrerssion: 0.9153571428571429
Standard Deviation random forest: 0.0021036425056038506 Standard Deviation Logistic Regression: 0.003933078223019459


## Neural Networks

In [30]:
X_train_tensor = torch.tensor(X_train.values,dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values,dtype=torch.long)

#checks shapes
print(f"X shape: {X_train_tensor.shape}")
print(f"y shape: {y_train_tensor.shape}")


X shape: torch.Size([33600, 784])
y shape: torch.Size([33600])


In [32]:
class MNISTNet(nn.Module):
    def __init__(self,input_size):
        super(MNISTNet,self).__init__()
        #layer 1 -> takes 9 inputs and outputs to a hidden layer(e.g 16 neurons)
        
        self.fc1 = nn.Linear(input_size,12)
        
        self.relu = nn.ReLU()
        
        self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(12,10)
        
        #self.softmax = nn.Softmax()
        
        
    def forward(self,x):
        
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        #x = self.softmax(x)
        
        return x
    
    
        

In [33]:
model = MNISTNet(input_size=784)    

criterion = nn.CrossEntropyLoss()

#define the optimizer (Learning rate is a knob we can tune later!)
optimizer = optim.Adam(model.parameters(),lr=0.01) 


In [34]:
train_dataset = TensorDataset(X_train_tensor,y_train_tensor)


train_loader = DataLoader(dataset=train_dataset,batch_size=32,shuffle=True)

In [37]:
epochs = 100


for epoch in range(epochs):
    for images ,labels in train_loader:
        
        optimizer.zero_grad()
        
        
        outputs = model(images)
        
        
        loss = criterion(outputs,labels)
        
        
        loss.backward()
        
        
        optimizer.step()
    if (epoch + 1)  % 10 == 0:
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")


Epoch [10/100], Loss: 0.2544
Epoch [20/100], Loss: 1.1656
Epoch [30/100], Loss: 0.4645
Epoch [40/100], Loss: 0.6617
Epoch [50/100], Loss: 0.2721
Epoch [60/100], Loss: 0.1896
Epoch [70/100], Loss: 0.0397
Epoch [80/100], Loss: 0.4951
Epoch [90/100], Loss: 0.7702
Epoch [100/100], Loss: 0.3566
