In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("data.csv",index_col=0)
df.head()

Unnamed: 0,path,label
0,Surprise/1bd930d6a1c717c11be33db74823f661cb53f...,Surprise
1,Surprise/cropped_emotions.100096~12fffff.png,Surprise
2,Surprise/0df0e470e33093f5b72a8197fa209d684032c...,Surprise
3,Surprise/cropped_emotions.260779~12fffff.png,Surprise
4,Surprise/cropped_emotions.263616~12fffff.png,Surprise


In [3]:
df['label'].value_counts()

Neutral     4027
Sad         3934
Happy       3740
Angry       1313
Surprise    1234
Ahegao      1205
Name: label, dtype: int64

In [4]:
df=df.query("`label` in ['Angry','Surprise','Ahegao']")
df=df.reset_index(drop=True)
df['path']="dataset/"+df['path']

# Loading Dataset

In [5]:
import transformers
import torch
from torchvision import transforms
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from torchvision.datasets import ImageFolder
from torchvision import transforms

train_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((256,256))
])

# Create Dataset using ImageFolder
dataset_train = ImageFolder(
    'Data',
    transform=train_transforms,
)

In [7]:
dataset_train

Dataset ImageFolder
    Number of datapoints: 3752
    Root location: Data
    StandardTransform
Transform: Compose(
               ToTensor()
               Resize(size=(256, 256), interpolation=bilinear, max_size=None, antialias=True)
           )

In [8]:
from torch.utils.data import DataLoader
dataloader_train = DataLoader(dataset_train,shuffle=True,batch_size=100,)
image, label = next(iter(dataloader_train))
print(image.shape)

torch.Size([100, 3, 256, 256])


# Create a Deep learning Model

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the CNN architecture
class EmotionCNN(nn.Module):
    def __init__(self, num_classes=3):
        super(EmotionCNN, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        
        # Pooling layer
        self.pool = nn.MaxPool2d(2, 2)
        
        # Dropout for regularization
        self.dropout = nn.Dropout(0.5)
        
        # Fully connected layers
        self.fc1 = nn.Linear(128 * 32 * 32, 512)  # Calculate based on input size
        self.fc2 = nn.Linear(512, num_classes)
        
        # Activation functions
        self.relu = nn.ReLU()
        
    def forward(self, x):
        # Standard forward for classification
        x = self.pool(self.relu(self.conv1(x)))  # 256x256 -> 128x128
        x = self.pool(self.relu(self.conv2(x)))  # 128x128 -> 64x64
        x = self.pool(self.relu(self.conv3(x)))  # 64x64 -> 32x32
        x = x.view(x.size(0), -1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

    def extract_features(self, x):
        # This function returns features before the final classification layer
        x = self.pool(self.relu(self.conv1(x)))  # 256x256 -> 128x128
        x = self.pool(self.relu(self.conv2(x)))  # 128x128 -> 64x64
        x = self.pool(self.relu(self.conv3(x)))  # 64x64 -> 32x32
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        return x

# Initialize the model
model = EmotionCNN(num_classes=len(dataset_train.classes))
print(f"Model created with {len(dataset_train.classes)} classes: {dataset_train.classes}")

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train_model(model, dataloader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        
        for i, (images, labels) in enumerate(dataloader):
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            # Statistics
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            if i % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], '
                      f'Loss: {loss.item():.4f}, Accuracy: {100 * correct/total:.2f}%')
        
        print(f'Epoch [{epoch+1}/{num_epochs}] completed. '
              f'Average Loss: {running_loss/len(dataloader):.4f}, '
              f'Accuracy: {100 * correct/total:.2f}%')

# Train the model
print("Starting training...")
train_model(model, dataloader_train, criterion, optimizer, num_epochs=5)

# Evaluation function
def evaluate_model(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for images, labels in dataloader:
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')
    return accuracy

# Evaluate the model
print("\nEvaluating model...")
evaluate_model(model, dataloader_train)


Model created with 3 classes: ['Ahegao', 'Angry', 'Surprise']
Starting training...
Epoch [1/5], Step [1/38], Loss: 1.1014, Accuracy: 27.00%
Epoch [1/5], Step [11/38], Loss: 1.0370, Accuracy: 38.82%
Epoch [1/5], Step [21/38], Loss: 0.9521, Accuracy: 43.52%
Epoch [1/5], Step [31/38], Loss: 0.9490, Accuracy: 46.52%
Epoch [1/5] completed. Average Loss: 1.1364, Accuracy: 48.88%
Epoch [2/5], Step [1/38], Loss: 0.7479, Accuracy: 66.00%
Epoch [2/5], Step [11/38], Loss: 0.7434, Accuracy: 62.73%
Epoch [2/5], Step [21/38], Loss: 0.7618, Accuracy: 64.67%
Epoch [2/5], Step [31/38], Loss: 0.7452, Accuracy: 64.32%
Epoch [2/5] completed. Average Loss: 0.7790, Accuracy: 65.43%
Epoch [3/5], Step [1/38], Loss: 0.6957, Accuracy: 77.00%
Epoch [3/5], Step [11/38], Loss: 0.7118, Accuracy: 70.36%
Epoch [3/5], Step [21/38], Loss: 0.5838, Accuracy: 70.71%
Epoch [3/5], Step [31/38], Loss: 0.6508, Accuracy: 71.74%
Epoch [3/5] completed. Average Loss: 0.6707, Accuracy: 71.62%
Epoch [4/5], Step [1/38], Loss: 0.4674

81.13006396588486

## Feature extraction (FC layer)

In [10]:
# Function to generate features for a given dataloader
def generate_features(model, dataloader, device='cpu'):
    """
    Extract features from the CNN for all images in the dataloader.
    Returns:
        features: torch.Tensor of shape (N_samples, feature_dim)
        labels: torch.Tensor of shape (N_samples,)
    """
    model.eval()
    features_list = []
    labels_list = []
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            feats = model.extract_features(images)
            features_list.append(feats.cpu())
            labels_list.append(labels.cpu())
    features = torch.cat(features_list, dim=0)
    labels = torch.cat(labels_list, dim=0)
    return features, labels

features, labels = generate_features(model, dataloader_train)

In [11]:
features_np=features.numpy()
labels_np = labels.numpy()

In [12]:
Transformed_df = pd.DataFrame(
    np.column_stack([features_np, labels_np])
)

In [13]:
class_names=dataloader_train.dataset.classes
Transformed_df.iloc[:,-1]=Transformed_df.iloc[:,-1].apply(lambda x: class_names[int(x)])
Transformed_df = Transformed_df.rename(columns={Transformed_df.columns[-1]: "label"})

In [14]:
Transformed_df.to_csv("Transformed_features.csv", index=False)

## Classification

### Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

# Prepare features and labels
X = Transformed_df.iloc[:, :-1].values
y = Transformed_df['label'].values

# Split into train and test sets (90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Initialize logistic regression classifier
clf = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='auto')

# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))



Cross-validation accuracy scores: [0.85059172 0.83703704 0.82962963 0.84148148 0.85481481]
Mean CV accuracy: 0.8427109357878588




Test set accuracy: 0.8351063829787234
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.87      0.83      0.85       121
       Angry       0.86      0.86      0.86       131
    Surprise       0.78      0.81      0.80       124

    accuracy                           0.84       376
   macro avg       0.84      0.83      0.84       376
weighted avg       0.84      0.84      0.84       376



Best performance with lower computational cost

### Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Initialize logistic regression classifier
clf = DecisionTreeClassifier()

# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))

Cross-validation accuracy scores: [0.74852071 0.75703704 0.75555556 0.75703704 0.76296296]
Mean CV accuracy: 0.7562226605303528
Test set accuracy: 0.7712765957446809
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.77      0.77      0.77       121
       Angry       0.83      0.87      0.85       131
    Surprise       0.71      0.67      0.69       124

    accuracy                           0.77       376
   macro avg       0.77      0.77      0.77       376
weighted avg       0.77      0.77      0.77       376



### Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)
# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))


Cross-validation accuracy scores: [0.84023669 0.84444444 0.84148148 0.85037037 0.84444444]
Mean CV accuracy: 0.8441954854262546
Test set accuracy: 0.848404255319149
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.84      0.85      0.85       121
       Angry       0.88      0.94      0.91       131
    Surprise       0.81      0.75      0.78       124

    accuracy                           0.85       376
   macro avg       0.85      0.85      0.85       376
weighted avg       0.85      0.85      0.85       376



### Ada Boost

In [18]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(random_state=42)
# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))




Cross-validation accuracy scores: [0.81656805 0.81481481 0.81333333 0.81037037 0.81925926]
Mean CV accuracy: 0.8148691650230111




Test set accuracy: 0.8138297872340425
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.80      0.81      0.81       121
       Angry       0.86      0.92      0.89       131
    Surprise       0.77      0.71      0.74       124

    accuracy                           0.81       376
   macro avg       0.81      0.81      0.81       376
weighted avg       0.81      0.81      0.81       376



### LDA

In [19]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Initialize logistic regression classifier
clf = LinearDiscriminantAnalysis()

# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))

Cross-validation accuracy scores: [0.82840237 0.82518519 0.83555556 0.80888889 0.8237037 ]
Mean CV accuracy: 0.8243471400394476
Test set accuracy: 0.8058510638297872
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.82      0.82      0.82       121
       Angry       0.85      0.84      0.84       131
    Surprise       0.75      0.76      0.75       124

    accuracy                           0.81       376
   macro avg       0.81      0.81      0.81       376
weighted avg       0.81      0.81      0.81       376



### QDA

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

# Prepare features and labels
X = Transformed_df.iloc[:, :-1].values
y = Transformed_df['label'].values

# Split into train and test sets (90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Initialize logistic regression classifier
clf = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='auto')

# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))



Cross-validation accuracy scores: [0.85059172 0.83703704 0.82962963 0.84148148 0.85481481]
Mean CV accuracy: 0.8427109357878588




Test set accuracy: 0.8351063829787234
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.87      0.83      0.85       121
       Angry       0.86      0.86      0.86       131
    Surprise       0.78      0.81      0.80       124

    accuracy                           0.84       376
   macro avg       0.84      0.83      0.84       376
weighted avg       0.84      0.84      0.84       376



## PCA method

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize the dataset (fit only on train, transform both train and test)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Apply PCA (e.g., keep 10 principal components)
pca = PCA(n_components=32)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total explained variance:", pca.explained_variance_ratio_.sum())


Explained variance ratio: [0.47163078 0.17387935 0.09623474 0.07378868 0.04244696 0.0275414
 0.01293464 0.01259039 0.00883025 0.00778296 0.00665696 0.00535737
 0.00506648 0.00442342 0.00358005 0.00329685 0.00278813 0.00265827
 0.00250393 0.00178139 0.00165739 0.00159226 0.00144962 0.00137686
 0.00125597 0.00114795 0.0010648  0.0010196  0.00094123 0.00084383
 0.00077624 0.00074288]
Total explained variance: 0.9796416146748682


In [22]:
# Initialize logistic regression classifier
clf = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='auto')

# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train_pca, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train_pca, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test_pca)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))



Cross-validation accuracy scores: [0.82248521 0.82518519 0.81037037 0.82518519 0.82962963]
Mean CV accuracy: 0.8225711154941925
Test set accuracy: 0.8164893617021277
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.84      0.79      0.82       121
       Angry       0.86      0.89      0.88       131
    Surprise       0.75      0.76      0.75       124

    accuracy                           0.82       376
   macro avg       0.82      0.81      0.82       376
weighted avg       0.82      0.82      0.82       376





Increase test set performance but decrease in cross-validation performance

## Backward Selection

In [23]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

# Use the same train/test split as above
# X_train, X_test, y_train, y_test already defined

# Initialize logistic regression classifier for feature selection
selector_clf = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='auto')

# Perform backward selection (elimination) using SequentialFeatureSelector
# n_features_to_select can be set as desired, e.g., 20, or 'auto' for automatic selection
sfs = SequentialFeatureSelector(
    selector_clf,
    n_features_to_select='auto',  # or set to a specific number, e.g., 20
    direction='backward',
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)
sfs.fit(X_train_pca, y_train)

# Get the mask of selected features
selected_features_mask = sfs.get_support()
selected_features_indices = [i for i, x in enumerate(selected_features_mask) if x]
print("Selected feature indices:", selected_features_indices)
print("Number of selected features:", sum(selected_features_mask))

# Transform train and test sets to selected features
X_train_selected = sfs.transform(X_train_pca)
X_test_selected = sfs.transform(X_test_pca)

# Retrain classifier on selected features
clf_selected = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='auto')
clf_selected.fit(X_train_selected, y_train)

# Evaluate on test set
y_pred_selected = clf_selected.predict(X_test_selected)
test_acc_selected = accuracy_score(y_test, y_pred_selected)
print("Test set accuracy with selected features:", test_acc_selected)
print("Classification report with selected features:\n", classification_report(y_test, y_pred_selected))


Selected feature indices: [0, 1, 2, 3, 4, 5, 7, 9, 10, 11, 13, 14, 19, 27, 28, 31]
Number of selected features: 16
Test set accuracy with selected features: 0.8138297872340425
Classification report with selected features:
               precision    recall  f1-score   support

      Ahegao       0.83      0.77      0.80       121
       Angry       0.87      0.89      0.88       131
    Surprise       0.74      0.77      0.76       124

    accuracy                           0.81       376
   macro avg       0.81      0.81      0.81       376
weighted avg       0.81      0.81      0.81       376





Increase in accuracy but decrease in F1-score

# Extract the Feature with GAP method

In [24]:
class Emotion_extractor(nn.Module):
    def __init__(self):
        super().__init__()
        self.feature_extractor = nn.Sequential(
            # Layer 1: Conv -> BN -> ELU -> Pool
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=2),
            
            # Layer 2: Conv -> BN -> ELU -> Pool
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=2),
            
            # Layer 3: Conv -> BN -> ELU -> Pool
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=2),

            nn.AdaptiveAvgPool2d(1),
            nn.Flatten()
        )

    def forward(self, x):
        x = self.feature_extractor(x)       
        return x

In [25]:
dataloader = DataLoader(dataset_train,shuffle=False,batch_size=100,)


In [26]:
# Initialize model
ext_model = Emotion_extractor()
ext_model.eval()

all_features = []
all_labels = []

# Extract features for a batch
for images, labels in dataloader:
    with torch.no_grad():
        features = ext_model(images)  # Forward pass
        all_features.append(features)  # Store batch features
        all_labels.append(labels)      # Store corresponding labels
# Concatenate all batches into single tensors
all_features = torch.cat(all_features, dim=0)  # Shape: [N_samples, feature_dim]
all_labels = torch.cat(all_labels, dim=0)      # Shape: [N_samples]

# Verify shapes
print(f"All features shape: {all_features.shape}")  # [Num_samples, Feature_dim]
print(f"All labels shape: {all_labels.shape}")      # [Num_samples]

All features shape: torch.Size([3752, 128])
All labels shape: torch.Size([3752])


In [27]:
features_np=all_features.numpy()
all_labels_np = all_labels.numpy()

In [28]:
Transformed_df = pd.DataFrame(
    np.column_stack([features_np, all_labels_np])
)

In [29]:
Transformed_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,128
0,0.117085,0.009221,-0.079122,-0.119097,-0.025123,-0.186836,-0.100694,-0.136923,0.263568,-0.057435,...,-0.104499,-0.076613,0.060868,-0.023425,0.210154,-0.094395,-0.049257,-0.108229,-0.013878,0.0
1,0.11285,0.007841,-0.078589,-0.109645,-0.025417,-0.176733,-0.099483,-0.127217,0.250456,-0.049777,...,-0.097252,-0.073444,0.057944,-0.024694,0.206665,-0.089991,-0.037248,-0.100587,-0.01476,0.0
2,0.103204,0.001466,-0.073323,-0.091821,-0.040404,-0.144515,-0.104588,-0.113104,0.208657,-0.030124,...,-0.080252,-0.067065,0.032545,-0.030524,0.177653,-0.070659,-0.011211,-0.066614,-0.004603,0.0
3,0.119433,0.011938,-0.06038,-0.097906,-0.043307,-0.157708,-0.100221,-0.112822,0.240332,-0.042732,...,-0.090594,-0.067008,0.045575,-0.023394,0.196152,-0.069427,-0.036742,-0.073096,0.000747,0.0
4,0.116311,0.009666,-0.056809,-0.088272,-0.050252,-0.139493,-0.097323,-0.105374,0.227273,-0.039146,...,-0.081966,-0.059505,0.030693,-0.025518,0.179096,-0.061821,-0.027747,-0.056479,0.008409,0.0


In [30]:
class_names=dataloader.dataset.classes

In [31]:
Transformed_df.iloc[:,-1]=Transformed_df.iloc[:,-1].apply(lambda x: class_names[int(x)])
Transformed_df = Transformed_df.rename(columns={Transformed_df.columns[-1]: "label"})

In [32]:
Transformed_df.to_csv("Transformed_features2.csv", index=False)

## Using classifier

Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

# Prepare features and labels
X = Transformed_df.iloc[:, :-1].values
y = Transformed_df['label'].values

# Split into train and test sets (90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Initialize logistic regression classifier
clf = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='auto')

# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))



Cross-validation accuracy scores: [0.49260355 0.52740741 0.51851852 0.52740741 0.50518519]
Mean CV accuracy: 0.5142244137628753
Test set accuracy: 0.5212765957446809
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.58      0.68      0.63       121
       Angry       0.49      0.63      0.55       131
    Surprise       0.48      0.26      0.34       124

    accuracy                           0.52       376
   macro avg       0.52      0.52      0.50       376
weighted avg       0.51      0.52      0.50       376





Decision Tree


In [34]:
from sklearn.tree import DecisionTreeClassifier

# Initialize logistic regression classifier
clf = DecisionTreeClassifier()

# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))

Cross-validation accuracy scores: [0.48668639 0.51111111 0.45481481 0.4962963  0.44888889]
Mean CV accuracy: 0.47955950032873107
Test set accuracy: 0.45478723404255317
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.53      0.50      0.51       121
       Angry       0.47      0.46      0.47       131
    Surprise       0.38      0.40      0.39       124

    accuracy                           0.45       376
   macro avg       0.46      0.46      0.46       376
weighted avg       0.46      0.45      0.46       376



LDA

In [35]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Initialize logistic regression classifier
clf = LinearDiscriminantAnalysis()

# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))

Cross-validation accuracy scores: [0.70266272 0.72888889 0.73333333 0.7362963  0.73481481]
Mean CV accuracy: 0.7271992110453649
Test set accuracy: 0.7180851063829787
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.82      0.79      0.80       121
       Angry       0.70      0.78      0.74       131
    Surprise       0.63      0.59      0.61       124

    accuracy                           0.72       376
   macro avg       0.72      0.72      0.72       376
weighted avg       0.72      0.72      0.72       376



Best performance

QDA

In [36]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

clf = QuadraticDiscriminantAnalysis()
# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))

Cross-validation accuracy scores: [0.64201183 0.66814815 0.62962963 0.63703704 0.67703704]
Mean CV accuracy: 0.6507727372342756
Test set accuracy: 0.625
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.73      0.67      0.70       121
       Angry       0.71      0.57      0.63       131
    Surprise       0.50      0.64      0.56       124

    accuracy                           0.62       376
   macro avg       0.64      0.63      0.63       376
weighted avg       0.65      0.62      0.63       376



Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)
# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))


Cross-validation accuracy scores: [0.52218935 0.56       0.57481481 0.55555556 0.55703704]
Mean CV accuracy: 0.5539193513039666
Test set accuracy: 0.5585106382978723
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.61      0.65      0.63       121
       Angry       0.54      0.63      0.58       131
    Surprise       0.52      0.39      0.44       124

    accuracy                           0.56       376
   macro avg       0.56      0.56      0.55       376
weighted avg       0.56      0.56      0.55       376



Ada boost

In [38]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(random_state=42)
# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))




Cross-validation accuracy scores: [0.52810651 0.53925926 0.55407407 0.54814815 0.52592593]
Mean CV accuracy: 0.5391027832566294




Test set accuracy: 0.5425531914893617
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.56      0.66      0.61       121
       Angry       0.55      0.59      0.57       131
    Surprise       0.51      0.38      0.43       124

    accuracy                           0.54       376
   macro avg       0.54      0.54      0.54       376
weighted avg       0.54      0.54      0.54       376



Gradient boost

In [39]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=42)
# Perform cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fit on the full training set
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_acc)
print("Classification report:\n", classification_report(y_test, y_pred))


Cross-validation accuracy scores: [0.53550296 0.60296296 0.58666667 0.5762963  0.5762963 ]
Mean CV accuracy: 0.5755450361604207
Test set accuracy: 0.5531914893617021
Classification report:
               precision    recall  f1-score   support

      Ahegao       0.59      0.65      0.62       121
       Angry       0.55      0.60      0.57       131
    Surprise       0.50      0.41      0.45       124

    accuracy                           0.55       376
   macro avg       0.55      0.55      0.55       376
weighted avg       0.55      0.55      0.55       376

