In [1]:
# %pip install scikit-learn

from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Initialize list to store misclassified items
misclassified_items = []

# Initialize lists to store error for each fold
iteration_errors = []

# Initialize 10-fold cross-validation
kf = KFold(n_splits=3, shuffle=True)

In [3]:
# Replace 'file_path.csv' with the path to your CSV file
# file_path = 'elbow_avg_output_sample0_raw.csv'
file_path = 'elbow_avg_output_sample0_raw_best.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

X_train = df.iloc[:,3:].copy()
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
y_train = df[['1.1']].copy()
# y_train['1.1'] = y_train['1.1'].replace({'True': 1, 'False': 0})
print(y_train.dtypes)

KeyError: "None of [Index(['1.1'], dtype='object')] are in the [columns]"

In [None]:
print(df.shape)
print(X_train.shape)
print(y_train.shape)

In [None]:
# Create a new figure
plt.figure()
    
for fold, (train_index, test_index) in enumerate(kf.split(X_train)):
    # Split data into training and test sets for this fold
    X_train_fold, X_test_fold = X_train.iloc[train_index,:], X_train.iloc[test_index,:]
    y_train_fold, y_test_fold = y_train.iloc[train_index,:], y_train.iloc[test_index,:]
    
    X_train_fold = X_train_fold.values; X_test_fold = X_test_fold.values
    y_train_fold = y_train_fold.values; y_test_fold = y_test_fold.values
    
    # Train neural network classifier
    clf = MLPClassifier(hidden_layer_sizes=(10,), max_iter=2000)
    clf.fit(X_train_fold, y_train_fold)
    
    # Get error across iterations
    iteration_errors_fold = clf.loss_curve_
    iteration_errors.append(iteration_errors_fold)
    
    # Evaluate model on test set
    y_pred_fold = clf.predict(X_test_fold)
    
    # Compute accuracy for this fold
    accuracy_fold = accuracy_score(y_test_fold, y_pred_fold)
    print(f'Accuracy for fold: {accuracy_fold}')
    
    # Identify misclassified items
    misclassified_indices = (y_pred_fold != np.concatenate(y_test_fold).ravel())
    # misclassified_items_fold = X_test_fold[misclassified_indices]
    
    # Aggregate misclassified items
    misclassified_items.extend(df.iloc[test_index[misclassified_indices],0].values)
    
    # Plot error across iterations for this fold
    plt.plot(iteration_errors_fold, label=f'Fold {fold+1}')
    
# Plot settings
plt.title('Error Across Iterations for Each Fold')
plt.xlabel('Iteration')
plt.ylabel('Error')
plt.legend()
plt.show()
plt.savefig('classification_error.png')

In [None]:
print(f'{len(misclassified_items)} misclassified items.')
print(f'%{len(set(misclassified_items))/len(df)} of data is misclassified.')
mis = np.sort(misclassified_items)
print(mis)

In [None]:
file_path = 'language_translate_fw_output_sample0.csv'
df_fw = pd.read_csv(file_path)

file_path = 'language_translate_bw_output_sample0.csv'
df_bw = pd.read_csv(file_path)

df_concatenated = pd.concat([df_fw, df_bw], axis=1).iloc[:, [0, 1, 2, 3, 4, 5, 11]]
df_filtered = df_concatenated[df_concatenated['0'].isin(mis)]
df_filtered.to_csv('misclassified_all.csv', encoding='utf-8', index=False)

df_filtered_fa = df_filtered[df_filtered['2'] == 'fa']
df_filtered_fa.to_csv('misclassified_persian.csv', encoding='utf-8', index=False)

df_filtered_it = df_filtered[df_filtered['2'] == 'it']
df_filtered_it.to_csv('misclassified_italian.csv', encoding='utf-8', index=False)