In [2]:
! pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.11.0 imblearn-0.0


In [18]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE #for balanced

# Fix NumPy's global random seed so stochastic steps repeat between runs
np.random.seed(0)

# Read the raw dataset from its CSV file
file_path = 'rtfDataSet.csv'
original_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows of the raw data
original_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,label
0,-0.536,0.292,-0.684,0.123,-0.118,0.346,-0.308,-0.113,0.401,-0.399,...,-0.135,-0.063,-0.41,-0.223,-0.599,-0.136,-0.329,-0.132,-0.266,0
1,-0.496,-0.298,-0.529,0.073,-0.072,0.107,-0.316,-0.066,-0.168,-0.099,...,0.189,0.007,-0.362,-0.151,-0.338,-0.031,-0.159,-0.097,-0.131,0
2,0.008,-0.031,-0.24,0.178,-0.12,0.317,-0.481,0.031,-0.077,0.063,...,-0.035,0.101,-0.098,-0.172,-0.29,0.033,-0.342,-0.321,-0.238,0
3,-0.188,-0.18,-0.062,-0.104,-0.136,-0.061,-0.216,-0.143,0.068,-0.189,...,0.146,-0.145,-0.527,-0.292,-0.663,-0.078,-0.194,-0.151,-0.268,0
4,0.038,0.155,-0.203,-0.088,-0.084,-0.164,-0.145,-0.168,0.008,0.143,...,0.019,-0.084,0.164,-0.165,-0.126,-0.112,0.029,-0.138,0.148,1


In [19]:
# Min-Max scale every column except the last one, which is then replaced by
# the untouched 'label' column from the original frame.
scaler = MinMaxScaler()
normalized_data = pd.DataFrame(
    data=scaler.fit_transform(original_data.iloc[:, :-1]),
    columns=original_data.columns[:-1],
).assign(label=original_data['label'])

# Persist the normalized frame (without the row index) for the later cells
save_csv = 'normalized_dataset.csv'
normalized_data.to_csv(save_csv, index=False)

# Preview the normalized data
normalized_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,label
0,0.294244,0.705043,0.183047,0.610883,0.28757,0.806475,0.179009,0.499009,0.890763,0.184259,...,0.378264,0.459459,0.300824,0.339323,0.180758,0.348815,0.324151,0.526825,0.128294,0
1,0.315761,0.143673,0.263651,0.559548,0.330241,0.630611,0.174344,0.522299,0.433735,0.462037,...,0.606916,0.52953,0.333791,0.389081,0.332945,0.448341,0.433056,0.543741,0.221914,0
2,0.586875,0.397716,0.413937,0.667351,0.285714,0.785136,0.078134,0.570367,0.506827,0.612037,...,0.448836,0.623624,0.51511,0.374568,0.360933,0.509005,0.315823,0.435476,0.147712,0
3,0.481442,0.255947,0.5065,0.377823,0.270872,0.50699,0.232653,0.484143,0.623293,0.378704,...,0.57657,0.377377,0.220467,0.291638,0.14344,0.403791,0.410634,0.517641,0.126907,0
4,0.603012,0.574691,0.433177,0.394251,0.319109,0.431199,0.274052,0.471754,0.5751,0.686111,...,0.486944,0.438438,0.695055,0.379406,0.45656,0.371564,0.553491,0.523925,0.415395,1


In [20]:
# Reload the normalized dataset written above; point file_path at a different
# CSV here if another working dataset should be used instead.
file_path = 'normalized_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [21]:
# Split the frame into the feature matrix (x) and the target column (y)
y = data.loc[:, 'label']
x = data.drop(labels='label', axis='columns')

# Oversample with SMOTE to balance the classes; the fixed random_state keeps
# the synthetic samples identical from run to run.
smote = SMOTE(random_state=42)
x_balanced, y_balanced = smote.fit_resample(x, y)

# Standardize the balanced features (x_balanced becomes the scaler's output)
scaler = StandardScaler()
x_balanced = scaler.fit_transform(x_balanced)

In [22]:
# --- Simulated Annealing configuration ---

# Number of annealing iterations to run
n_iterations = 2000

# Best solution seen so far and its score (nothing found yet)
best_solution, best_score = None, 0

# Working solution, built from the balanced feature matrix, and its score
current_solution = pd.DataFrame(data=x_balanced, columns=x.columns)
current_score = 0

# Annealing schedule: starting temperature and cooling rate
T_initial, alpha = 4.0, 0.95

# Small constant handed to the SU function to avoid division by zero
epsilon = 1e-10

# Symmetrical Uncertainty function
def symmetrical_uncertainty(x, y, epsilon=1e-10):
    """Score a feature matrix against the labels with an SU-style ratio.

    The score is ``2 * sum(MI) / h_x`` where ``MI`` is the per-feature output
    of ``mutual_info_classif(x, y)`` and ``h_x`` is the sum over columns of
    ``-log2((column_sum + epsilon) / (total_sum + epsilon))``.

    Parameters
    ----------
    x : DataFrame or 2-D array-like
        Feature matrix, one column per feature.
    y : array-like
        Class labels, one per row of ``x``.
    epsilon : float, default 1e-10
        Added to both the column sums and the grand total before the ratio
        is taken, so an all-zero input does not produce 0/0.

    Returns
    -------
    numpy.float64
        The ratio described above.

    Notes
    -----
    NOTE(review): ``h_x`` is a sum of ``-log2`` of each column's share of the
    grand total, not a Shannon entropy; the textbook definition is
    ``2 * I(X;Y) / (H(X) + H(Y))`` -- confirm which one is intended.
    NOTE(review): the column shares are only meaningful for positive column
    sums. On zero-mean (standardized) inputs every sum is ~0, so the ratio is
    dominated by ``epsilon`` and rounding error -- TODO confirm the inputs.
    """
    mi = mutual_info_classif(x, y)

    # Column totals via NumPy so DataFrame and ndarray inputs behave the same.
    column_totals = np.asarray(x, dtype=float).sum(axis=0)
    # The original divided by `X.sum().sum()`; `X` is undefined in this
    # function, so every call raised NameError. The grand total of `x` is used.
    grand_total = column_totals.sum()

    h_x = -np.sum(np.log2((column_totals + epsilon) / (grand_total + epsilon)))
    su = 2 * np.sum(mi) / h_x
    return su

In [23]:

# Initialize a list to store the importance scores for each subset
subset_importance_scores = []

# A candidate subset is a boolean mask over the feature columns. The feature
# values are never overwritten, so every score is computed on the real data
# and a rejected move simply leaves the current mask untouched.
feature_data = pd.DataFrame(x_balanced, columns=x.columns)
feature_names = feature_data.columns
n_features = len(feature_names)


def _subset_score(mask):
    """Return the SU score of the columns flagged in `mask`.

    An empty subset, or a score that is NaN/inf, is reported as 0.0 so it can
    never become the best solution by accident.
    """
    if not mask.any():
        return 0.0
    score = symmetrical_uncertainty(feature_data.loc[:, mask], y_balanced, epsilon=epsilon)
    return float(score) if np.isfinite(score) else 0.0


def _mask_to_frame(mask):
    """Encode a mask as a one-row 0/1 DataFrame keyed by feature name.

    With this encoding `frame.columns[frame.sum() > 0]` yields the selected
    feature names.
    """
    return pd.DataFrame([mask.astype(int)], columns=feature_names)


# Start from the full feature set
current_mask = np.ones(n_features, dtype=bool)
current_score = _subset_score(current_mask)
best_mask, best_score = current_mask.copy(), current_score

# Cool a local copy of the temperature so that re-running this cell restarts
# the schedule from T_initial instead of continuing from an already-cold value.
temperature = T_initial

# Simulated Annealing loop
for iteration in range(n_iterations):
    # Propose a neighbour: flip the membership of one randomly chosen feature
    feature_to_change = np.random.randint(n_features)
    candidate_mask = current_mask.copy()
    candidate_mask[feature_to_change] = not candidate_mask[feature_to_change]

    candidate_score = _subset_score(candidate_mask)

    # Change in score relative to the *current* solution (not the best one)
    delta_score = candidate_score - current_score

    # Metropolis criterion: always take improvements, sometimes take worse moves
    if delta_score > 0 or np.random.uniform() < np.exp(delta_score / temperature):
        current_mask, current_score = candidate_mask, candidate_score
        # The best solution only ever moves to a strictly better score
        if current_score > best_score:
            best_mask, best_score = current_mask.copy(), current_score

    # Update the temperature using the cooling rate; the floor keeps the
    # division above defined if the schedule is run long enough to underflow.
    temperature = max(temperature * alpha, np.finfo(float).tiny)

    # Importance per capita of the current subset (NaN when nothing is selected)
    selected_features = feature_names[current_mask]
    n_selected = len(selected_features)
    importance_per_capita = current_score / n_selected if n_selected else float("nan")
    subset_importance_scores.append(importance_per_capita)

# Expose the results as 0/1 frames keyed by feature name
current_solution = _mask_to_frame(current_mask)
best_solution = _mask_to_frame(best_mask)

In [None]:
import matplotlib.pyplot as plt

# Print the selected features and their Symmetrical Uncertainty score.
# A feature counts as selected when its column in best_solution has a
# positive sum.
selected_features = best_solution.columns[best_solution.sum() > 0]
print("Selected Features:")
print(selected_features)
print("Symmetrical Uncertainty Score:", best_score)

# Calculate the importance per capita for the best solution; with no selected
# feature the ratio is undefined, so report NaN instead of dividing by zero.
n_selected = len(selected_features)
best_importance_per_capita = best_score / n_selected if n_selected else float("nan")
print("Importance Per Capita:", best_importance_per_capita)

# Plot the importance scores over iterations (optional). The x-axis follows
# the number of recorded scores so the two sequences always have equal length.
fig, ax = plt.subplots()
ax.plot(range(len(subset_importance_scores)), subset_importance_scores)
ax.set_title("Importance Per Capita over Iterations")
ax.set_xlabel("Iteration")
ax.set_ylabel("Importance Per Capita")
plt.show()

In [None]:
# TODO: 3. Top Five Feature Subsets

# Use the feature selection method to identify the top five feature subsets
# Calculate the per capita importance for each subset

In [None]:

# TODO: 4. AUC Score-based Ranking

# Use a classification model (e.g., logistic regression, SVM, etc.) to evaluate the feature subsets
# Rank the top five feature subsets based on their AUC scores

In [None]:

# TODO: 5. Interpretation and Conclusion

# Interpret the per capita importance and the AUC scores
# Conclude the study, noting any limitations and suggesting future work