In [None]:
# Mount Google Drive inside the Colab runtime so that files under
# /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd

Mounted at /content/drive


In [None]:
# Read the SPECTF training file from the mounted Drive. header=None makes
# pandas treat every line as data and label the columns with integers 0..N-1.
toy_df = pd.read_csv("/content/drive/Shareddrives/ML./project/SPECTF.train", header=None)
# Show (n_rows, n_columns) as the cell output.
toy_df.shape

(80, 45)

In [None]:
# Column 0 is used as the target vector; every remaining column is a feature.
# (Presumably column 0 is the class label of the SPECTF file -- confirm against
# the dataset description.)
X = toy_df.drop(columns=[0])
y = toy_df[0]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
import numpy as np

def algo1(X, y, batches=40, random_state_change=0, n_features_to_select=10, step=0.99):
  """Accumulate RFE feature rankings over repeated random subsamples.

  In each of `batches` rounds a random 90% subset of the rows is drawn with
  `train_test_split`, an `RFE` selector wrapping a `LinearSVC` is fitted on
  that subset, and the selector's `ranking_` vector is added to a running
  total.

  Args:
    X: feature table with one column per feature; must expose `.shape[1]`
      and be accepted by `train_test_split` / `RFE.fit`.
    y: target labels aligned with the rows of `X`.
    batches: number of subsample-and-fit rounds to accumulate.
    random_state_change: offset added to the round index to form the
      `random_state` of each split; round k uses seed k + offset.
    n_features_to_select: forwarded to `RFE`; how many features each round
      keeps. Defaults to 10, the value that used to be hard-coded.
    step: forwarded to `RFE`; how many features (int >= 1) or what fraction
      of the features (float in (0, 1)) is dropped per elimination pass.
      Defaults to 0.99, the value that used to be hard-coded.

  Returns:
    1-D float array of length `X.shape[1]`; entry k is the sum of the RFE
    ranks feature k received over all rounds. RFE gives rank 1 to selected
    features, so a lower total means the feature was kept more often.
  """
  totals = np.zeros(X.shape[1])
  for round_idx in range(batches):
    # A different seed per round gives a different 90% subsample each time.
    X_train, _, y_train, _ = train_test_split(
        X, y, train_size=0.9, random_state=round_idx + random_state_change)
    estimator = LinearSVC(dual=False, C=1.0)
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=step, verbose=False)
    selector = selector.fit(X_train, y_train)
    totals += selector.ranking_
  return totals

# Each algo1 call uses the split seeds offset .. offset + batches - 1, so the
# offset has to advance by the batch count for the three runs to draw disjoint
# sets of subsamples. (Advancing it by only 10 with 40 batches made consecutive
# runs share 30 of their 40 splits, so they were not independent repetitions.)
N_BATCHES = 40
for run in range(3):
  scores = algo1(X, y, batches=N_BATCHES, random_state_change=run * N_BATCHES)
  print(f"Iteration {run + 1} of algo1 on toy example, feature scores:\n{scores}")

Iteration 1 of algo1 on toy example, feature scores:
[67. 80. 74. 77. 79. 56. 76. 79. 80. 60. 76. 78. 40. 76. 41. 77. 78. 57.
 80. 72. 80. 78. 74. 75. 57. 67. 73. 74. 71. 73. 78. 80. 75. 72. 53. 43.
 77. 80. 79. 48. 79. 74. 80. 77.]
Iteration 2 of algo1 on toy example, feature scores:
[65. 80. 78. 78. 79. 57. 76. 79. 80. 61. 75. 79. 41. 76. 41. 76. 77. 58.
 80. 73. 78. 76. 73. 76. 56. 63. 73. 71. 75. 72. 78. 79. 73. 75. 55. 43.
 76. 80. 79. 48. 80. 75. 80. 77.]
Iteration 3 of algo1 on toy example, feature scores:
[61. 80. 78. 79. 79. 58. 75. 79. 80. 65. 75. 79. 42. 77. 41. 75. 77. 54.
 80. 73. 77. 77. 76. 79. 56. 62. 74. 69. 74. 72. 78. 79. 73. 74. 58. 42.
 76. 80. 79. 48. 79. 75. 80. 76.]


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

def pca_scores(X):
  """Order features by the magnitude of their weight in the first principal component.

  Fits a one-component PCA on `X` and returns the 1-based feature positions
  sorted from largest to smallest absolute component weight. Features with
  equal weight are ordered by descending position.
  """
  model = PCA(n_components=1)
  model.fit(X)
  weights = abs(model.components_[0])
  # Sorting (weight, position) pairs in reverse orders primarily by weight and
  # breaks ties on the position, highest first.
  scored = list(zip(weights, range(1, len(weights) + 1)))
  scored.sort(reverse=True)
  return [position for _, position in scored]
  
def DTScore(X, y):
  """Order features by their importance in a fitted decision tree.

  Fits a `DecisionTreeClassifier` (fixed `random_state=42`) on `X`, `y` and
  returns the 1-based feature positions sorted from most to least important.
  Features with equal importance are ordered by descending position.
  """
  tree = DecisionTreeClassifier(random_state=42).fit(X, y)
  importances = tree.feature_importances_
  # (importance, position) pairs: a reverse sort ranks by importance first and
  # resolves ties in favour of the higher position.
  scored = [(importance, position) for position, importance in enumerate(importances, start=1)]
  scored.sort(reverse=True)
  return [position for _, position in scored]

def algo2(X, y):
  """Merge the PCA-based and tree-based feature orderings into one ranking.

  The two orderings are interleaved (tree pick first, then PCA pick, and so
  on), repeated features are dropped keeping their first occurrence, and the
  result is converted to an array indexed by feature.

  Returns:
    1-D float array of length `X.shape[1]`; entry k holds the place
    (1 = best) of the feature whose 1-based position is k + 1.
  """
  pca_order = pca_scores(X)
  tree_order = DTScore(X, y)

  # Alternate between the two orderings, tree-based choice first.
  merged = []
  for tree_feature, pca_feature in zip(tree_order, pca_order):
    merged.append(tree_feature)
    merged.append(pca_feature)

  # dict keys keep insertion order, so this drops later duplicates only.
  unique_order = list(dict.fromkeys(merged))

  n_features = X.shape[1]
  places = np.zeros(n_features)
  for place in range(1, n_features + 1):
    feature_no = unique_order[place - 1]
    # Feature numbers are 1-based, array slots are 0-based.
    places[feature_no - 1] = place
  return places

import random
def set_random_seed(seed_value):
    """Seed NumPy's global RNG and Python's `random` module with `seed_value`."""
    # NumPy first, then the standard library generator.
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed_value)
    
# Re-seed the global RNGs differently before each run and print the ranking
# algo2 produces, so the three printed results can be compared.
for run in range(3):
  set_random_seed(run * 42)
  rankings = algo2(X, y)
  print(f"Iteration {run + 1} of algo2 on toy example, feature rankings:\n{rankings}")
print("algo2 is deterministic so there is no difference between the 3 iterations")


Iteration 1 of algo2 on toy example, feature rankings:
[11. 41. 17. 39. 43. 13. 44. 42. 35. 34. 38. 40.  8. 32. 14. 12.  3. 15.
 30. 19. 37. 36. 28. 25.  2.  4. 22. 20. 18.  6. 33. 31. 29. 27. 26. 24.
 23. 21. 16.  1. 10.  7.  9.  5.]
Iteration 2 of algo2 on toy example, feature rankings:
[11. 41. 17. 39. 43. 13. 44. 42. 35. 34. 38. 40.  8. 32. 14. 12.  3. 15.
 30. 19. 37. 36. 28. 25.  2.  4. 22. 20. 18.  6. 33. 31. 29. 27. 26. 24.
 23. 21. 16.  1. 10.  7.  9.  5.]
Iteration 3 of algo2 on toy example, feature rankings:
[11. 41. 17. 39. 43. 13. 44. 42. 35. 34. 38. 40.  8. 32. 14. 12.  3. 15.
 30. 19. 37. 36. 28. 25.  2.  4. 22. 20. 18.  6. 33. 31. 29. 27. 26. 24.
 23. 21. 16.  1. 10.  7.  9.  5.]
algo2 is deterministic so there is no difference between the 3 iterations
