In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error, log_loss
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np


In [2]:
# Chargement des données
df_data = pd.read_csv("clean_data.csv")
df_metadata = pd.read_csv("metadata.csv")
# df_data_t = df_data.transpose()
# df_metadata

# Convertir les données en ndarray et supprimer les colonnes inutiles
X = df_data.loc[:, ~df_data.columns.str.contains('^Unnamed')].values
X = X.T

# Générer les labels en fonction d'une colonne choisie
s = pd.Series(df_metadata["tumor_stage"].values)
labels,values = pd.factorize(s)
y = labels

print("Shape of X :", X.shape)
print("Shape of y :", y.shape)

print("labels :", values)


Shape of X : (685, 20103)
Shape of y : (685,)
labels : Index(['stage ib', 'stage ia', 'stage i', 'stage iib', 'stage iv',
       'stage iiia', 'not reported', 'stage iia', 'stage ii', 'stage iiib'],
      dtype='object')


In [3]:
# selection des k best features grâce au test chi2

chi2_selector = SelectKBest(chi2, k=1000)
X = chi2_selector.fit_transform(X, y)
print(X.shape)

(685, 1000)


In [13]:
X_2.shape

(1000,)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # stratify=y, mais une classe avec 1 occurences,on vire cette classe? 

## Données originales

In [5]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components = 100)
A = svd.fit_transform(X_train)
T = svd.components_
print(A.shape)
print(T.shape)
D = np.dot(A,T)
print(D.shape)
################ WARNING ###############
# pearson = np.corrcoef(D,X_art)
# print("Pearson =", pearson) 
########################################

rmse = mean_squared_error(D,X_train)
print("RMSE =", rmse)

(548, 100)
(100, 1000)
(548, 1000)
RMSE = 0.0014479786530833807


In [6]:
from sklearn.svm import SVC

model = SVC(C=10.)
model.fit(A,y_train)
pred_train = model.predict(A)

print("Train:")
print(classification_report(y_train,pred_train))
A_test = svd.fit_transform(X_test)
pred_test = model.predict(A_test)

print("Test:")
print(classification_report(y_test,pred_test))



Train:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        49
           0       0.46      0.79      0.58       141
           1       0.51      0.82      0.63       131
           2       1.00      0.40      0.57         5
           3       0.92      0.19      0.32        63
           4       1.00      0.23      0.38        26
           5       0.87      0.19      0.31        68
           6       1.00      0.11      0.20         9
           7       1.00      0.14      0.25        42
           8       0.00      0.00      0.00         1
           9       1.00      0.31      0.47        13

   micro avg       0.57      0.57      0.57       548
   macro avg       0.80      0.38      0.43       548
weighted avg       0.72      0.57      0.52       548

Test:
              precision    recall  f1-score   support

          -1       0.45      0.75      0.56        12
           0       0.24      0.60      0.34        30
           

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Genérer des données méthode 1 : moyenne, variance

In [12]:
def generate_artificial_data(data, lbls, nb_of_new, noise = True):
    
    sigma_noise = 10e-2
    new_X = np.zeros((data.shape[1],0))
    new_y = np.zeros((0))

    for label in np.unique(y):

        X_cond = data[np.where(lbls==label)] # X correspondant à un label    
        # estimation des paramètres de la normale
        mean_X = np.mean(X_cond, axis=0)
        std_X = np.std(X_cond, axis=0) 

        # nombre de nouveaux patiends à ajouter

        nb_new_patients_cond = int(nb_of_new*(len(lbls[np.where(lbls==label)])/lbls.shape[0]))
        

        # generer les labels
        
        y_cond = np.empty(nb_new_patients_cond)
        y_cond.fill(label)

        
        # boucle qui genere des nombres aléatoires suivant
        # une loi normale avec les paramètres calculés

        new_X_cond = np.zeros((0,nb_new_patients_cond))

        for i in range(X.shape[1]):
            new_feature = np.random.normal(mean_X[i], std_X[i], (1,nb_new_patients_cond))
            if(noise):
                new_feature += np.random.normal(0,1, new_feature.shape)*sigma_noise
            new_X_cond = np.append(new_X_cond,new_feature, axis=0)

        new_X = np.concatenate((new_X,new_X_cond),axis=1)
        new_y = np.append(new_y, y_cond)

    new_X = MinMaxScaler().fit_transform(new_X.T)
    
    
    print(new_X.shape)
    print(new_y.shape)
    
    return (new_X,new_y)



In [13]:
X_art,y_art = generate_artificial_data(X_train,y_train,15001)

#X_train, X_test, y_train, y_test = train_test_split(X_art, y_art, test_size=0.2, random_state=42)

(14995, 1000)
(14995,)


In [16]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components = 100)
A = svd.fit_transform(X_art)
A_test = svd.fit_transform(X_test)

T = svd.components_
print(A.shape)
print(T.shape)
D = np.dot(A,T)
print(D.shape)
################ WARNING ###############
# pearson = np.corrcoef(D,X_art)
# print("Pearson =", pearson) 
########################################

rmse = mean_squared_error(D,X_art)
print("RMSE =", rmse)

(14995, 100)
(100, 1000)
(14995, 1000)
RMSE = 0.10442430121866922


In [17]:
from sklearn.svm import SVC

model = SVC(C=10.)
model.fit(A_train,y_art)
pred_train = model.predict(A_train)

print("Train:")
print(classification_report(y_art,pred_train))
pred_test = model.predict(A_test)

print("Test:")
print(classification_report(y_test,pred_test))



Train:
              precision    recall  f1-score   support

        -1.0       1.00      1.00      1.00      1341
         0.0       0.77      0.84      0.81      3859
         1.0       0.88      0.91      0.90      3586
         2.0       1.00      1.00      1.00       136
         3.0       0.79      0.79      0.79      1724
         4.0       0.91      0.83      0.87       711
         5.0       0.71      0.56      0.62      1861
         6.0       1.00      1.00      1.00       246
         7.0       0.98      0.97      0.97      1149
         8.0       1.00      1.00      1.00        27
         9.0       0.98      0.90      0.94       355

   micro avg       0.85      0.85      0.85     14995
   macro avg       0.91      0.89      0.90     14995
weighted avg       0.85      0.85      0.85     14995

Test:
              precision    recall  f1-score   support

        -1.0       0.13      1.00      0.22        12
         0.0       0.00      0.00      0.00        30
         1.

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## Genérer des données méthode 2 : SMOTE

In [19]:
# 6 occurences nécessaire pour Smote ou Adasyn, on enlève la classe 8 et on duplique occurence de 2  la classe 5 


from collections import Counter
from imblearn.over_sampling import ADASYN, SMOTE

recounted = Counter(y_train)
print(recounted)

X_train_sans_8= X_train[np.where(y_train!=8)  ]
X_2 = X_train[np.where(y_train==2)  ][0]
print(X_train_sans_8.shape)
print(X_2.shape)
X_train_sans_8_1 = np.vstack( [X_train_sans_8,X_2  ])
X_train_sans_8_1.shape



y_train_sans_8= y_train[np.where(y_train!=8)  ]

y_train_sans_8 = np.append(y_train_sans_8, 2)

Counter({0: 141, 1: 131, 5: 68, 3: 63, -1: 49, 7: 42, 4: 26, 9: 13, 6: 9, 2: 5, 8: 1})
(547, 1000)
(1000,)


In [20]:

# nombre d'occurences désirées par classe 
# # on a enlevé la classe 8 car qu'une occurence impossible d'appliquer SMOTE ou ADASYN
dict= {0: 1000, 1: 1000, 5: 1000, 3: 1000, -1: 1000, 7: 1000, 4: 1000, 9: 1000, 6: 1000, 2: 1000}  
smote = SMOTE(random_state=42, sampling_strategy=dict)
X_resampled, y_resampled = smote.fit_resample(X_train_sans_8_1, y_train_sans_8)
print(X_resampled.shape)
print(X_train.shape)


  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))


(10000, 1000)
(548, 1000)


In [21]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components = 100)
A = svd.fit_transform(X_resampled)
A_test = svd.fit_transform(X_test)

T = svd.components_
print(A.shape)
print(T.shape)
D = np.dot(A,T)
print(D.shape)
################ WARNING ###############
# pearson = np.corrcoef(D,X_art)
# print("Pearson =", pearson) 
########################################

rmse = mean_squared_error(D,X_resampled)
print("RMSE =", rmse)


(10000, 100)
(100, 1000)
(10000, 1000)
RMSE = 0.01251947645624487


In [22]:
from sklearn.svm import SVC

model = SVC(C=10.)
model.fit(A,y_resampled)
pred_train = model.predict(A)

print("Train:")
print(classification_report(y_resampled,pred_train))
pred_test = model.predict(A_test)

print("Test:")
print(classification_report(y_test,pred_test))





Train:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      1000
           0       0.88      0.81      0.84      1000
           1       0.78      0.83      0.81      1000
           2       1.00      1.00      1.00      1000
           3       0.92      0.91      0.92      1000
           4       0.98      1.00      0.99      1000
           5       0.92      0.91      0.91      1000
           6       1.00      1.00      1.00      1000
           7       0.97      0.98      0.98      1000
           9       1.00      1.00      1.00      1000

   micro avg       0.94      0.94      0.94     10000
   macro avg       0.94      0.94      0.94     10000
weighted avg       0.94      0.94      0.94     10000

Test:
              precision    recall  f1-score   support

          -1       0.52      0.92      0.67        12
           0       0.29      0.30      0.30        30
           1       0.21      0.22      0.21        27
           

  'recall', 'true', average, warn_for)


In [23]:
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN

model1 = XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)

model1.fit(A,y_resampled)
pred_train = model1.predict(A)

print("Train:")
print(classification_report(y_resampled,pred_train))
pred_test = model1.predict(A_test)

print("Test:")
print(classification_report(y_test,pred_test))

Train:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      1000
           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00      1000
           2       1.00      1.00      1.00      1000
           3       1.00      1.00      1.00      1000
           4       1.00      1.00      1.00      1000
           5       1.00      1.00      1.00      1000
           6       1.00      1.00      1.00      1000
           7       1.00      1.00      1.00      1000
           9       1.00      1.00      1.00      1000

   micro avg       1.00      1.00      1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Test:
              precision    recall  f1-score   support

          -1       0.79      0.92      0.85        12
           0       0.28      0.43      0.34        30
           1       0.20      0.15      0.17        27
           

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### Générer des données méthode 3 : ADASYN

In [26]:
# nombre d'occurences désirées par classe 
# # on a enlevé la classe 8 car qu'une occurence impossible d'appliquer SMOTE ou ADASYN
dict= {0: 1000, 1: 1000, 5: 1000, 3: 1000, -1: 1000, 7: 1000, 4: 1000, 9: 1000, 6: 1000, 2: 1000}  
ada = ADASYN(random_state=42, sampling_strategy=dict)
X_resampled_ada, y_resampled_ada = ada.fit_resample(X_train_sans_8_1, y_train_sans_8)
print(X_resampled.shape)
print(X_train.shape)

  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))


(10000, 1000)
(548, 1000)


In [27]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components = 100)
A = svd.fit_transform(X_resampled_ada)
A_test = svd.fit_transform(X_test)

T = svd.components_
print(A.shape)
print(T.shape)
D = np.dot(A,T)
print(D.shape)
################ WARNING ###############
# pearson = np.corrcoef(D,X_art)
# print("Pearson =", pearson) 
########################################

rmse = mean_squared_error(D,X_resampled_ada)
print("RMSE =", rmse)


(9991, 100)
(100, 1000)
(9991, 1000)
RMSE = 0.011733816776209374


In [28]:
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN

model1 = XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)

model1.fit(A,y_resampled_ada)
pred_train = model1.predict(A)

print("Train:")
print(classification_report(y_resampled_ada,pred_train))
pred_test = model1.predict(A_test)

print("Test:")
print(classification_report(y_test,pred_test))

Train:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      1001
           0       1.00      1.00      1.00       978
           1       1.00      1.00      1.00       997
           2       1.00      1.00      1.00      1002
           3       1.00      1.00      1.00       987
           4       1.00      1.00      1.00       998
           5       1.00      1.00      1.00      1014
           6       1.00      1.00      1.00      1003
           7       1.00      1.00      1.00      1012
           9       1.00      1.00      1.00       999

   micro avg       1.00      1.00      1.00      9991
   macro avg       1.00      1.00      1.00      9991
weighted avg       1.00      1.00      1.00      9991

Test:
              precision    recall  f1-score   support

          -1       0.69      0.92      0.79        12
           0       0.13      0.17      0.15        30
           1       0.20      0.19      0.19        27
           

In [12]:
X_train_art,y_train_art = generate_artificial_data(X,y,15001)
X_test_art, y_test_art = generate_artificial_data(X,y,5001)

df_train_X = pd.DataFrame(X_train_art)
df_train_X.to_csv("data/challenge/train/data.csv")
df_train_y = pd.DataFrame(y_train_art)
df_train_y.to_csv("data/challenge/train/metadata.csv")

df_test_X = pd.DataFrame(X_test_art)
df_test_X.to_csv("data/challenge/test/data.csv")
df_test_y = pd.DataFrame(y_test_art)
df_test_y.to_csv("data/challenge/test/metadata.csv")

(14994, 1000)
(14994,)
(4996, 1000)
(4996,)


In [14]:
ar1 = np.zeros((1,5))
print(ar1)
print(ar1 + np.random.normal(0,1, ar1.shape)*1e-2)

[[0. 0. 0. 0. 0.]]
[[ 0.00163597  0.0052493   0.00546016 -0.00536443  0.0025316 ]]
