In [1]:
# https://stackoverflow.com/questions/21971449/how-do-i-increase-the-cell-width-of-the-jupyter-ipython-notebook-in-my-browser

# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that widens elements with class "container" to 98%.
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, LeaveOneOut
from sklearn.metrics import roc_curve, auc, plot_confusion_matrix, plot_precision_recall_curve, classification_report

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from tqdm.notebook import tqdm

# Apply the 'seaborn-whitegrid' style sheet to every figure in this notebook.
# NOTE(review): newer matplotlib releases renamed the seaborn style sheets
# (e.g. 'seaborn-v0_8-whitegrid') — confirm against the installed version.
plt.style.use('seaborn-whitegrid')

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension and use mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import gc
import time
import copy
import torch
import model_utils as u
import model_evaluation as me

In [4]:
# Reproducibility setup (see the PyTorch randomness notes linked in the
# seeding cell): export the cuBLAS workspace setting, then switch PyTorch to
# deterministic algorithms.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
torch.use_deterministic_algorithms(True)

In [5]:
import warnings
warnings.simplefilter('ignore')

In [6]:
# https://pytorch.org/docs/stable/notes/randomness.html
# Fixed seed used by this notebook so that repeated runs are comparable.
seed = 322
# Delegates to the project helper in model_utils; presumably it seeds every
# RNG in use (Python / NumPy / PyTorch) — TODO confirm in model_utils.
u.set_all_seeds(seed)

In [7]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

---
---
---

## Get the names of all files in the folder `D:\CANCER BIOLOGY\DATASET\TCGA\FROM Xena\SECOND_ITERATION\cumulative_lists`

The process is simple:

1. Take one file

2. Collect all the genes mentioned in that file

3. Compute the frequency of each gene

4. Keep only the genes that have a frequency >= 5, i.e. the genes that appeared in at least 50% of the iterations.

In [8]:
# Root folder of the second-iteration results. Defaults to the original
# location; set the SECOND_ITER_PATH environment variable to run the notebook
# on another machine without editing this cell.
second_iter_path = (
    os.environ.get('SECOND_ITER_PATH')
    or 'D:/CANCER BIOLOGY/DATASET/TCGA/FROM Xena/SECOND_ITERATION/'
)
# Later cells build file paths by plain string concatenation, so make sure the
# root always ends with a path separator.
if not second_iter_path.endswith(('/', '\\')):
    second_iter_path += '/'

In [9]:
# os.listdir() returns entries in arbitrary order; sort them so the list files
# are always displayed and processed in the same order.
cumulative_lists = sorted(os.listdir(second_iter_path+"cumulative_lists"))

In [10]:
# Show the file names found in the cumulative_lists folder.
cumulative_lists

['dl_100.kd',
 'dl_150.kd',
 'dl_50.kd',
 'gs_100.kd',
 'gs_150.kd',
 'gs_50.kd',
 'gs_dl_100.kd',
 'gs_dl_150.kd',
 'gs_dl_50.kd',
 'gs_ig_100.kd',
 'gs_ig_150.kd',
 'gs_ig_50.kd',
 'ig_100.kd',
 'ig_150.kd',
 'ig_50.kd',
 'ig_dl_100.kd',
 'ig_dl_150.kd',
 'ig_dl_50.kd',
 'ig_dl_gs_100.kd',
 'ig_dl_gs_150.kd',
 'ig_dl_gs_50.kd',
 'prod_gs_dl_100.kd',
 'prod_gs_dl_150.kd',
 'prod_gs_dl_50.kd',
 'prod_gs_ig_100.kd',
 'prod_gs_ig_150.kd',
 'prod_gs_ig_50.kd',
 'prod_ig_dl_100.kd',
 'prod_ig_dl_150.kd',
 'prod_ig_dl_50.kd',
 'prod_ig_dl_gs_100.kd',
 'prod_ig_dl_gs_150.kd',
 'prod_ig_dl_gs_50.kd']

In [11]:
# Minimum number of occurrences for a gene to be kept. Per the notes above,
# 5 corresponds to appearing in at least 50% of the iterations.
MIN_GENE_FREQUENCY = 5

cumulative_lists_dir = second_iter_path+'cumulative_lists/'
freq_geq_5_dir = second_iter_path+'cumulative_lists_results/freq_geq_5/'
# Create the output folder on first run instead of failing in open(..., "w").
os.makedirs(freq_geq_5_dir, exist_ok=True)

for ls in cumulative_lists:
    # Each input file holds one gene name per line. Blank lines are skipped so
    # that an empty string is never counted (and written out) as a gene.
    with open(cumulative_lists_dir+ls, "r") as src:
        gene_list = sorted(gene for gene in (line.strip() for line in src) if gene)

    # Build the frequency table from the value_counts() Series itself. The
    # column labels produced by value_counts().reset_index() changed in
    # pandas 2.0 (the counts column became 'count' instead of 0), so renaming
    # them by label leaves no 'Frequency' column on newer pandas.
    gene_counts = pd.Series(gene_list).value_counts()
    df_gene_list = pd.DataFrame({'Gene Names': gene_counts.index,
                                 'Frequency': gene_counts.values})

    # Keep the genes that reach the threshold, in ascending name order.
    genes_freq_5_or_above = sorted(
        df_gene_list.loc[df_gene_list['Frequency']>=MIN_GENE_FREQUENCY, 'Gene Names']
    )

    # Write the result under the same file name, one gene per line.
    with open(freq_geq_5_dir+ls, "w") as dst:
        dst.writelines("%s\n" % gene for gene in genes_freq_5_or_above)