In [22]:
import numpy as np
import pandas as pd

In [23]:
# Specify the path to your .txt file
file_path_survial_data = 'Dataset/survival-OV_survival.txt'

# Define column names based on your data
columns_survival = ["sample", "_PATIENT", "OS.time"]

# Read the data from the file into a Pandas DataFrame
df_survival = pd.read_csv(file_path_survial_data, delimiter=r'\s+', header=0, na_values=["", " ", "Redacted"])

# Create a new DataFrame with the selected columns
df_survival = df_survival[columns_survival]
df_survival = df_survival.dropna()

print(df_survival)


              sample      _PATIENT  OS.time
0    TCGA-04-1331-01  TCGA-04-1331   1336.0
1    TCGA-04-1332-01  TCGA-04-1332   1247.0
2    TCGA-04-1335-01  TCGA-04-1335     55.0
3    TCGA-04-1336-01  TCGA-04-1336   1495.0
4    TCGA-04-1337-01  TCGA-04-1337     61.0
..               ...           ...      ...
599  TCGA-61-2614-01  TCGA-61-2614    262.0
600  TCGA-OY-A56P-01  TCGA-OY-A56P   1207.0
601  TCGA-OY-A56Q-01  TCGA-OY-A56Q    576.0
602  TCGA-VG-A8LO-01  TCGA-VG-A8LO     24.0
603  TCGA-WR-A838-01  TCGA-WR-A838    304.0

[602 rows x 3 columns]


In [24]:
# Specify the path to your .txt file
file_path_vital = 'Dataset/TCGA.OV.sampleMap-OV_clinicalMatrix'

# Read the data from the file into a Pandas DataFrame
df_vital = pd.read_csv(file_path_vital, sep='\t', header=0, na_values=["", " ", "Redacted"])

# Create a new DataFrame with the selected columns
df_vital = df_vital[["sampleID", "vital_status"]]
df_vital = df_vital.dropna()

print(df_vital)


            sampleID vital_status
10   TCGA-04-1331-01     DECEASED
11   TCGA-04-1332-01     DECEASED
12   TCGA-04-1335-01     DECEASED
13   TCGA-04-1336-01       LIVING
14   TCGA-04-1337-01     DECEASED
..               ...          ...
615  TCGA-61-2614-01     DECEASED
626  TCGA-OY-A56P-01       LIVING
627  TCGA-OY-A56Q-01       LIVING
628  TCGA-VG-A8LO-01     DECEASED
629  TCGA-WR-A838-01     DECEASED

[605 rows x 2 columns]


In [25]:
df_merged = pd.merge(df_survival, df_vital, left_on='sample', right_on='sampleID', how='inner')
df_merged['cancer_status'] = ((df_merged['OS.time'] < 1095) | (df_merged['vital_status'] == 'DECEASED')).astype(int)
print(df_merged)

              sample      _PATIENT  OS.time         sampleID vital_status  \
0    TCGA-04-1331-01  TCGA-04-1331   1336.0  TCGA-04-1331-01     DECEASED   
1    TCGA-04-1332-01  TCGA-04-1332   1247.0  TCGA-04-1332-01     DECEASED   
2    TCGA-04-1335-01  TCGA-04-1335     55.0  TCGA-04-1335-01     DECEASED   
3    TCGA-04-1336-01  TCGA-04-1336   1495.0  TCGA-04-1336-01       LIVING   
4    TCGA-04-1337-01  TCGA-04-1337     61.0  TCGA-04-1337-01     DECEASED   
..               ...           ...      ...              ...          ...   
597  TCGA-61-2614-01  TCGA-61-2614    262.0  TCGA-61-2614-01     DECEASED   
598  TCGA-OY-A56P-01  TCGA-OY-A56P   1207.0  TCGA-OY-A56P-01       LIVING   
599  TCGA-OY-A56Q-01  TCGA-OY-A56Q    576.0  TCGA-OY-A56Q-01       LIVING   
600  TCGA-VG-A8LO-01  TCGA-VG-A8LO     24.0  TCGA-VG-A8LO-01     DECEASED   
601  TCGA-WR-A838-01  TCGA-WR-A838    304.0  TCGA-WR-A838-01     DECEASED   

     cancer_status  
0                1  
1                1  
2           

In [11]:
df_class = df_merged[['sample', '_PATIENT', 'cancer_status']]
df_class = df_class.rename(columns={'sample': 'sampleID'})
print(df_class)

            sampleID      _PATIENT  cancer_status
0    TCGA-04-1331-01  TCGA-04-1331              1
1    TCGA-04-1332-01  TCGA-04-1332              1
2    TCGA-04-1335-01  TCGA-04-1335              1
3    TCGA-04-1336-01  TCGA-04-1336              0
4    TCGA-04-1337-01  TCGA-04-1337              1
..               ...           ...            ...
597  TCGA-61-2614-01  TCGA-61-2614              1
598  TCGA-OY-A56P-01  TCGA-OY-A56P              0
599  TCGA-OY-A56Q-01  TCGA-OY-A56Q              1
600  TCGA-VG-A8LO-01  TCGA-VG-A8LO              1
601  TCGA-WR-A838-01  TCGA-WR-A838              1

[602 rows x 3 columns]


In [12]:
value_counts = df_class['cancer_status'].value_counts()
print(value_counts)

cancer_status
1    503
0     99
Name: count, dtype: int64


In [13]:
# Specify the path to your .txt file
file_path_PANCAN = 'Dataset/TCGA.OV.sampleMap_HiSeqV2_PANCAN'

# Read the data from the file into a Pandas DataFrame
df_PANCAN = pd.read_csv(file_path_PANCAN, delimiter=r'\s+', header=0, na_values=["", " ", "Redacted"], index_col=0)

# Create a new DataFrame with the selected columns
df_PANCAN = df_PANCAN.transpose()
df_PANCAN = df_PANCAN.reset_index()
df_PANCAN = df_PANCAN.rename(columns={'index': 'sampleID'})

merged2_df = pd.merge(df_PANCAN, df_class, left_on='sampleID', right_on='sampleID', how='inner')
df_PANCAN = merged2_df[df_PANCAN.columns]
df_PANCAN = df_PANCAN.dropna()
print(df_PANCAN)


            sampleID  ARHGEF10L     HIF3A     RNF17     RNF10     RNF11  \
0    TCGA-61-1910-01  -0.262892 -2.276126 -0.531035  0.657528 -0.865078   
1    TCGA-61-1728-01  -0.543692  1.810774 -0.531035  0.071628 -0.181678   
2    TCGA-09-1666-01  -1.668892  0.123074 -0.531035 -0.753772  0.343322   
3    TCGA-24-1469-01  -1.281392  4.097674 -0.531035 -0.333972 -0.507178   
4    TCGA-04-1348-01  -1.144492  4.363674 -0.531035  0.358628 -0.729078   
..               ...        ...       ...       ...       ...       ...   
303  TCGA-25-2404-01  -0.650692  1.622274 -0.531035  0.275528 -0.180578   
304  TCGA-61-2095-01   0.385408  4.484274  0.290565  0.115528 -0.228178   
305  TCGA-29-1702-01  -1.699992  5.387374 -0.531035 -0.670972  0.308122   
306  TCGA-24-1417-01  -1.256792  3.476574 -0.531035  0.012128  0.070622   
307  TCGA-57-1585-01   0.229008  3.444374 -0.531035 -0.139572 -0.144078   

       RNF13   GTF2IP1      REM1     MTVR2  ...     TULP2     NPY5R     GNGT2  \
0   -1.81391  0.37

In [14]:
# Specify the path to your .txt file
file_path_HumanMethylation27 = 'Dataset/TCGA.OV.sampleMap_HumanMethylation27'

# Read the data from the file into a Pandas DataFrame
df_HumanMethylation27 = pd.read_csv(file_path_HumanMethylation27, delimiter=r'\s+', header=0, na_values=["", " ", "Redacted"], index_col=0)

# Create a new DataFrame with the selected columns
df_HumanMethylation27 = df_HumanMethylation27.transpose()
df_HumanMethylation27 = df_HumanMethylation27.reset_index()
df_HumanMethylation27 = df_HumanMethylation27.rename(columns={'index': 'sampleID'})
merged1_df = pd.merge(df_HumanMethylation27, df_class, left_on='sampleID', right_on='sampleID', how='inner')
df_HumanMethylation27 = merged1_df[df_HumanMethylation27.columns]
# Assuming df_HumanMethylation27 is your DataFrame
threshold = 5

# Drop columns with more than 10 missing values
df_HumanMethylation27 = df_HumanMethylation27.dropna(axis=1, thresh=len(df_HumanMethylation27) - threshold)
df_HumanMethylation27 = df_HumanMethylation27.dropna()
print(df_HumanMethylation27)
# print(df_HumanMethylation27.isnull().sum())


            sampleID  cg00000292  cg00002426  cg00003994  cg00005847  \
2    TCGA-13-1819-02      0.7734      0.7666      0.0284      0.1780   
3    TCGA-31-1953-01      0.7804      0.0808      0.0303      0.6370   
4    TCGA-13-1819-01      0.8053      0.3654      0.0438      0.5863   
5    TCGA-25-2392-01      0.8874      0.2355      0.0418      0.8103   
7    TCGA-24-1552-01      0.8990      0.0202      0.0235      0.0579   
..               ...         ...         ...         ...         ...   
585  TCGA-24-1545-01      0.8506      0.2008      0.0182      0.2712   
586  TCGA-10-0935-01      0.8021      0.0380      0.6720      0.4855   
587  TCGA-04-1638-01      0.6975      0.0258      0.0453      0.9007   
588  TCGA-23-1118-01      0.9012      0.0863      0.0363      0.5696   
589  TCGA-36-2547-01      0.8258      0.1884      0.0450      0.6160   

     cg00008493  cg00008713  cg00009407  cg00010193  cg00011459  ...  \
2        0.9898      0.0105      0.0095      0.6027      0.9542

In [15]:
# Specify the path to your .txt file
file_path_Gistic2 = 'Dataset/TCGA.OV.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes'

# Read the data from the file into a Pandas DataFrame
df_Gistic2 = pd.read_csv(file_path_Gistic2, delimiter=r'\s+', header=0, na_values=["", " ", "Redacted"], index_col=0)

# Create a new DataFrame with the selected columns
df_Gistic2 = df_Gistic2.transpose()
df_Gistic2 = df_Gistic2.reset_index()
df_Gistic2 = df_Gistic2.rename(columns={'index': 'sampleID'})
merged1_df = pd.merge(df_Gistic2, df_class, left_on='sampleID', right_on='sampleID', how='inner')
df_Gistic2 = merged1_df[df_Gistic2.columns]
df_Gistic2 = df_Gistic2.dropna()
print(df_Gistic2)

            sampleID  ACAP3  ACTRT2   AGRN  ANKRD65  ATAD3A  ATAD3B  ATAD3C  \
0    TCGA-04-1331-01  0.080   0.080  0.080    0.080   0.080   0.080   0.080   
1    TCGA-04-1332-01 -0.807  -0.807 -0.807   -0.807  -0.807  -0.807  -0.807   
2    TCGA-04-1335-01  0.101   0.101  0.101    0.101   0.101   0.101   0.101   
3    TCGA-04-1336-01  0.021   0.021  0.021    0.021   0.021   0.021   0.021   
4    TCGA-04-1337-01 -0.999  -0.999 -0.999   -0.999  -0.999  -0.999  -0.999   
..               ...    ...     ...    ...      ...     ...     ...     ...   
561  TCGA-61-2613-01 -0.134  -0.134 -0.134   -0.134  -0.134  -0.134  -0.134   
562  TCGA-61-2614-01  0.224   0.224  0.224    0.224   0.224   0.224   0.224   
563  TCGA-OY-A56P-01  0.000   0.000  0.000    0.000   0.000   0.000   0.000   
564  TCGA-OY-A56Q-01  0.137   0.137  0.137    0.137   0.137   0.137   0.137   
565  TCGA-VG-A8LO-01 -0.176  -0.176 -0.176   -0.176  -0.176  -0.176  -0.176   

     AURKAIP1  B3GALT6  ...  SMIM9  SNORA36A  SNORA

In [16]:
import pandas as pd

# Assuming 'sampleID' is the common column in all dataframes

# Get the unique 'sampleID' from each dataframe
sampleID_Gistic2 = set(df_Gistic2['sampleID'])
sampleID_HumanMethylation27 = set(df_HumanMethylation27['sampleID'])
sampleID_PANCAN = set(df_PANCAN['sampleID'])
sampleID_class = set(df_class['sampleID'])

# Find the common 'sampleID' using set intersection
common_sampleID = (
    sampleID_Gistic2
    & sampleID_HumanMethylation27
    & sampleID_PANCAN
    & sampleID_class
)

# Create a dataframe with the common 'sampleID'
df_common_sampleID = pd.DataFrame({'sampleID': list(common_sampleID)})

# Display the resulting dataframe
# print(df_common_sampleID)

# Get the common 'sampleID' set
common_sampleID_set = set(df_common_sampleID['sampleID'])

# Filter each dataframe to include only rows with common 'sampleID'
df_Gistic2 = df_Gistic2[df_Gistic2['sampleID'].isin(common_sampleID_set)]
df_HumanMethylation27 = df_HumanMethylation27[df_HumanMethylation27['sampleID'].isin(common_sampleID_set)]
df_PANCAN = df_PANCAN[df_PANCAN['sampleID'].isin(common_sampleID_set)]
df_class = df_class[df_class['sampleID'].isin(common_sampleID_set)]


In [17]:
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler
scaler = MinMaxScaler()

def Scaler(df):
    # Extract numeric columns (you may need to adjust this based on your dataframe)
    numeric_columns = df.select_dtypes(include=['number']).columns
    # Fit and transform the numeric columns
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
    return df


df_Gistic2 = Scaler(df_Gistic2)
df_HumanMethylation27 = Scaler(df_HumanMethylation27)
df_PANCAN = Scaler(df_PANCAN)



In [18]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

def VarianceThreshold(df, threshold = 0.02):
    # Save the 'sampleID' column
    sampleID_column = df['sampleID']

    # Drop the 'sampleID' column before calculating variance
    df_without_sampleID = df.drop(columns=['sampleID'])

    # Calculate variance for each remaining column
    variances = df_without_sampleID.var()

    # Filter columns with variance >= threshold
    selected_columns = variances[variances >= threshold].index

    # Add back the 'sampleID' column to the selected columns
    selected_columns = ['sampleID'] + list(selected_columns)

    # Create a new DataFrame with selected columns
    filtered_df = df[selected_columns]

    # Display the resulting DataFrame
    return filtered_df
    
df_Gistic2 = VarianceThreshold(df_Gistic2)
df_HumanMethylation27 = VarianceThreshold(df_HumanMethylation27)
df_PANCAN = VarianceThreshold(df_PANCAN)

In [19]:
def sortData(df):
    # Sort DataFrame based on 'sampleID'
    df_sorted = df.sort_values(by='sampleID')

    # Reset the index
    df_sorted = df_sorted.reset_index(drop=True)

    # Display the sorted and reset DataFrame
    return df_sorted

df_Gistic2 = sortData(df_Gistic2)
df_HumanMethylation27 = sortData(df_HumanMethylation27)
df_PANCAN = sortData(df_PANCAN)
df_class = sortData(df_class)

In [20]:
th = 0.04
df_Gistic21 = VarianceThreshold(df_Gistic2, th)
df_HumanMethylation271 = VarianceThreshold(df_HumanMethylation27, th)
df_PANCAN1 = VarianceThreshold(df_PANCAN, th)
print(df_PANCAN1.shape)
print(df_HumanMethylation271.shape)
print(df_Gistic21.shape)

(217, 3155)
(217, 5245)
(217, 1175)


In [21]:
value_counts = df_class['cancer_status'].value_counts()
print(value_counts)

cancer_status
1    192
0     25
Name: count, dtype: int64


In [15]:
df_0 = df_PANCAN1                                 #0
df_1 = df_Gistic21                                #1
df_2 = df_HumanMethylation271                     #2

n0 = df_0.shape[0]
n1 = df_1.shape[0]
n2 = df_2.shape[0]


In [16]:
from sklearn.linear_model import Lasso
import numpy as np
import pandas as pd

def LassoFeatureSelection(df, target):
    # Assuming df_0 is your DataFrame containing features

    # Separate the features (X) and the target variable (y) if applicable
    X = df.drop(columns=['sampleID'])  # Remove 'target_column' if it exists
    y = target  # Replace 'target_column' with your target variable name

    # Step 1: Instantiate Lasso Model
    lasso = Lasso(alpha=0.001)  # You can adjust the alpha value as needed

    # Step 2: Fit Model
    lasso.fit(X, y)

    # Step 3: Get Feature Importance
    feature_importance = np.abs(lasso.coef_)

    # Step 4: Select Top Features
    num_top_features = 100  # Number of top features to select
    top_feature_indices = np.argsort(feature_importance)[::-1][:num_top_features]
    top_features = X.columns[top_feature_indices]

    # Create a new DataFrame with the top features
    df_top_features = df[top_features]

    df_top_features = pd.concat([df['sampleID'], df_top_features], axis=1)

    # Display the new DataFrame with top features
    return df_top_features


In [17]:
df_0 = LassoFeatureSelection(df_0,df_class['cancer_status'])
df_1 = LassoFeatureSelection(df_1,df_class['cancer_status'])
df_2 = LassoFeatureSelection(df_2,df_class['cancer_status'])

In [18]:
# Assuming df_0 is your DataFrame
# column_names = df_0.columns

# Create a map0 mapping column names to their numbers indexed from 1
map0 = {column: (0, index) for index, column in enumerate(df_0.columns)}
map1 = {column: (1, index) for index, column in enumerate(df_1.columns)}
map2 = {column: (2, index) for index, column in enumerate(df_2.columns)}

print(len(map0))
print(len(map1))
print(len(map2))

# Display map0
# print(map2)

# Assuming map0, map1, map2 are your dictionaries
Feature_to_Index = {}

# Merge map0, map1, and map2 into mapF
Feature_to_Index.update(map2)
print(len(Feature_to_Index))
Feature_to_Index.update(map1)
print(len(Feature_to_Index))
Feature_to_Index.update(map0)
print(len(Feature_to_Index))

# Display mapF

# Create the opposite mapping of mapF
Index_to_Feature = {value: key for key, value in Feature_to_Index.items()}


101
101
101
101
201
300


In [19]:
column_to_index = {column: index for index, column in enumerate(Feature_to_Index.keys())}
index_to_column = {index: column for column, index in column_to_index.items()}
dfs = {0: df_0, 1: df_1, 2: df_2}

In [20]:
from scipy.stats import pearsonr

n_ = len(index_to_column)
g_Function = np.zeros((n_, n_))

for i in range(1,n_):
    for j in range(1,n_):
        col1 = index_to_column[i]
        col2 = index_to_column[j]

        add1 = Feature_to_Index[col1]
        add2 = Feature_to_Index[col2]

        corr_coefficient, p_value = pearsonr(dfs[add1[0]][col1], dfs[add2[0]][col2])
        g_Function[i][j] = abs(corr_coefficient)

print(g_Function)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.04740484 ... 0.07218802 0.09282057 0.04867271]
 [0.         0.04740484 1.         ... 0.06279655 0.12465636 0.11964636]
 ...
 [0.         0.07218802 0.06279655 ... 1.         0.11501993 0.21688761]
 [0.         0.09282057 0.12465636 ... 0.11501993 1.         0.03982409]
 [0.         0.04867271 0.11964636 ... 0.21688761 0.03982409 1.        ]]


In [21]:
df_merged = pd.merge(df_2, df_1, on='sampleID', how='inner')  # Adjust 'how' parameter as needed

# Merge df_merged with df_2
df_merged = pd.merge(df_merged, df_0, on='sampleID', how='inner')  # Adjust 'how' parameter as needed

# Drop the 'ARHGEF38_x' column
df_merged.drop(columns=['ARHGEF38_x'], inplace=True)

# Rename the 'ARHGEF38_y' column to 'ARHGEF38'
df_merged.rename(columns={'ARHGEF38_y': 'ARHGEF38'}, inplace=True)

print(df_merged)

            sampleID  cg02600394  cg17987660  cg21634602  cg19682367  \
0    TCGA-04-1348-01    0.434663    0.207875    0.002905    0.550188   
1    TCGA-04-1357-01    0.252407    0.217949    0.002783    0.699784   
2    TCGA-04-1362-01    0.950481    0.076313    0.007866    0.210095   
3    TCGA-04-1364-01    0.686382    0.053419    0.016580    0.009343   
4    TCGA-04-1365-01    0.660935    0.025946    0.007382    0.931526   
..               ...         ...         ...         ...         ...   
212  TCGA-61-2104-01    0.794360    0.236264    0.009561    0.702062   
213  TCGA-61-2109-01    0.904402    0.022589    0.015854    0.158141   
214  TCGA-61-2110-01    0.853508    0.031746    0.005446    0.877407   
215  TCGA-61-2111-01    0.799862    0.219170    0.019121    0.461319   
216  TCGA-61-2113-01    0.768226    0.843407    0.014765    0.292583   

     cg13754949  cg26177629  cg25094569  cg25195673  cg09542291  ...  \
0      0.307317    0.010408    0.634611    0.132050    0.917344

In [22]:
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

# Calculate Chi-squared score
chi2_score, _ = chi2(df_merged.drop(columns=['sampleID']), df_class['cancer_status'])

# Calculate F-statistic (F-Stat)
f_statistic, _ = f_classif(df_merged.drop(columns=['sampleID']), df_class['cancer_status'])

# Calculate Mutual Information (MI)
mi_score = mutual_info_classif(df_merged.drop(columns=['sampleID']), df_class['cancer_status'])

F_statistic = np.concatenate([np.empty(1), f_statistic])
Chi2_score = np.concatenate([np.empty(1), chi2_score])
MI_score = np.concatenate([np.empty(1), mi_score])

print(F_statistic)

[0.00000000e+00 1.45148487e+01 1.36322311e+00 1.03621262e+01
 2.29492911e+00 2.72278903e+00 2.78199440e+00 9.09484520e+00
 1.73412082e+01 9.60239867e+00 2.93001277e+00 1.87549364e+01
 4.24681245e-01 6.83375880e-01 1.29229457e+01 3.46562479e+00
 1.56448505e+01 4.82641879e+00 7.63006913e+00 3.27730568e+00
 1.78435600e-01 4.28644559e+00 1.06800423e+01 1.47990797e+00
 1.26081981e+01 2.59438520e+00 4.76435878e+00 1.50423756e+00
 7.65761998e+00 3.04726257e+00 3.47880812e+00 1.57693503e+01
 8.68882459e+00 5.62703626e+00 2.79514900e+00 7.76997390e-01
 3.89467230e+00 1.19695264e+00 3.86991330e+00 8.95745501e-01
 2.47488705e+00 1.27585671e+00 4.62988984e+00 4.56451063e-01
 8.39782618e-01 8.85756607e-01 8.64584335e-01 8.58802972e-01
 1.08886945e+00 2.82524459e-01 2.16730126e+00 1.25890530e-01
 2.58426615e+00 4.67300007e+00 6.76455332e+00 1.83015074e+00
 7.53210525e+00 6.98402958e+00 4.40399867e+00 1.63015803e-01
 9.72268226e-01 8.09329050e+00 5.38924635e-02 3.89500246e+00
 8.54101087e+00 1.117569

In [144]:
#MRMR-mv ALGORITHM:

In [30]:

def Argument(Selected_Features, current_Feature, f_values, g_values, t):
    value = 0.0
    index = column_to_index[current_Feature]
    value += f_values[index]
    g_sum = 0.0

    for i in Selected_Features:
        g_sum += g_values[index][i]

    g_sum = g_sum/(1.0*pow(t+1,2))

    value -= g_sum
    return value

def MRMRmv_Implementation(k,f_values,g_values,v,p0,p1,p2):
    S = [[] for _ in range(v)]
    V = np.array([0, 1, 2])
    C = np.random.choice(V, size=k-1, p=[p0, p1, p2])
    Selected_Features = []

    max_index = np.argmax(f_values)
    v1 = Feature_to_Index[index_to_column[max_index]]
    Selected_Features.append(max_index)
    S[v1[0]].append(v1[1])

    for i in range (0,k-1):
        view = C[i]

        ArgMax = -100000000
        Select = -1

        for col in dfs[view].columns:
            if(col == 'sampleID'): 
                continue
            if column_to_index[col] in Selected_Features:
                continue

            val = Argument(Selected_Features,col,f_values,g_values,i)

            if(val>ArgMax):
                ArgMax = val
                Select = column_to_index[col]
        
        if Select==-1:
            continue

        Selected_Features.append(Select)
        v = Feature_to_Index[index_to_column[Select]]
        S[v[0]].append(v[1])

    return S, Selected_Features


In [37]:
S, Selected_Features = MRMRmv_Implementation(10,F_statistic, g_Function, 3, 0.33, 0.33, 0.34)

In [40]:
def Create_DF(Selected_Features):
    df_selected = df_0[['sampleID']].copy()

    for i in Selected_Features:
        col = index_to_column[i]
        add = Feature_to_Index[col]

        df_selected = pd.merge(df_selected, dfs[add[0]][['sampleID', col]], on='sampleID', how='inner')
    
    return df_selected

df_selected = Create_DF(Selected_Features)

In [42]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Assuming X contains the features and y contains the target variable

# Split the data into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(df_selected.drop(columns=['sampleID']), df_class['cancer_status'], test_size=0.2, random_state=42)

# Initialize Logistic Regression model
log_reg = LogisticRegression()

# Initialize Stratified K-Folds cross-validator
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(log_reg, X_train, y_train, cv=cv, scoring='accuracy')

# Fit the model to the training data
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
print("Accuracy on test set:", log_reg.score(X_test, y_test))
print("Cross-validated accuracy:", cv_scores.mean())

# Classification report
print(classification_report(y_test, y_pred))


Accuracy on test set: 0.9318181818181818
Cross-validated accuracy: 0.878655462184874
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.93      1.00      0.96        41

    accuracy                           0.93        44
   macro avg       0.47      0.50      0.48        44
weighted avg       0.87      0.93      0.90        44



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [47]:
# Assuming 'series' is your Pandas Series
value_counts = df_class['cancer_status'].value_counts()
print(value_counts)


cancer_status
1    192
0     25
Name: count, dtype: int64
