In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score,classification_report, confusion_matrix
from scipy.linalg import pinv
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [3]:
df = pd.read_csv('data.tsv',sep='\t')

In [4]:
df.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,1,A1BG,0.7,0.538,C0001418,Adenocarcinoma,group,C04,Neoplastic Process,0.01,1.0,2008.0,2008.0,1,0,LHGDN
1,1,A1BG,0.7,0.538,C0002736,Amyotrophic Lateral Sclerosis,disease,C18;C10,Disease or Syndrome,0.01,1.0,2008.0,2008.0,1,0,BEFREE
2,1,A1BG,0.7,0.538,C0003578,Apnea,phenotype,C23;C08,Sign or Symptom,0.01,1.0,2017.0,2017.0,1,0,BEFREE
3,1,A1BG,0.7,0.538,C0003864,Arthritis,disease,C05,Disease or Syndrome,0.01,1.0,2019.0,2019.0,1,0,BEFREE
4,1,A1BG,0.7,0.538,C0008373,Cholesteatoma,disease,C17,Disease or Syndrome,0.01,1.0,2020.0,2020.0,1,0,BEFREE


In [5]:
df['diseaseType'].unique()

array(['group', 'disease', 'phenotype'], dtype=object)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1134942 entries, 0 to 1134941
Data columns (total 16 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   geneId               1134942 non-null  int64  
 1   geneSymbol           1134942 non-null  object 
 2   DSI                  1132358 non-null  float64
 3   DPI                  1132060 non-null  float64
 4   diseaseId            1134942 non-null  object 
 5   diseaseName          1134942 non-null  object 
 6   diseaseType          1134942 non-null  object 
 7   diseaseClass         978539 non-null   object 
 8   diseaseSemanticType  1134942 non-null  object 
 9   score                1134942 non-null  float64
 10  EI                   967968 non-null   float64
 11  YearInitial          967968 non-null   float64
 12  YearFinal            967968 non-null   float64
 13  NofPmids             1134942 non-null  int64  
 14  NofSnps              1134942 non-null  int64  
 15

In [7]:
num_var = df.select_dtypes(exclude='object')

In [8]:
num_var.fillna(num_var.median(),inplace = True)

In [9]:
cat_var = df.select_dtypes(include='object')

In [10]:
cat_var = cat_var.apply(lambda x: x.fillna(x.value_counts().index[0]))

In [13]:
cat_var

Unnamed: 0,geneSymbol,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,source
0,A1BG,C0001418,Adenocarcinoma,group,C04,Neoplastic Process,LHGDN
1,A1BG,C0002736,Amyotrophic Lateral Sclerosis,disease,C18;C10,Disease or Syndrome,BEFREE
2,A1BG,C0003578,Apnea,phenotype,C23;C08,Sign or Symptom,BEFREE
3,A1BG,C0003864,Arthritis,disease,C05,Disease or Syndrome,BEFREE
4,A1BG,C0008373,Cholesteatoma,disease,C17,Disease or Syndrome,BEFREE
...,...,...,...,...,...,...,...
1134937,CEROX1,C0005890,Body Height,phenotype,C04,Organism Attribute,GWASCAT
1134938,MIR223HG,C0023418,leukemia,disease,C04,Neoplastic Process,BEFREE
1134939,MIR223HG,C0023467,"Leukemia, Myelocytic, Acute",disease,C04,Neoplastic Process,BEFREE
1134940,MIR223HG,C0598766,Leukemogenesis,disease,C23;C04,Neoplastic Process,BEFREE


In [24]:
cat_var.source.value_counts()

source
BEFREE                                        789377
HPO                                           148643
CTD_human                                      44786
GWASCAT                                        34270
CLINVAR                                        14326
                                               ...  
BEFREE;CLINGEN;CLINVAR;MGD                         1
CGI;GENOMICS_ENGLAND;LHGDN                         1
BEFREE;CLINGEN;CLINVAR;HPO                         1
BEFREE;CGI;CLINVAR;CTD_human;HPO;LHGDN;RGD         1
BEFREE;CLINGEN;LHGDN;ORPHANET                      1
Name: count, Length: 878, dtype: int64

In [25]:
cat_var.source.nunique()

878

In [22]:
cat_var.diseaseSemanticType.value_counts()

diseaseSemanticType
Disease or Syndrome                                    439266
Neoplastic Process                                     426243
Finding                                                 74835
Mental or Behavioral Dysfunction                        55095
Congenital Abnormality                                  42736
Sign or Symptom                                         32147
Anatomical Abnormality                                  12984
Pathologic Function                                     12874
Laboratory Procedure                                    12329
Acquired Abnormality                                     5761
Clinical Attribute                                       3408
Organism Attribute                                       3159
Experimental Model of Disease                            1957
Injury or Poisoning                                      1802
Mental Process                                           1735
Laboratory or Test Result                         

In [23]:
cat_var.diseaseSemanticType.nunique()

33

In [14]:
cat_var.geneSymbol.value_counts()

geneSymbol
TNF              2724
TP53             2494
IL6              2367
VEGFA            1899
IL1B             1801
                 ... 
DEFB108C            1
FBXO27              1
H2AP                1
DPRX                1
ZNF559-ZNF177       1
Name: count, Length: 21666, dtype: int64

In [15]:
cat_var.geneSymbol.value_counts().count()

21666

In [16]:
cat_var.diseaseName.value_counts()

diseaseName
Neoplasms                                              10161
Malignant Neoplasms                                     8621
Primary malignant neoplasm                              8221
Malignant neoplasm of breast                            6941
Breast Carcinoma                                        6776
                                                       ...  
Bilateral facial muscle weakness                           1
EPIDERMOLYSIS BULLOSA SIMPLEX WITH NAIL DYSTROPHY          1
Axial muscle atrophy                                       1
Membranous conjunctivitis                                  1
Thyroid Gland Follicular Carcinoma, Widely Invasive        1
Name: count, Length: 30170, dtype: int64

In [18]:
cat_var.diseaseName.nunique()

30170

In [19]:
cat_var.diseaseType.value_counts()

diseaseType
disease      781622
phenotype    211293
group        142027
Name: count, dtype: int64

In [20]:
cat_var.diseaseClass.value_counts()

diseaseClass
C04                                        289130
C06;C04                                     50706
C14                                         33452
C23                                         32747
C23;C10                                     28039
                                            ...  
C23;C16;C17;C10;C19;F03;F01                     1
C16;C13;C05;C10                                 1
C04;C05;C20;C15;C14                             1
C16;C05;C08                                     1
C23;C16;C18;C13;C11;C05;C10;C19;F03;F01         1
Name: count, Length: 1106, dtype: int64

In [21]:
cat_var.diseaseClass.nunique()

1106

In [26]:
from sklearn.cluster import KMeans

In [32]:
label_encoder = LabelEncoder()
df['diseaseName_encoded'] = label_encoder.fit_transform(df['diseaseName'])
disease_data = df['diseaseName_encoded'].values.reshape(-1, 1)
kmeans = KMeans(n_clusters=5, random_state=42)
df['cluster'] = kmeans.fit_predict(disease_data)
cluster_summary = df.groupby('cluster')['diseaseName'].apply(lambda x: list(x)[:10])  # Example: get 10 items from each cluster
print(cluster_summary)
cluster_names = {
    0: 'Respiratory Diseases',
    1: 'Cardiovascular Diseases',
    2: 'Neurological Disorders',
    3: 'Digestive Disorders',
    4: 'Musculoskeletal Conditions'
}

# Step 7: Map the cluster numbers to cluster names
df['cluster_name'] = df['cluster'].map(cluster_names)

# Step 8: Print the first few rows to see the cluster names
print(df[['diseaseName', 'cluster', 'cluster_name']].head())

cluster
0    [Adenocarcinoma, Amyotrophic Lateral Sclerosis...
1    [Meningioma, Mental disorders, Malignant Neopl...
2    [Glioblastoma, Hepatomegaly, Liver neoplasms, ...
3    [pathologic fistula, Schizophrenia, Truncus Ar...
4    [Cholesteatoma, Diabetes Mellitus, Down Syndro...
Name: diseaseName, dtype: object
                     diseaseName  cluster                cluster_name
0                 Adenocarcinoma        0        Respiratory Diseases
1  Amyotrophic Lateral Sclerosis        0        Respiratory Diseases
2                          Apnea        0        Respiratory Diseases
3                      Arthritis        0        Respiratory Diseases
4                  Cholesteatoma        4  Musculoskeletal Conditions


In [33]:
df.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source,diseaseName_encoded,cluster,cluster_name
0,1,A1BG,0.7,0.538,C0001418,Adenocarcinoma,group,C04,Neoplastic Process,0.01,1.0,2008.0,2008.0,1,0,LHGDN,1855,0,Respiratory Diseases
1,1,A1BG,0.7,0.538,C0002736,Amyotrophic Lateral Sclerosis,disease,C18;C10,Disease or Syndrome,0.01,1.0,2008.0,2008.0,1,0,BEFREE,2533,0,Respiratory Diseases
2,1,A1BG,0.7,0.538,C0003578,Apnea,phenotype,C23;C08,Sign or Symptom,0.01,1.0,2017.0,2017.0,1,0,BEFREE,3074,0,Respiratory Diseases
3,1,A1BG,0.7,0.538,C0003864,Arthritis,disease,C05,Disease or Syndrome,0.01,1.0,2019.0,2019.0,1,0,BEFREE,3209,0,Respiratory Diseases
4,1,A1BG,0.7,0.538,C0008373,Cholesteatoma,disease,C17,Disease or Syndrome,0.01,1.0,2020.0,2020.0,1,0,BEFREE,6371,4,Musculoskeletal Conditions


In [29]:
df.cluster.value_counts()

cluster
1    283538
4    238071
3    227768
2    220183
0    165382
Name: count, dtype: int64

In [28]:
df.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source,diseaseName_encoded,cluster
0,1,A1BG,0.7,0.538,C0001418,Adenocarcinoma,group,C04,Neoplastic Process,0.01,1.0,2008.0,2008.0,1,0,LHGDN,1855,0
1,1,A1BG,0.7,0.538,C0002736,Amyotrophic Lateral Sclerosis,disease,C18;C10,Disease or Syndrome,0.01,1.0,2008.0,2008.0,1,0,BEFREE,2533,0
2,1,A1BG,0.7,0.538,C0003578,Apnea,phenotype,C23;C08,Sign or Symptom,0.01,1.0,2017.0,2017.0,1,0,BEFREE,3074,0
3,1,A1BG,0.7,0.538,C0003864,Arthritis,disease,C05,Disease or Syndrome,0.01,1.0,2019.0,2019.0,1,0,BEFREE,3209,0
4,1,A1BG,0.7,0.538,C0008373,Cholesteatoma,disease,C17,Disease or Syndrome,0.01,1.0,2020.0,2020.0,1,0,BEFREE,6371,4


In [None]:
df = df.drop(["diseaseName","diseaseName_encoded"],axis = 1)

In [11]:
le = LabelEncoder()
cat_var1 = cat_var.apply(le.fit_transform)

In [12]:
cat_var1

Unnamed: 0,geneSymbol,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,source
0,0,52,1855,1,49,23,859
1,0,134,2533,0,613,11,0
2,0,222,3074,2,740,30,0
3,0,244,3209,0,127,11,0
4,0,565,6371,0,572,11,0
...,...,...,...,...,...,...,...
1134937,2907,364,4427,2,49,26,833
1134938,11336,1722,29879,0,49,23,0
1134939,11336,1738,16497,0,49,23,0
1134940,11336,10792,16511,0,684,23,0


In [20]:
data = pd.concat([num_var,cat_var1],axis = 1)

In [21]:
x = data.drop(['diseaseType','NofSnps','EI'],axis = 1)

In [22]:
y = data.diseaseType

In [24]:
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(x, y)

In [25]:
x_train,x_test,y_train,y_test = train_test_split(X_res,y_res,test_size = 0.3,random_state = 23)

In [26]:
cfr = RandomForestClassifier()

In [28]:
model = cfr.fit(x_train[:10000],y_train[:10000])

In [29]:
pred = model.predict(x_test)

In [30]:
pred

array([1, 2, 2, ..., 1, 0, 1])

In [31]:
accuracy_score(y_test,pred)

0.8762061808773776

In [39]:
row = x_test.iloc[34]

In [40]:
row_reshaped = row.values.reshape(1, -1)

In [41]:
model.predict(row_reshaped)



array([1])

In [42]:
y_test.iloc[34]

1

In [43]:
x_test

Unnamed: 0,geneId,DSI,DPI,score,YearInitial,YearFinal,NofPmids,geneSymbol,diseaseId,diseaseName,diseaseClass,diseaseSemanticType,source
1490293,27259,0.413869,0.894945,0.016356,2007.465327,2016.364355,1,7860,635,7055,150,23,0
360323,3952,0.349000,0.846000,0.010000,2017.000000,2017.000000,1,9166,235,5597,812,11,0
1868924,79878,0.633972,0.529311,0.100000,2013.000000,2017.000000,0,20656,1554,14824,1101,21,850
2131965,2357,0.428190,0.846000,0.010000,2018.000000,2018.000000,1,6011,12441,22961,812,11,0
1587122,494327,0.539815,0.782578,0.011883,2018.811690,2018.905845,1,11519,322,18481,1003,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1493886,4374,0.528492,0.699055,0.100000,2013.000000,2017.000000,0,12268,24063,29213,780,15,850
1688606,1058,0.728238,0.548210,0.010000,2016.602224,2016.602224,1,2843,15069,6545,261,11,0
334605,3643,0.432000,0.846000,0.020000,2004.000000,2017.000000,2,8291,3118,28886,49,23,0
1530004,1569,0.514225,0.708083,0.010000,2019.000000,2019.000000,1,4008,14981,20065,79,23,0


In [48]:
y_test.head(20)

1490293    1
360323     2
1868924    2
2131965    2
1587122    1
1093714    1
2192524    2
320835     1
801850     2
1974765    2
254394     2
48645      2
2150310    2
1779157    2
1673528    1
1422287    1
114390     0
2292717    2
17087      0
1810959    2
Name: diseaseType, dtype: int32

In [45]:
row1 = x_test.iloc[360323]
row_reshaped = row1.values.reshape(1, -1)
model.predict(row_reshaped)



array([2])

In [50]:
11439

11439

In [49]:
row1 = x_test.iloc[114390]
row_reshaped = row1.values.reshape(1, -1)
model.predict(row_reshaped)



array([0])

In [51]:
x_test.iloc[34] # 1

geneId                  876.000000
DSI                       0.457413
DPI                       0.838280
score                     0.259831
YearInitial            2005.162872
YearFinal              2005.162872
NofPmids                  1.000000
geneSymbol             2422.000000
diseaseId               224.000000
diseaseName            3033.000000
diseaseClass            167.000000
diseaseSemanticType      11.000000
source                  807.000000
Name: 1506595, dtype: float64

In [52]:
x_test.iloc[360323] # 2

geneId                  7318.000000
DSI                        0.628297
DPI                        0.682514
score                      0.117536
YearInitial             2010.369589
YearFinal               2013.492785
NofPmids                   1.000000
geneSymbol             20273.000000
diseaseId               2754.000000
diseaseName            25791.000000
diseaseClass             755.000000
diseaseSemanticType       30.000000
source                   636.000000
Name: 2261702, dtype: float64

In [53]:
x_test.iloc[114390] # 0

geneId                 54206.000
DSI                        0.612
DPI                        0.538
score                      0.020
YearInitial             2010.000
YearFinal               2014.000
NofPmids                   2.000
geneSymbol              5305.000
diseaseId              15187.000
diseaseName            12338.000
diseaseClass              49.000
diseaseSemanticType       23.000
source                     0.000
Name: 888702, dtype: float64