In [10]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

In [11]:
#Load Data into dataframe
df = pd.read_csv('./checksWithTargets.txt', sep="\t")
df2 = df[:250]
#Shuffle dataframe
df = shuffle(df)
df2

Unnamed: 0,CUI1,CUI2,Disease1,Disease2,PubMed1,PubMed2,Common_PubMed_IDs,Target,Name Relations,Paper Relations,Concept Relation,ID Relation,Num Relations,Num Atoms
0,C3889305,C3891922,HEMOGLOBIN BRIGHAM PHENOTYPE,HEMOGLOBIN A(2) CANADA PHENOTYPE,14,8,2,3,2,25,0,0,0,1
1,C0015503,C0015526,Factor VII Deficiency,Factor XII Deficiency,1829,871,110,2,1,12,0,1,0,25
2,C0699743,C1846672,Congenital muscular dystrophy,"MUSCULAR DYSTROPHY, LIMB-GIRDLE, TYPE 2I",3138,17,4,2,2,23,2,0,2,11
3,C0009952,C4016378,Febrile Convulsions,"EPILEPTIC ENCEPHALOPATHY, EARLY INFANTILE, 11,...",5067,10,2,5,0,20,0,0,0,1
4,C1876175,C3469522,Ataxia-Telangiectasia Variant,"BREAST CANCER, SUSCEPTIBILITY TO",267,12696,32,3,0,11,0,1,0,6
5,C0265253,C0265279,Stickler syndrome,Kniest dysplasia,344,78,11,3,0,14,2,2,1,24
6,C0085390,C1510426,Li-Fraumeni Syndrome,"choroid plexus carcinoma, childhood",1216,39,8,8,0,20,2,0,1,4
7,C0018553,C1868081,"Hamartoma Syndrome, Multiple",Juvenile Polyposis Coli,1768,280,35,8,0,12,0,0,0,2
8,C0403553,C1857662,Renal dysplasia and retinal aplasia,COACH syndrome,108,27,3,7,0,11,0,0,0,19
9,C0006142,C4015968,Malignant neoplasm of breast,RECLASSIFIED - VARIANT OF UNKNOWN SIGNIFICANCE,309961,6,1,8,0,16,0,0,0,25


In [12]:
#Split into training set (75%) and testing set (25%)
cutoff_percentage = .9
num_training_samples = int(np.floor(df.shape[0]*cutoff_percentage))
num_testing_samples = df.shape[0]-num_training_samples

training_df = df[:num_training_samples]
#testing_df = df[-num_testing_samples:]
testing_df = df2

testing_df

Unnamed: 0,CUI1,CUI2,Disease1,Disease2,PubMed1,PubMed2,Common_PubMed_IDs,Target,Name Relations,Paper Relations,Concept Relation,ID Relation,Num Relations,Num Atoms
0,C3889305,C3891922,HEMOGLOBIN BRIGHAM PHENOTYPE,HEMOGLOBIN A(2) CANADA PHENOTYPE,14,8,2,3,2,25,0,0,0,1
1,C0015503,C0015526,Factor VII Deficiency,Factor XII Deficiency,1829,871,110,2,1,12,0,1,0,25
2,C0699743,C1846672,Congenital muscular dystrophy,"MUSCULAR DYSTROPHY, LIMB-GIRDLE, TYPE 2I",3138,17,4,2,2,23,2,0,2,11
3,C0009952,C4016378,Febrile Convulsions,"EPILEPTIC ENCEPHALOPATHY, EARLY INFANTILE, 11,...",5067,10,2,5,0,20,0,0,0,1
4,C1876175,C3469522,Ataxia-Telangiectasia Variant,"BREAST CANCER, SUSCEPTIBILITY TO",267,12696,32,3,0,11,0,1,0,6
5,C0265253,C0265279,Stickler syndrome,Kniest dysplasia,344,78,11,3,0,14,2,2,1,24
6,C0085390,C1510426,Li-Fraumeni Syndrome,"choroid plexus carcinoma, childhood",1216,39,8,8,0,20,2,0,1,4
7,C0018553,C1868081,"Hamartoma Syndrome, Multiple",Juvenile Polyposis Coli,1768,280,35,8,0,12,0,0,0,2
8,C0403553,C1857662,Renal dysplasia and retinal aplasia,COACH syndrome,108,27,3,7,0,11,0,0,0,19
9,C0006142,C4015968,Malignant neoplasm of breast,RECLASSIFIED - VARIANT OF UNKNOWN SIGNIFICANCE,309961,6,1,8,0,16,0,0,0,25


In [13]:
#Construct Feature Matrix and Target List from Training Set
features_list = ['Common_PubMed_IDs', 'Name Relations', 
                 'Paper Relations', 'Concept Relation', 'ID Relation', 
                 'Num Relations', 'Num Atoms']
targets_list = ['Target']

features_df = training_df[features_list]
features_matrix = features_df.as_matrix()

target_df = training_df[targets_list]
target_list = target_df['Target'].tolist()

  


In [14]:
#Finding classify that best fits the data to the target value (ITS MAGIC!)
clf = RandomForestClassifier(n_estimators=200,random_state=1)
clf.fit(features_matrix, target_list)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [15]:
#Printing Feature Importance
feature_importance = clf.feature_importances_.tolist()
for i,feature in enumerate(features_list):
    print("%s: %f" % (feature, feature_importance[i]) )

#Printing Accuracy on TRAINING SET
accuracy = clf.score(features_matrix, target_list)
print( "\nAccuracy on TRAINING SET: %f" % (accuracy))

Common_PubMed_IDs: 0.280344
Name Relations: 0.062952
Paper Relations: 0.233434
Concept Relation: 0.036244
ID Relation: 0.060507
Num Relations: 0.123080
Num Atoms: 0.203438

Accuracy on TRAINING SET: 0.940000


In [16]:
#Checking which rows gave incorrect predictions
training_prediction = clf.predict(features_matrix).tolist()
rows_different = []
prediction = []
for i,val_1 in enumerate(target_list):
    if np.absolute(val_1 - training_prediction[i]) > 1:
        rows_different.append(i)
        prediction.append(training_prediction[i])
        

training_different_df = training_df.iloc[rows_different]
training_different_df['Prediction'] = prediction
training_different_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,CUI1,CUI2,Disease1,Disease2,PubMed1,PubMed2,Common_PubMed_IDs,Target,Name Relations,Paper Relations,Concept Relation,ID Relation,Num Relations,Num Atoms,Prediction
455,C1833921,C3280042,Familial medullary thyroid carcinoma,RETINITIS PIGMENTOSA 62,398,137,0,6,0,0,2,0,1,3,10
485,C3889858,C4284511,HEMOGLOBIN RADCLIFFE PHENOTYPE,HEMOGLOBIN DIE PHENOTYPE,0,0,0,9,2,0,0,0,0,1,3
387,C0265234,C3551443,Branchio-Oto-Renal Syndrome,ANTERIOR SEGMENT ANOMALIES WITH OR WITHOUT CAT...,317,70133,5,8,0,1,0,0,0,1,10
456,C0392514,C4016102,Hereditary hemochromatosis,"HYPOGONADOTROPIC HYPOGONADISM 2 WITH ANOSMIA, ...",9950,4,0,7,0,0,0,0,0,1,10
411,C1721007,C1840427,"Pachyonychia Congenita, Type 2","PALMOPLANTAR KERATODERMA, EPIDERMOLYTIC, WITH ...",450,9,1,4,0,11,0,0,0,4,6
475,C1838647,C4017040,RETINITIS PIGMENTOSA 12,FH LONDON 3 PHENOTYPE,10,23,0,4,0,0,0,0,0,1,10
65,C1283400,C4017556,Butyrylcholinesterase deficiency,CHE*539T PHENOTYPE,54,491394,7,7,0,12,0,0,0,0,5
434,C0004245,C1828221,Atrioventricular Block,Non dystrophic myotonia,12755,74,0,4,0,0,0,0,0,6,8
495,C4017471,C4017524,BETA-E-THALASSEMIA,HEMOGLOBIN DRENTHE PHENOTYPE,1,0,0,8,0,0,0,1,0,1,3
470,C1865871,C3711381,"HEMANGIOMA, CAPILLARY INFANTILE",Hereditary Diffuse Leukoencephalopathy with Sp...,98,83,0,10,0,0,0,0,0,5,8


In [18]:
#Predicting Data on Testing Set

#Creating a feature matrix of the testing set
testing_features_df = testing_df[features_list]

testing_features_matrix = testing_features_df.as_matrix()

#Creating a target list of testing set
targets_list = ['Target']
testing_targets_df = testing_df[targets_list]
testing_target_list = testing_targets_df['Target'].tolist()

#Predict values
testing_prediction = clf.predict(testing_features_matrix).tolist()

#Checking which rows gave incorrect predictions
rows_different = []
prediction = []
for i,val_1 in enumerate(testing_target_list):
    if np.absolute(val_1 - testing_prediction[i]) > 3:
        rows_different.append(i)
        prediction.append(testing_prediction[i])
        

testing_different_df = testing_df.iloc[rows_different]
testing_different_df['Prediction'] = prediction
testing_different_df
df2['Alogrithm Entry'] = testing_prediction
df2
print len(testing_different_df)
df2

6


  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,CUI1,CUI2,Disease1,Disease2,PubMed1,PubMed2,Common_PubMed_IDs,Target,Name Relations,Paper Relations,Concept Relation,ID Relation,Num Relations,Num Atoms,Alogrithm Entry
0,C3889305,C3891922,HEMOGLOBIN BRIGHAM PHENOTYPE,HEMOGLOBIN A(2) CANADA PHENOTYPE,14,8,2,3,2,25,0,0,0,1,3
1,C0015503,C0015526,Factor VII Deficiency,Factor XII Deficiency,1829,871,110,2,1,12,0,1,0,25,2
2,C0699743,C1846672,Congenital muscular dystrophy,"MUSCULAR DYSTROPHY, LIMB-GIRDLE, TYPE 2I",3138,17,4,2,2,23,2,0,2,11,2
3,C0009952,C4016378,Febrile Convulsions,"EPILEPTIC ENCEPHALOPATHY, EARLY INFANTILE, 11,...",5067,10,2,5,0,20,0,0,0,1,5
4,C1876175,C3469522,Ataxia-Telangiectasia Variant,"BREAST CANCER, SUSCEPTIBILITY TO",267,12696,32,3,0,11,0,1,0,6,2
5,C0265253,C0265279,Stickler syndrome,Kniest dysplasia,344,78,11,3,0,14,2,2,1,24,3
6,C0085390,C1510426,Li-Fraumeni Syndrome,"choroid plexus carcinoma, childhood",1216,39,8,8,0,20,2,0,1,4,8
7,C0018553,C1868081,"Hamartoma Syndrome, Multiple",Juvenile Polyposis Coli,1768,280,35,8,0,12,0,0,0,2,6
8,C0403553,C1857662,Renal dysplasia and retinal aplasia,COACH syndrome,108,27,3,7,0,11,0,0,0,19,7
9,C0006142,C4015968,Malignant neoplasm of breast,RECLASSIFIED - VARIANT OF UNKNOWN SIGNIFICANCE,309961,6,1,8,0,16,0,0,0,25,8


In [19]:
df2.to_csv('./finalResults.txt', sep = "\t")

In [45]:
print("TESTING Accuracy: %f" % ((len(testing_target_list) - len(rows_different))/len(testing_target_list)))

TESTING Accuracy: 0.000000
