In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
%matplotlib inline

# Train Data

In [2]:
# Loads Data Dorothea
def LoadaDorothea(filename):
#filename = 'dorothea_train.data'
    filein = open(filename, 'r')
    filelines = filein.readlines()
    filein.close()

    n_elements = len(filelines)
    n_features = 100000

    data = np.zeros((n_elements, n_features))

    for i in range(n_elements):
        nums = filelines[i].split()
        for n in nums:
            j = int(n) - 1
            data[i,j] = 1
    return data
train = LoadaDorothea("./dorothea_train.data")

In [3]:
# Loads the labels file for train data
train_labels = np.loadtxt("./dorothea_train.labels")

In [4]:
# Defines divides train into Training(only) set and Test set
test = train[:100]
test_labels = train_labels[:100]
train = train[100:]
train_labels = train_labels[100:]

In [5]:
# Verifying data format
print(type(train),type(train_labels))
print(type(test),type(test_labels))
print(test.shape,test_labels.shape)
print(train.shape,train_labels.shape)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(100, 100000) (100,)
(700, 100000) (700,)


# The Forest

In [6]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(train, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

# Validation Data

In [7]:
# Loads Test Data Dorothea with python engine to fill blank spaces as NaN
valid = LoadaDorothea("./dorothea_valid.data")

In [8]:
# Loads the labels file for Test data
valid_labels = np.loadtxt("./dorothea_valid.labels")


In [9]:
# Verifying data format
print(type(valid),type(valid_labels))
print(valid.shape,valid_labels.shape)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(350, 100000) (350,)


# The Prediction first instance__ Comparing with valid data and test data

In [10]:
predictions_valid = np.zeros(len(valid))
predictions_test = np.zeros(len(test))

In [11]:
for i in range(len(predictions_valid)):
    predictions_valid[i] = rf.predict(valid[i].reshape(1,-1))
for i in range(len(predictions_test)):
    predictions_test[i] = rf.predict(test[i].reshape(1,-1))

In [12]:
#plt.plot(predictions)

In [13]:
f_valid = len(predictions_valid[predictions_valid == valid_labels])/len(valid_labels)
f_test = len(predictions_test[predictions_test == test_labels])/len(test_labels)

In [14]:
print(f_valid,(f_valid-0.5)*20)
print(f_test,(f_test-0.5)*20)

0.9342857142857143 8.685714285714285
0.95 9.0


# Important Variables first 50

In [15]:
ii = np.argsort(rf.feature_importances_)[::-1][:50]
print("Number of positive values: ", len(rf.feature_importances_[rf.feature_importances_> 0]))

for name,value in zip( ii,rf.feature_importances_[ii]):
    if (value >= 0.000):
        print(name, value)

Number of positive values:  3612
25761 0.0122020434497
59959 0.00785280756692
411 0.0060661701154
28051 0.00596552158593
55095 0.00505632816711
2525 0.00488806922843
80018 0.00483872718413
21754 0.00475598629698
44379 0.00398666190532
89869 0.00389286470026
5678 0.00384795342749
30727 0.0037882961802
98300 0.00360418093815
72451 0.00337383029374
15129 0.00326106927099
31353 0.00324076041059
80130 0.00310654873849
10476 0.00305552371995
50183 0.00302729055044
31015 0.00301292068561
33875 0.00300959865873
5056 0.00298291795767
8517 0.00291605805854
30973 0.00275521668302
36925 0.00270779003776
3163 0.00257028887485
31462 0.00251651069757
45473 0.00249534679996
19477 0.00248656920697
21415 0.00233906618617
90422 0.00228943143312
16824 0.00224762511142
15893 0.00217252569704
50521 0.00212865654643
19338 0.00199182176555
41301 0.00194273692229
79704 0.00192353429476
86845 0.00190512493936
72414 0.00187900075167
39016 0.00186521374345
64845 0.0018492352182
84746 0.00183948066498
76833 0.0017

# As we know most of the parameters are baits, we are going to select only the first 50.000 in order of importance

In [16]:
ii = np.argsort(rf.feature_importances_)[::-1][:4000]
# Modifies test, train, valid data
train = train[:,ii]
test = test[:,ii]
valid = valid[:,ii]

# We re-Train the random forest


In [17]:
rf = RandomForestClassifier(n_estimators=40)
rf.fit(train, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=40, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [18]:
for i in range(len(predictions_valid)):
    predictions_valid[i] = rf.predict(valid[i].reshape(1,-1))
for i in range(len(predictions_test)):
    predictions_test[i] = rf.predict(test[i].reshape(1,-1))
##############################
f_valid = len(predictions_valid[predictions_valid == valid_labels])/len(valid_labels)
f_test = len(predictions_test[predictions_test == test_labels])/len(test_labels)
#############################
print(f_valid,(f_valid-0.5)*20)
print(f_test,(f_test-0.5)*20)

0.9371428571428572 8.742857142857144
0.96 9.2


In [24]:
ii = np.argsort(rf.feature_importances_)[::-1][:50]
for name,value in zip( ii,rf.feature_importances_[ii]):
    if (value >= 0.00):
        print(name, value)

0 0.023992629744
3 0.0187190807214
29 0.0172532696508
2 0.015990848978
7 0.015761502832
1 0.015642092971
157 0.0126670275215
46 0.0119141724882
10 0.0115935372195
13 0.0102959784483
16 0.0101957737115
9 0.0100721736287
40 0.00997694086052
205 0.00987019752411
27 0.0082727232143
71 0.00819102534546
11 0.00777624771149
33 0.00772422776766
22 0.00730973046156
105 0.00687612068482
562 0.00663067702427
305 0.00647010435575
15 0.00644244419209
138 0.00610639273562
14 0.00595548676337
38 0.00583729450245
135 0.00583506623019
42 0.00579730278486
325 0.0055801186549
24 0.00543400513713
65 0.00519375655452
180 0.00517104155178
12 0.00500184400732
97 0.00481126779331
322 0.00472761887803
78 0.00468875702166
724 0.00464957548623
148 0.00442122968618
567 0.00421010729988
1402 0.00418216639536
52 0.00378007141554
69 0.00368705438068
55 0.00366699208572
66 0.00351443320356
177 0.00345586782016
75 0.00344073320647
86 0.00324647856207
79 0.00320549663814
226 0.00316980166653
150 0.00311247319414


In [19]:
len(rf.feature_importances_[rf.feature_importances_>= 0])

4000