#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
The manager responsible for processing the datasets
"""
import random

__author__ = 'Filipe Wanderley Lima (fwl), Juliana Medeiros de Lucena (jml) and Tiago Ferreira Lima (tfl2)'

def get_data(path):
    """
    Reads a comma-separated dataset where the class label is the last field
    @type path: str
    @param path: The path to the dataset file
    @rtype: dict
    @returns: The instances grouped by class label
    """
    instances = dict()
    with open(path) as f:
        for line in f:
            inst = line.split(',')
            klass = inst[-1].rstrip('\n')
            # Attributes are numeric; the class label stays a string
            inst = list(map(float, inst[:-1]))
            inst.append(klass)
            if klass in instances:
                instances[klass].append(inst)
            else:
                instances[klass] = [inst]
    return instances
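
# Illustrative sketch of get_data's contract ('example.csv' is a hypothetical
# file name, not part of the project). Given a file containing:
#
#     92.0,80.0,1
#     91.0,69.0,2
#
# get_data('example.csv') would return:
#
#     {'1': [[92.0, 80.0, '1']], '2': [[91.0, 69.0, '2']]}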

def push_data(data, path):
    """
    Writes the instances of every class to a file in random order
    @type data: dict
    @param data: The instances grouped by class label
    @type path: str
    @param path: The path of the output file
    """
    data_compressed = list()
    for klass in data:
        data_compressed.extend(data[klass])
    # Shuffle so the classes are interleaved instead of written in blocks
    random.shuffle(data_compressed)
    with open(path, 'w') as f:
        for inst in data_compressed:
            f.write(','.join(str(attr) for attr in inst) + '\n')
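
# Illustrative sketch, assuming the dict shape produced by get_data above
# ('example_out.csv' is hypothetical): each instance becomes one CSV line,
# so push_data is the inverse of get_data up to the row order.
#
#     data = {'1': [[92.0, 80.0, '1']], '2': [[91.0, 69.0, '2']]}
#     push_data(data, 'example_out.csv')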

def process_data(processed_data, training_percentage):
    """
    Divides the dataset into a training set and a test set based on the percentage
    @type processed_data: dict
    @param processed_data: The data extracted from the dataset
    @type training_percentage: int
    @param training_percentage: The percentage of each class used for training
    @rtype: tuple
    @returns: The training set and the test set
    """
    print(training_percentage, '% training and', 100 - training_percentage, '% test')
    test = dict()
    training = dict()
    for c, l in processed_data.items():
        # Integer division keeps the per-class training size a whole number
        training_quantity = (training_percentage * len(l)) // 100
        training[c] = [l.pop(random.randint(0, len(l) - 1))
                       for _ in range(training_quantity)]
        # Whatever was not drawn for training becomes the test set
        test[c] = l
    return (training, test)
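
# Worked example (illustrative): with training_percentage = 80 and a class
# holding 10 instances, training_quantity = (80 * 10) // 100 = 8, so 8
# randomly drawn instances go to training and the remaining 2 to test.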

def get_infos(path):
    """
    Prints the per-class instance counts of each fold's database, training and test files
    @type path: str
    @param path: The directory that holds the fold files
    """
    def print_counts(data):
        # A class absent from the file simply has zero instances
        for klass in ('1', '2'):
            print('Class %s:' % klass, len(data.get(klass, [])))

    for i in range(5):
        dataset_path = path + "/sub_training_%i" % i
        training_path = path + "/training_%i" % i
        test_path = path + "/test_%i" % i
        print('\nfold %i' % i)
        print('database:')
        print_counts(get_data(dataset_path))
        print('\ntraining:')
        print_counts(get_data(training_path))
        print('\ntest:')
        print_counts(get_data(test_path))
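
# Illustrative output for one fold (the counts below are made-up numbers):
#
#     fold 0
#     database:
#     Class 1: 40
#     Class 2: 23
#
#     training:
#     Class 1: 32
#     Class 2: 18
#
#     test:
#     Class 1: 8
#     Class 2: 5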
"""
Generates the training and test sets through k-fold cross validation
@type path: str
@param path: The path where the folds are
@type n_folds: int
@param n_folds: The number of folds
"""
def generate_sets(path, n_folds):
folds = [open(path + 'sub_training_%i' % i).readlines() for i in range(n_folds)]
for i, fold in enumerate(folds):
# The current file is the test set
test = open(path + 'test_%i' % i, 'w')
for l in fold:
test.write(l)
# The other files are the training set
train = open(path + 'training_%i' % i, 'w')
# Flattening files lines
lines = sum([f for f in folds if folds.index(f) != i], [])
for l in lines:
train.write(l)
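
# Illustrative usage, assuming the five sub_training_0 .. sub_training_4
# files already exist in the directory: for each fold i, generate_sets
# writes test_i (a copy of sub_training_i) and training_i (the other
# n_folds - 1 folds concatenated).
#
#     generate_sets('datasets/liver/folds/original/', 5)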

if __name__ == '__main__':
    #print(get_data('datasets/sub_training_0'))
    #data = {'1': [[92.0, 80.0, 10.0, 26.0, 20.0, 6.0, '1'],
    #              [91.0, 68.0, 27.0, 26.0, 14.0, 16.0, '1']],
    #        '2': [[91.0, 69.0, 25.0, 25.0, 66.0, 8.0, '2'],
    #              [92.0, 108.0, 53.0, 33.0, 94.0, 12.0, '2'],
    #              [89.0, 63.0, 24.0, 20.0, 38.0, 0.5, '2']]}
    #push_data(data, 'datasets/data')
    #folds_data = [get_data('datasets/liver/folds/oss/sub_training_%i' % i)
    #              for i in range(0, 5)]
    #for data in folds_data:
    #    train, test = process_data(data, 80)
    #    push_data(train, 'datasets/liver/folds/oss/training_%i' %
    #              folds_data.index(data))
    #    push_data(test, 'datasets/liver/folds/oss/test_%i' %
    #              folds_data.index(data))
    #get_infos('datasets/liver/selected/ib2')
    for path in ['liver/selected/drop3/', 'liver/selected/hmn_ei/',
                 'liver/selected/ib2/', 'liver/selected/icf/',
                 'liver/selected/oss/', 'liver/folds/original/']:
        path = 'datasets/' + path
        generate_sets(path, 5)