-
Notifications
You must be signed in to change notification settings - Fork 6
/
data_pre.py
104 lines (90 loc) · 3.63 KB
/
data_pre.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# coding=utf-8
from keras.utils import to_categorical
from keras.datasets import mnist,fashion_mnist,cifar10,imdb
import random
from sklearn.metrics import confusion_matrix
import numpy as np
from keras.preprocessing.sequence import pad_sequences
def load_famnist():
    """Load Fashion-MNIST from ``keras.datasets`` as 28x28 single-channel images.

    Images are reshaped to ``(-1, 28, 28, 1)`` and divided by 255; labels
    are reshaped to a 1-D vector holding one entry per sample.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)``.
    """
    (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

    # Labels: one entry per sample, no extra axes.
    train_labels = train_labels.reshape(train_labels.shape[0], )
    test_labels = test_labels.reshape(test_labels.shape[0], )

    # Images: add an explicit channel axis, then rescale by 1/255.
    train_images = train_images.reshape(-1, 28, 28, 1) / 255.
    test_images = test_images.reshape(-1, 28, 28, 1) / 255.

    return train_images, train_labels, test_images, test_labels
def load_mnist():
    """Load MNIST from ``keras.datasets`` as 28x28 single-channel images.

    Images are reshaped to ``(-1, 28, 28, 1)`` and divided by 255; labels
    are reshaped to a 1-D vector holding one entry per sample.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)``.
    """
    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()

    # Labels: one entry per sample, no extra axes.
    train_labels = train_labels.reshape(train_labels.shape[0], )
    test_labels = test_labels.reshape(test_labels.shape[0], )

    # Images: add an explicit channel axis, then rescale by 1/255.
    train_images = train_images.reshape(-1, 28, 28, 1) / 255.
    test_images = test_images.reshape(-1, 28, 28, 1) / 255.

    return train_images, train_labels, test_images, test_labels
def load_cifar10():
    """Load CIFAR-10 from ``keras.datasets`` as 32x32 three-channel images.

    Images are reshaped to ``(-1, 32, 32, 3)`` and divided by 255; labels
    are reshaped to a 1-D vector holding one entry per sample.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)``.
    """
    (train_images, train_labels), (test_images, test_labels) = cifar10.load_data()

    # Labels: one entry per sample, no extra axes.
    train_labels = train_labels.reshape(train_labels.shape[0], )
    test_labels = test_labels.reshape(test_labels.shape[0], )

    # Images: enforce the (N, 32, 32, 3) layout, then rescale by 1/255.
    train_images = train_images.reshape(-1, 32, 32, 3) / 255.
    test_images = test_images.reshape(-1, 32, 32, 3) / 255.

    return train_images, train_labels, test_images, test_labels
def load_imdb(num_words=5000, maxlen=500):
    """Load the IMDB review dataset as fixed-length integer sequences.

    The two settings used to be hard-coded in a local ``config`` list; they
    are now keyword parameters whose defaults reproduce the old behaviour,
    so existing zero-argument calls are unaffected.

    Args:
        num_words: Forwarded to ``imdb.load_data(num_words=...)``.
            Defaults to 5000.
        maxlen: Forwarded to ``pad_sequences(..., maxlen=...)`` for both
            the training and the test sequences. Defaults to 500.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)``; the ``x`` arrays
        are the outputs of ``pad_sequences``, the ``y`` arrays are returned
        exactly as loaded.
    """
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)
    # Bring every review to the same length so they stack into one array.
    x_train = pad_sequences(x_train, maxlen=maxlen)
    x_test = pad_sequences(x_test, maxlen=maxlen)
    return x_train, y_train, x_test, y_test
def load_data(data_name):
    """Return the dataset selected by ``data_name``.

    Args:
        data_name: ``'famnist'``, ``'mnist'`` or ``'cifar10'`` select the
            matching image loader. Any other value falls through to the
            IMDB loader.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)`` produced by the
        chosen loader.
    """
    if data_name == 'famnist':
        return load_famnist()
    if data_name == 'mnist':
        return load_mnist()
    if data_name == 'cifar10':
        return load_cifar10()
    # Every unrecognised name is treated as a request for IMDB.
    return load_imdb()
def get_imb_data(x_train, y_train, x_test, y_test, imb_rate, min_class, maj_class):
    """Build a binary, class-imbalanced dataset from a multi-class one.

    Samples whose label is in ``min_class`` are relabelled ``0`` and samples
    whose label is in ``maj_class`` are relabelled ``1``; every other sample
    is dropped. A label listed in both collections is emitted once for each.

    Args:
        x_train, y_train: Training samples and their original labels.
        x_test, y_test: Test samples and their original labels.
        imb_rate: The minority part of the training set is cut to the first
            ``int(len(majority) * imb_rate)`` minority samples.
        min_class: Collection of original labels forming the minority class.
        maj_class: Collection of original labels forming the majority class.

    Returns:
        ``(new_x_train, new_y_train, new_x_test, new_y_test)`` as NumPy
        arrays. The training pair is shuffled jointly via ``np.random``;
        the test pair keeps the original sample order and is not subsampled.
    """
    # Split the training samples by group, keeping their original order.
    minority_samples = [
        x_train[pos] for pos, label in enumerate(y_train) if label in min_class
    ]
    majority_samples = [
        x_train[pos] for pos, label in enumerate(y_train) if label in maj_class
    ]

    # Keep only as many minority samples as the imbalance ratio allows.
    minority_quota = int(len(majority_samples) * imb_rate)
    kept_minority = minority_samples[:minority_quota]

    train_samples = majority_samples + kept_minority
    train_labels = [1] * len(majority_samples) + [0] * len(kept_minority)

    # The test split is only filtered and relabelled; the minority check
    # comes first so a label present in both groups yields 0 before 1.
    test_samples = []
    test_labels = []
    for pos, label in enumerate(y_test):
        for binary_label, members in ((0, min_class), (1, maj_class)):
            if label in members:
                test_samples.append(x_test[pos])
                test_labels.append(binary_label)

    new_x_train = np.array(train_samples)
    new_y_train = np.array(train_labels)
    new_x_test = np.array(test_samples)
    new_y_test = np.array(test_labels)

    # Shuffle samples and labels with the same permutation to keep pairs aligned.
    order = list(range(len(new_y_train)))
    np.random.shuffle(order)
    return new_x_train[order], new_y_train[order], new_x_test, new_y_test