In [1]:
# 设置参数
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import psutil
%reload_ext autoreload
%autoreload 2

# --- Paths ---------------------------------------------------------------
imgDir = './images/resized/'
savedDataDir = './SavedTraining'
saveDataFilePath = 'savedTrainingData.bin'
labels_path = 'labels_final.csv'

# --- Training options ----------------------------------------------------
loadLastTrainData = True   # try to resume from saveDataFilePath before starting fresh
sliceRandomSeed = -1       # values <= 0 leave NumPy's global RNG unseeded
trainSetPercent = 0.5      # fraction of the labelled rows used as the training set
learning_rate = 0.01
num_loops = 10
num_iterations = 100

# np.random.seed is a function and has to be *called*. Assigning to it (as
# this cell used to do) replaces the function with an int, seeds nothing, and
# breaks every later np.random.seed(...) call in the kernel.
if sliceRandomSeed > 0:
    np.random.seed(sliceRandomSeed)

In [2]:
import utils

# Read the label table up front: the fresh-start branch below needs it to
# locate an image whose size determines the length of the weight vector.
labels = pd.read_csv(labels_path)

# Resume from the previous run when requested. The `is None` check below is
# what decides whether a fresh start is needed.
# NOTE(review): presumably utils.loadFromeFile returns None when nothing can
# be loaded -- confirm against utils.
loadedData = utils.loadFromeFile(saveDataFilePath) if loadLastTrainData else None

if loadedData is None:
    # Fresh start. w needs one row per input feature (pixel value), so measure
    # the first labelled image instead of relying on `dataSet`, which is only
    # built by a later cell (NameError on a fresh kernel) and whose first axis
    # is the sample count rather than the feature count.
    featureCount = plt.imread(imgDir + str(labels.iloc[0]['user_id']) + '.jpg').size
    loadedData = {
        'w': np.zeros((featureCount, 1), dtype="float64"),
        'b': 0.,
        'trainAccuracy': 0.,  # keeps the default dict in line with the keys the save cell writes
        'testAccuracy': 0.,
        'costs': [0.]
    }
else:
    print('last train accuracy: %.2f %%' % loadedData['trainAccuracy'])
    print('last test accuracy: %.2f %%' % loadedData['testAccuracy'])


# Unpack the starting point for this session.
w = loadedData['w']
b = loadedData['b']
lastCost = loadedData['costs'][-1]
lastTestAccuracy = loadedData['testAccuracy']

print('data set count:', labels.shape[0])

# Report current system memory usage (psutil reports bytes; convert to GiB).
memUsage = psutil.virtual_memory()
print('memory used %.3f GB of %.3f GB | %.0f %%' % (memUsage.used / 1024 / 1024 / 1024, memUsage.total / 1024 / 1024 / 1024, memUsage.percent))

last train accuracy: 84.56 %
last test accuracy: 85.19 %
data set count: 5996
memory used 17.759 GB of 63.732 GB | 28 %


In [3]:
# NOTE(review): a later cell rebuilds trainSetX / trainSetY / testSetX /
# testSetY from `dataSet`; this cell looks like an earlier version of the same
# step -- confirm whether it is still needed.

# Split sizes: at least one training row, everything else is test data.
trainSetCount = max(1, math.floor(labels.shape[0] * trainSetPercent))
testSetCount = labels.shape[0] - trainSetCount

print('train set count:', trainSetCount)
print('test set count:', testSetCount)

imgDir = './images/resized/'

# Divisor applied to the raw pixel values.
# NOTE(review): 2550 rather than 255 -- kept unchanged because previously
# saved weights were trained against this scaling; confirm it is intentional.
pixelScale = 2550


def _loadImageColumns(idRows, expectedSize=0):
    """Load one image per row and return them as the columns of a matrix.

    Parameters:
        idRows: iterable of rows whose first element is the image id; the
            file read is imgDir + str(id) + '.jpg'.
        expectedSize: number of values every image must contain. 0 means
            "adopt the size of the first image read".

    Returns:
        (matrix, size): matrix has shape (size, number_of_rows) and holds the
        flattened pixel values divided by pixelScale; size is the per-image
        value count that was enforced.

    Raises:
        ValueError: if an image's size differs from the expected size.
    """
    flatImages = []
    for row in idRows:
        img = plt.imread(imgDir + str(row[0]) + '.jpg')
        if expectedSize == 0:
            expectedSize = img.size
        elif expectedSize != img.size:
            raise ValueError("图片尺寸不一致")
        flatImages.append(np.asarray(img).reshape(-1))
    if not flatImages:
        # No rows at all: return a well-formed, zero-column matrix.
        return np.empty((expectedSize, 0)), expectedSize
    # Stack to (rows, size) and transpose; unlike squeeze(), this stays 2-D
    # even when only a single image was loaded.
    return np.stack(flatImages).T / pixelScale, expectedSize


# np.random.seed must be called, not assigned to, and it rejects negative
# values, so only seed when a positive seed was configured.
if sliceRandomSeed > 0:
    np.random.seed(sliceRandomSeed)
shuffledDataSet = np.array(labels)
# Shuffling is currently disabled, so the split follows the file order:
# np.random.shuffle(shuffledDataSet)
trainSetRange = shuffledDataSet[:trainSetCount, :]
testSetRange = shuffledDataSet[trainSetCount:trainSetCount + testSetCount, :]

# Build the training set. Its first image fixes the size that every other
# image (training and test) has to match.
trainSetX, imgSize = _loadImageColumns(trainSetRange)
trainSetY = trainSetRange[:, 1:].T

print('image size:', imgSize)

# Build the test set, enforcing the image size found above.
testSetX, _ = _loadImageColumns(testSetRange, imgSize)
testSetY = testSetRange[:, 1:].T

# Report current system memory usage (psutil reports bytes; convert to GiB).
memUsage = psutil.virtual_memory()
print('memory used %.3f GB of %.3f GB | %.0f %%' % (memUsage.used / 1024 / 1024 / 1024, memUsage.total / 1024 / 1024 / 1024, memUsage.percent))

train set count: 2998
test set count: 2998
image size: 331776
memory used 32.252 GB of 63.732 GB | 51 %


In [3]:
# Prepare data: load every labelled image into a single matrix with one row
# per sample -- the flattened, scaled pixel values followed by the label.

imgSize = 0

# Split sizes: at least one training row, everything else is test data.
trainSetCount = max(1, math.floor(labels.shape[0] * trainSetPercent))
testSetCount = labels.shape[0] - trainSetCount

print('train set count:', trainSetCount)
print('test set count:', testSetCount)


# The matrix is allocated once (as soon as the first image reveals the size)
# and filled row by row. Collecting per-image arrays in a list and converting
# the list afterwards keeps two full copies of the data alive at the same time.
dataSet = None
for rowIdx, (_, record) in enumerate(labels.iterrows()):
    img = plt.imread(imgDir + str(record['user_id']) + '.jpg')
    if imgSize == 0:
        imgSize = img.size
        dataSet = np.empty((labels.shape[0], imgSize + 1), dtype='float64')
    elif imgSize != img.size:
        raise ValueError("图片尺寸不一致")
    # NOTE(review): pixels are divided by 2550 rather than 255 -- kept as is
    # because previously saved weights depend on this scaling; confirm intent.
    dataSet[rowIdx, :-1] = np.asarray(img, dtype="float64").reshape(-1) / 2550
    dataSet[rowIdx, -1] = record['gender']

if dataSet is None:
    # Empty label table: keep dataSet a well-formed 2-D array.
    dataSet = np.empty((0, 1), dtype='float64')

# `labels` is deliberately left defined: deleting it made this cell fail when
# re-run and broke every other cell that reads the label table.

print(type(dataSet))
print(dataSet.shape)

# Report current system memory usage (psutil reports bytes; convert to GiB).
memUsage = psutil.virtual_memory()
print('memory used %.3f GB of %.3f GB | %.0f %%' % (memUsage.used / 1024 / 1024 / 1024, memUsage.total / 1024 / 1024 / 1024, memUsage.percent))

train set count: 2998
test set count: 2998
<class 'numpy.ndarray'>
(5996, 331777)
memory used 31.142 GB of 63.732 GB | 49 %


In [4]:

# Shuffle the samples in place (np.random.shuffle reorders along the first
# axis, i.e. whole rows), so the positional train/test split is random.
np.random.shuffle(dataSet)
print(dataSet.shape)

# Report current system memory usage; psutil reports bytes, 2 ** 30 bytes = 1 GiB.
memUsage = psutil.virtual_memory()
print(
    'memory used %.3f GB of %.3f GB | %.0f %%'
    % (memUsage.used / 2 ** 30, memUsage.total / 2 ** 30, memUsage.percent)
)

(5996, 331777)
memory used 30.801 GB of 63.732 GB | 48 %


In [5]:
# Split the data into training and test sets.
# After transposing, every column is one sample: all rows but the last hold
# the pixel features and the last row holds the label. The first
# trainSetCount columns form the training set, the remaining columns the
# test set. Only basic slicing is used, so the four arrays are views of
# dataSet rather than copies.
dataSetTrans = dataSet.T

trainSetX, testSetX = dataSetTrans[:-1, :trainSetCount], dataSetTrans[:-1, trainSetCount:]
trainSetY, testSetY = dataSetTrans[-1:, :trainSetCount], dataSetTrans[-1:, trainSetCount:]

# Report current system memory usage; psutil reports bytes, 2 ** 30 bytes = 1 GiB.
memUsage = psutil.virtual_memory()
print(
    'memory used %.3f GB of %.3f GB | %.0f %%'
    % (memUsage.used / 2 ** 30, memUsage.total / 2 ** 30, memUsage.percent)
)

memory used 30.825 GB of 63.732 GB | 48 %


In [12]:
# Sanity check: shapes and dtypes of everything handed to the training call.
print('trainSetX.shape', trainSetX.shape)
print('trainSetY.shape', trainSetY.shape)
print('testSetX.shape', testSetX.shape)
print('testSetY.shape', testSetY.shape)
print('w.shape:', w.shape)
print('trainSetX data type:', trainSetX.dtype)
print('trainSetY data type:', trainSetY.dtype)
print('testSetX data type:', testSetX.dtype)
print('testSetY data type:', testSetY.dtype)
print('w data type:', w.dtype)
# b can be a plain Python float (the fresh-start default is 0.), which has no
# .dtype attribute; np.asarray reports the dtype NumPy uses for it and leaves
# the output unchanged when b already is a NumPy scalar.
print('b data type:', np.asarray(b).dtype)

trainSetX.shape (331776, 2998)
trainSetY.shape (1, 2998)
testSetX.shape (331776, 2998)
testSetY.shape (1, 2998)
w.shape: (331776, 1)
trainSetX data type: float64
trainSetY data type: float64
testSetX data type: float64
testSetY data type: float64
w data type: float64
b data type: float64


In [6]:
import logicRegression
# Report system memory usage right before training starts; psutil reports
# bytes, 2 ** 30 bytes = 1 GiB.
memUsage = psutil.virtual_memory()
print(
    'memory used %.3f GB of %.3f GB | %.0f %%'
    % (memUsage.used / 2 ** 30, memUsage.total / 2 ** 30, memUsage.percent)
)

# Train starting from the current weights w and bias b; the returned value is
# kept in `result` for the cells that follow.
result = logicRegression.modelWithInitialWB(
    trainSetX, trainSetY, testSetX, testSetY,
    w, b, num_iterations, learning_rate,
    cost_record_cnt=100,
    print_cost=True,
)

# Disabled: comparison against the previously saved run.
# deltaCost = result['costs'][-1] - lastCost
# deltaTestAccuracy = result['testAccuracy'] - lastTestAccuracy
# print('delta test accuracy: %.4f' % deltaTestAccuracy)
# print('delta cost: %.4f' % deltaCost)

# Disabled: plot of the learning curve (recorded costs).
# costs = np.squeeze(result['costs'])
# plt.rcParams['figure.figsize'] = [30, 5]
# plt.rcParams['figure.dpi'] = 72
# plt.plot(costs)
# plt.ylabel('cost')
# plt.xlabel('iterations')
# plt.title("Learning rate =" + str(result["learning_rate"]))
# plt.show()

memory used 30.869 GB of 63.732 GB | 48 %
Progress: [....................] cost: 0.3592
Progress: [....................] cost: 0.3590


KeyboardInterrupt: 

In [5]:
# Save the model: keep only the fields needed to resume training later
# (parameters, cost history and the two accuracy figures).
saveData = {
    field: result[field]
    for field in ('w', 'b', 'costs', 'trainAccuracy', 'testAccuracy')
}
utils.save2File(saveData, saveDataFilePath)
print('save to', saveDataFilePath, 'done')

save to savedTrainingData.bin done
