In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load IPython's autoreload extension; mode 2 is meant to re-import edited modules
# automatically so code changes are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Alternative interactive backends, currently disabled:
#%matplotlib notebook
#%matplotlib widget
import matplotlib 
import numpy as np
import pandas as pd
import os, sys, time, random
#import ipywidgets
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from mpl_toolkits.mplot3d.art3d import Poly3DCollection    
 
# Global plot styling: render text through LaTeX, pick some nice-looking serif fonts
# and tweak legend, tick and figure defaults.
matplotlib.rc('font', family='serif', size=16)
matplotlib.rc('legend', fontsize=16, numpoints=1, handlelength=1.5, frameon=True)
# Same label padding for major and minor x ticks.
matplotlib.rc(('xtick.major', 'xtick.minor'), pad=7)
# Ticks point into the axes and are also drawn on the top and right edges.
matplotlib.rc(('xtick', 'ytick'), direction="in")
matplotlib.rc('xtick', top=True)
matplotlib.rc('ytick', right=True)
matplotlib.rc('text', usetex=True)
# Optional LaTeX preamble, currently disabled:
# matplotlib.rc('text.latex', 
#               preamble=[r'\usepackage[T1]{fontenc}',
#                         r'\usepackage{amsmath}',
#                         r'\usepackage{txfonts}',
#                         r'\usepackage{textcomp}'])

# Default figure size in inches.
matplotlib.rc('figure', figsize=(12, 9))

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [20]:
# Place tensors on the CPU or the GPU: `a` is created without a device argument,
# `b` is created directly on `device`; then report whether each one is a CUDA tensor.
a = torch.tensor([1, 2, 3, 4])
b = torch.tensor([[1, 2], [3, 4.]], device=device)
print(*(t.is_cuda for t in (a, b)))

False True


In [58]:
# Random-matrix generation and matrix multiplication on the GPU and on the CPU

def cpu_matrix_mult():
    """Multiply two freshly sampled 1000 x 1000 uniform random matrices.

    No device argument is passed to ``torch.rand``, so the operands and the
    product live on torch's default device (the CPU in the recorded run).

    Returns:
        torch.Tensor: the 1000 x 1000 matrix product.
    """
    shape = (1000, 1000)
    lhs = torch.rand(shape)
    rhs = torch.rand(shape)
    return torch.matmul(lhs, rhs)

def gpu_matrix_mult(size=1000, target_device=None):
    """Multiply two random square matrices created directly on a torch device.

    CUDA kernels are queued asynchronously, so without an explicit
    synchronization a timer around this function can stop before the GPU has
    actually finished. The function therefore waits for the device before
    returning, which makes ``%timeit gpu_matrix_mult()`` measure completed work.

    Args:
        size: Side length of the square operands (default 1000, the original
            hard-coded value).
        target_device: Device (``torch.device`` or string) to run on. ``None``
            uses the notebook-level ``device``.

    Returns:
        torch.Tensor: the ``size`` x ``size`` product, located on the chosen device.
    """
    if target_device is None:
        target_device = device  # notebook-level default
    target_device = torch.device(target_device)

    matrix_g1 = torch.rand((size, size), device=target_device)
    matrix_g2 = torch.rand((size, size), device=target_device)
    matrix_g3 = matrix_g1 @ matrix_g2

    # Only CUDA devices have an asynchronous queue to drain.
    if target_device.type == 'cuda':
        torch.cuda.synchronize(target_device)
    return matrix_g3

def gpu_matrix_mult_from_numpy(size=1000, target_device=None):
    """Multiply two NumPy-generated random matrices after moving them to a torch device.

    The operands are sampled with ``np.random.random``, wrapped with
    ``torch.from_numpy``, cast with ``.float()`` and copied to the target device
    before the product is computed there. As CUDA work is queued asynchronously,
    the function waits for the device before returning so that timing it covers
    the finished computation.

    Args:
        size: Side length of the square operands (default 1000, the original
            hard-coded value).
        target_device: Device (``torch.device`` or string) to run on. ``None``
            uses the notebook-level ``device``.

    Returns:
        torch.Tensor: the ``size`` x ``size`` float32 product on the chosen device.
    """
    if target_device is None:
        target_device = device  # notebook-level default
    target_device = torch.device(target_device)

    d1 = np.random.random((size, size))
    d2 = np.random.random((size, size))
    # Cast on the host first, then copy to the device (same order as the original code).
    xd1 = torch.from_numpy(d1).float().to(target_device)
    xd2 = torch.from_numpy(d2).float().to(target_device)
    xd3 = xd1 @ xd2

    # Only CUDA devices have an asynchronous queue to drain.
    if target_device.type == 'cuda':
        torch.cuda.synchronize(target_device)
    return xd3
    

In [59]:
# Benchmark the three helpers with IPython's %timeit.
# Random generation + matmul with no device argument (torch's default device).
%timeit cpu_matrix_mult()
# Same work with both operands created directly on `device`.
%timeit gpu_matrix_mult()
# Operands generated with NumPy, cast with .float() and copied to `device` before the matmul.
%timeit gpu_matrix_mult_from_numpy()

17.1 ms ± 13.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
87.9 µs ± 171 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
11.9 ms ± 64.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [66]:
# 1000 X 1000 난수행렬 생성 소요 시간.
# GPU 상의 torch로 생성하는것이 CPU 상의 torch로 생성하는 것보다 590 배 가량 빠르며 numpy를 
# 사용하는 것보다 670배 정도 빠르다.

%timeit np.random.random((1000, 1000))
%timeit torch.rand((1000, 1000))
%timeit torch.rand((1000, 1000), device=device)

3.33 ms ± 19.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.93 ms ± 4.88 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.94 µs ± 46.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [72]:
# 1000 X 1000 난수 행렬곱 계산 시간
# GPU 상의 torch로 계산하는 것이 CPU 상의 torch 로 계산하는 것보다 19배 정도 빠르고, numpy
# 를 사용하는 것보다 44배 정도 빠르다.

n1, n2 = np.random.random((1000, 1000)), np.random.random((1000, 1000))
%timeit n1@n2

m1 = torch.rand((1000, 1000))
m2 = torch.rand((1000, 1000))
%timeit m1@m2

m3, m4 = torch.rand((1000, 1000), device=device), torch.rand((1000, 1000), device=device)
%timeit m3@m4

3.3 ms ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1.42 ms ± 7.06 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
75.7 µs ± 307 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [75]:
# numpy array를 GPU상의 torch.tensor로 변환 할 때, 형변환(float)가 들어가면 수행시간이 대폭 증가한다.
%timeit torch.from_numpy(np.random.random((1000, 1000))).to(device)
%timeit torch.from_numpy(np.random.random((1000, 1000))).float().to(device)
%timeit torch.rand((1000, 1000), device=device)

3.96 ms ± 1.43 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
6.29 ms ± 19 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.92 µs ± 26.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [31]:
# Run each helper once and report, one per line, whether its result is a CUDA tensor.
c3 = cpu_matrix_mult()
g3 = gpu_matrix_mult()
print(c3.is_cuda, g3.is_cuda, sep='\n')

False
True


In [34]:
# Display a freshly generated 1000 x 1000 NumPy random matrix
# (the displayed repr of an array this large is abbreviated with "...").
np.random.random((1000, 1000))

array([[0.94925198, 0.91593598, 0.6289162 , ..., 0.67891489, 0.21645577,
        0.81899608],
       [0.41592377, 0.37100587, 0.54598888, ..., 0.9489376 , 0.4611625 ,
        0.26045436],
       [0.3605144 , 0.86983802, 0.76815613, ..., 0.48866185, 0.15687152,
        0.07878043],
       ...,
       [0.51931825, 0.58270631, 0.66755306, ..., 0.08056379, 0.9736298 ,
        0.77114408],
       [0.71625255, 0.32417488, 0.17021884, ..., 0.91369809, 0.74663509,
        0.11026505],
       [0.33236987, 0.95652262, 0.28802287, ..., 0.22659897, 0.84892104,
        0.54240689]])

In [76]:
# When running on a CUDA device, print the GPU's name and how much memory torch
# currently has allocated and reserved ("cached") on device 0.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    # Dividing by 1024**3 converts the byte counts to GiB before rounding to one decimal.
    for label, query in (('Allocated:', torch.cuda.memory_allocated),
                         ('Cached:   ', torch.cuda.memory_reserved)):
        print(label, round(query(0) / 1024**3, 1), 'GB')


GeForce RTX 3090
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [74]:
# Speed-up of the GPU matmul over NumPy in the recorded run:
# 3.3 ms (= 3300 µs) for NumPy divided by 75.7 µs for torch on the GPU.
3300/75.7

43.59313077939234