In [1]:
import torch

Tensors
- 1D : Vectors 
- 2D : Matrix
- k > 2 : $k^{\text{th}}$-order tensor

Benefits of using tensors are clear:
- Supports automatic differentiation
- GPU accelerated

In [2]:
# Build a 1-D float32 tensor holding the values 0..11.
x = torch.arange(0, 12, dtype=torch.float32)
x.numel()  # number of elements; evaluated but not displayed (not the last line)
print(f"Shape of the tensor : {x.shape}")

# Arrange the same 12 elements as a 3 x 4 matrix.
y = x.reshape((3, 4))
print(f"Reshaped tensor : {y}")

# Reshape with an inferred dimension: -1 is resolved from the element count,
# so with 4 columns the row count becomes 12 / 4 = 3.
x = x.reshape((-1, 4))
print(f"{x}")

Shape of the tensor : torch.Size([12])
Reshaped tensor : tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]])
tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]])


In [3]:
# Creating tensors of a given shape.
# Only the final expression of the cell is rendered as its output.

torch.zeros(2, 3, 4)  # 2 x 3 x 4 tensor filled with zeros
torch.ones(2, 3, 4)   # 2 x 3 x 4 tensor filled with ones
torch.randn(2, 3, 4)  # 2 x 3 x 4 tensor sampled from the standard normal distribution

tensor([[[-0.2067, -0.9582,  0.4294, -1.0971],
         [ 0.4104,  1.4033,  0.3866,  1.5919],
         [-1.8843,  0.4369,  1.4990, -1.6875]],

        [[ 0.7495,  0.1277, -0.3730,  1.1108],
         [ 0.6534,  0.4261, -0.4578,  0.2413],
         [-0.2289,  0.1072, -1.1689, -0.1509]]])

In [4]:
# Indexing and slicing

x = torch.randn(2, 3)
print(f"Original tensor : {x}")
x[-1, :]   # last row
x[..., -1]  # last column
x[0, 1]    # element in the first row, second column

# Assignment through an index writes into the existing tensor.
x[0, 1] = 100
x  # evaluated but not displayed (not the cell's last expression)

# Challenge: overwrite part of the first row.
x = torch.arange(12).reshape((3, 4))
print(f"Original tensor : {x}")
x[0, 1:3] = 100  # columns 1 and 2 of the first row (stop index 3 is exclusive)
x

Original tensor : tensor([[-0.0723,  1.1196,  1.5397],
        [ 0.4552,  2.8176,  0.3035]])
Original tensor : tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])


tensor([[  0, 100, 100,   3],
        [  4,   5,   6,   7],
        [  8,   9,  10,  11]])

In [16]:
# Element-wise operations
x = torch.arange(12).reshape((3, 4))
y = x.exp()          # e ** value for every entry; the result is a float tensor
z = torch.div(x, y)  # element-wise division, same as x / y

print(f"Original tensor : {x}")
print(f"Exponential tensor : {y}")

# Concatenation: dim=0 stacks y below x, dim=1 places y to the right of x.
# Neither result is stored or displayed.
torch.cat([x, y], dim=0)
torch.cat([x, y], dim=1)

# Sum over every element; as the last expression this is the cell output.
x.sum()

Original tensor : tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])
Exponential tensor : tensor([[1.0000e+00, 2.7183e+00, 7.3891e+00, 2.0086e+01],
        [5.4598e+01, 1.4841e+02, 4.0343e+02, 1.0966e+03],
        [2.9810e+03, 8.1031e+03, 2.2026e+04, 5.9874e+04]])


tensor(66)

In [25]:
# Broadcasting: combining tensors whose shapes differ
a = torch.arange(4).unsqueeze(0)  # shape (1, 4): a single row
b = torch.arange(4).unsqueeze(1)  # shape (4, 1): a single column

# The row is repeated down the rows and the column across the columns, so the
# (4, 4) result satisfies mem[i, j] == a[0, j] + b[i, 0].
mem = torch.add(a, b)

In [34]:
# Saving memory
y = torch.arange(12).reshape((3, 4))
before = id(y)
y = torch.add(y, 1)  # produces a new tensor and rebinds the name y to it
before, id(y)  # the two ids differ; evaluated but not displayed

# Rebinding like this on every operation gives each result a new tensor object,
# which is wasteful when the same values are updated over and over.

# Alternative: write the result into a tensor that already exists.
z = torch.zeros_like(y)
id_z = id(z)
z[:] = y + 1  # slice assignment fills z; the name z is never rebound
print(f"Before : {id_z}, After : {id(z)}")

# The ids match: z[:] updated the contents of the same tensor object.

Before : 5635341472, After : 5635341472


In [38]:
# Converting between tensors, NumPy arrays and Python scalars
x = torch.arange(12).reshape((3, 4))
y = x.numpy()  # the tensor as a NumPy array

type(x), type(y)  # evaluated but not displayed (not the cell's last expression)

# A one-element tensor can be unwrapped into a plain Python number.
a = torch.tensor([3.5])
a, a.item(), float(a), int(a)  # int() drops the fractional part: 3.5 -> 3


(tensor([3.5000]), 3.5, 3.5, 3)

In [46]:
# Question 1: element-wise comparison
x = torch.arange(12).reshape((3, 4))
y = torch.randn(3, 4)

torch.gt(x, y)  # same as x > y: a boolean tensor with the same shape (not displayed)

# Question 2: adding a (1, 4) row to a (3, 4) matrix via broadcasting
x = torch.arange(12).reshape((3, 4))
y = torch.arange(4).reshape((1, 4))

# The single row of y is added to every row of x.
x + y


tensor([[ 0,  2,  4,  6],
        [ 4,  6,  8, 10],
        [ 8, 10, 12, 14]])

So basically, in broadcasting we expand the size-1 dimensions to make the shapes compatible. For example, with 1 x 4 and 4 x 1 tensors, both are expanded to 4 x 4 and then added element-wise: <br>
$\mathrm{result}[i,j] = a[0,j] + b[i,0]$

### Data Preprocessing

In [52]:
# Reading the dataset
import os
from pathlib import Path

import pandas as pd

# Directory that holds the CSV files. Set the DL_NOTEBOOK_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell;
# when it is unset, the original location is used.
DATA_DIR = Path(
    os.environ.get(
        "DL_NOTEBOOK_DATA_DIR",
        "/Users/krishuagarwal/Desktop/Programming/python/deep-learning/notebook",
    )
)

# Kept as a plain string so `path` has the same type as before.
path = str(DATA_DIR / "preprocessing.csv")
df = pd.read_csv(path)

df

Unnamed: 0,NumRooms,RoofType,Price
0,,,127500
1,2.0,,106000
2,4.0,Slate,178100
3,,,140000


In [65]:
# Data preparation: separate the features from the response.
# Positional selection with iloc: the first two columns become the features
# and the third column becomes the target.
inputs = df.iloc[:, :2]
targets = df.iloc[:, 2]

# Turn the categorical feature into indicator columns; dummy_na=True also adds
# an indicator column for the missing entries.
inputs = pd.get_dummies(inputs, dummy_na=True)
inputs  # evaluated but not displayed (not the cell's last expression)

# Imputation: replace the remaining missing values with an estimate, here the
# column mean (the median or the mode are alternatives).
inputs = inputs.fillna(value=inputs.mean())

In [73]:
# Conversion chain: pandas -> NumPy (float) -> torch tensor
x, y = (torch.tensor(frame.to_numpy(dtype=float)) for frame in (inputs, targets))
print(f"X : {x}")
print(f"Y : {y}")

X : tensor([[3., 0., 1.],
        [2., 0., 1.],
        [4., 1., 0.],
        [3., 0., 1.]], dtype=torch.float64)
Y : tensor([127500., 106000., 178100., 140000.], dtype=torch.float64)


Complexities : 
1. Gathering data from multiple sources
2. Myriad of data types beyond categorical and numerical : Text, Image, Audio
3. Real World : 
   1. Outliers + Faulty measurements & recording errors

#### Exercises

In [86]:
# UCI has a lot of datasets
import os
from pathlib import Path

# Directory that holds the CSV files. Set the DL_NOTEBOOK_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell;
# when it is unset, the original location is used.
DATA_DIR = Path(
    os.environ.get(
        "DL_NOTEBOOK_DATA_DIR",
        "/Users/krishuagarwal/Desktop/Programming/python/deep-learning/notebook",
    )
)

# Fields are separated by ';' and quoted with double quotes.
data = pd.read_csv(DATA_DIR / "bank-full.csv", sep=";", quotechar='"')
data.head()  # evaluated but not displayed (not the cell's last expression)

# Summary statistics of the numeric columns; this is the cell output.
data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [91]:
# Count the missing values in each column (evaluated but not displayed).
# Author's note: no missing values were found.
data.isna().sum(axis=0)

# Label-based indexing: .loc slices include both endpoints, so this selects
# every column from "age" through "balance". The column index is shown with it.
data.loc[:, slice("age", "balance")], data.columns

# Notes on scaling up:
# - Large datasets: load them in chunks, or use an out-of-core tool such as Dask.
# - Columns with many categories: category encodings may be the answer (open question).
# - Other options: parallel processing, chunking.

(       age           job   marital  education default  balance
 0       58    management   married   tertiary      no     2143
 1       44    technician    single  secondary      no       29
 2       33  entrepreneur   married  secondary      no        2
 3       47   blue-collar   married    unknown      no     1506
 4       33       unknown    single    unknown      no        1
 ...    ...           ...       ...        ...     ...      ...
 45206   51    technician   married   tertiary      no      825
 45207   71       retired  divorced    primary      no     1729
 45208   72       retired   married  secondary      no     5715
 45209   57   blue-collar   married  secondary      no      668
 45210   37  entrepreneur   married  secondary      no     2971
 
 [45211 rows x 6 columns],
 Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
        'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
        'previous', 'poutcome', 'y'],
       dty