<a href="https://colab.research.google.com/github/kartiknarayansahoo/deep_learning_specialization/blob/main/jovian-dl/Assignment%202%20-%20Train%20your%20first%20model/jovian_training_first_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import torch
import torchvision
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn.functional as F
from torchvision.datasets.utils import download_url
from torch.utils.data import DataLoader, TensorDataset, random_split

In [2]:
DATASET_URL = "https://gist.github.com/BirajCoder/5f068dfe759c1ea6bdfce9535acdb72d/raw/c84d84e3c80f93be67f6c069cbdc0195ec36acbd/insurance.csv"
DATA_FILENAME = "insurance.csv"
download_url(DATASET_URL, '.')

Using downloaded and verified file: ./insurance.csv


In [3]:
dataframe_raw = pd.read_csv(DATA_FILENAME)
dataframe_raw.head()  

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
def customize_dataset(dataframe_raw, rand_str):
    dataframe = dataframe_raw.copy(deep=True)
    # drop some rows
    dataframe = dataframe.sample(int(0.95*len(dataframe)), random_state=int(ord(rand_str[0])))
    # scale input
    dataframe.bmi = dataframe.bmi * ord(rand_str[1])/100.
    # scale target
    dataframe.charges = dataframe.charges * ord(rand_str[2])/100.
    # drop column
    if ord(rand_str[3]) % 2 == 1:
        dataframe = dataframe.drop(['region'], axis=1)
    return dataframe

In [7]:
dataframe = customize_dataset(dataframe_raw, 'Kartik')

In [8]:
num_rows = len(dataframe)
print(num_rows)

1271


In [9]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1271 entries, 1307 to 661
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1271 non-null   int64  
 1   sex       1271 non-null   object 
 2   bmi       1271 non-null   float64
 3   children  1271 non-null   int64  
 4   smoker    1271 non-null   object 
 5   region    1271 non-null   object 
 6   charges   1271 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 79.4+ KB


In [11]:
num_columns = len(dataframe.columns)
print(num_columns)

7


In [12]:
rows, columns = dataframe.shape
print(rows, columns)

1271 7


In [19]:
input_cols = list(dataframe.columns[:-1])
print(input_cols)

['age', 'sex', 'bmi', 'children', 'smoker', 'region']


In [26]:
categorical_cols = list(dataframe.select_dtypes(include=['object']).columns)
print(categorical_cols)

['sex', 'smoker', 'region']


In [51]:
output_cols = list(dataframe.columns[-1].string)
print(output_cols)

AttributeError: ignored

# Preparing the dataset for training

In [27]:
def dataframe_to_arrays(dataframe):
  # make a copy of the original dataframe
  dataframe1 = dataframe.copy(deep=True)
  # convert non-numeric categorical columns to numbers
  for col in categorical_cols:
    dataframe1[col] = dataframe1[col].astype('category').cat.codes
  # extract input and outputs as numpy arrays
  inputs_array = dataframe1[input_cols].to_numpy()
  targets_array = dataframe1[output_cols].to_numpy()
  return inputs_array, targets_array

In [28]:
inputs_array, targets_array = dataframe_to_arrays(dataframe)
inputs_array, targets_array

(array([[32.     ,  1.     , 27.2764 ,  4.     ,  1.     ,  1.     ],
        [59.     ,  0.     , 34.144  ,  0.     ,  0.     ,  2.     ],
        [37.     ,  0.     , 33.08185,  1.     ,  0.     ,  1.     ],
        ...,
        [57.     ,  0.     , 19.497  ,  1.     ,  0.     ,  3.     ],
        [23.     ,  1.     , 33.368  ,  0.     ,  0.     ,  3.     ],
        [57.     ,  0.     , 23.2606 ,  1.     ,  0.     ,  2.     ]]),
 array([24478.625832 , 13958.76534  ,  6968.082363 , ..., 13716.85164  ,
         2082.60102  , 25299.3783054]))

In [34]:
# converting array dtype to float32 for faster computation on GPU
inputs_array = inputs_array.astype('float32')
targets_array = targets_array.astype('float32')

In [35]:
inputs_array.dtype, targets_array.dtype

(dtype('float32'), dtype('float32'))

In [37]:
from numpy import dtype
inputs = torch.from_numpy(inputs_array)
targets = torch.from_numpy(targets_array)

In [38]:
inputs.dtype, targets.dtype

(torch.float32, torch.float32)

In [39]:
dataset = TensorDataset(inputs, targets)

In [43]:
val_percent = 0.15
val_size = int(num_rows*val_percent)
train_size = num_rows-val_size
train_ds, val_ds = random_split(dataset, lengths = [train_size, val_size])

In [44]:
batch_size = 256

In [45]:
train_loader = DataLoader(train_ds, batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size)

In [48]:
for xb, yb in train_loader:
  print('inputs: ', xb)
  print('outputs: ', yb)
  break

inputs:  tensor([[42.0000,  1.0000, 24.1142,  0.0000,  0.0000,  2.0000],
        [28.0000,  0.0000, 25.1569,  1.0000,  0.0000,  1.0000],
        [58.0000,  0.0000, 27.3685,  0.0000,  0.0000,  1.0000],
        ...,
        [51.0000,  0.0000, 19.9820,  0.0000,  0.0000,  3.0000],
        [47.0000,  0.0000, 43.9604,  1.0000,  0.0000,  2.0000],
        [26.0000,  0.0000, 16.6791,  2.0000,  1.0000,  0.0000]])
outputs:  tensor([ 6802.2515,  4712.3516, 13935.7598, 16505.2969, 10906.3691, 55729.0547,
        14171.2871,  9808.3584,  2254.7090, 23921.8672,  3621.5701,  9748.5352,
         3615.6409, 10516.7920,  1860.1018, 24194.9902, 24712.1035,  8880.1836,
         5869.3501,  2436.9250, 12458.8877,  3255.1987, 22970.2285, 11897.1494,
        42368.2852,  2396.5818,  9068.1494, 50070.1484, 42064.5547,  9707.9648,
         9407.6045, 14948.3691,  6234.6675,  3979.2261,  3507.8889,  5411.6401,
         1832.5615,  2624.6221, 11063.3213, 21887.9355, 22166.4629,  1960.1573,
         3109.2305,  40


# Create a linear regression model

In [50]:
input_size = len(input_cols)
output_size = len(output_cols)
input_size, output_size

(6, 7)