# Pytorch programming
Jay Hineman

## Overview of pytorch
Good summary from github: [README.md](https://github.com/pytorch/pytorch)

> PyTorch is a Python package that provides two high-level features:
> - Tensor computation (like NumPy) with strong GPU acceleration
> - Deep neural networks built on a tape-based autograd system
> 
> You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend > PyTorch when needed.

## Activity
1. We'll try at fare prediction again from the taxi cab data.
2. We'll use couple possible inputs to make our dnn a little more flavorful.

*As with pervious, my top goal is to introduce programming and software patterns. I'll continue to use the nyc taxi data, because I think it illustrates the challenges that you have to write software around.*

## Other Resources
* [Introduction to PyTorch](https://pytorch.org/tutorials/beginner/basics/intro.html)

In [1]:
# borrowing and building from last time

import pandas as pd
from sklearn.model_selection import train_test_split

def raw_taxi_df(filename: str) -> pd.DataFrame:
    """Load raw taxi dataframe from parquet"""
    return pd.read_parquet(path=filename)

def clean_taxi_df(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Make a clean taxi DataFrame that throws out non-numerical or outlying numerical values"""
    # drop nans
    clean_df = raw_df.dropna()
    # remove trips longer than 100
    clean_df = clean_df[clean_df["trip_distance"] < 100]
    # add columns for travel time deltas and time minutes
    clean_df["time_deltas"] = clean_df["tpep_dropoff_datetime"] - clean_df["tpep_pickup_datetime"]
    clean_df["time_mins"] = pd.to_numeric(clean_df["time_deltas"]) / 6**10
    return clean_df

def split_taxi_data(clean_df: pd.DataFrame, 
                    x_columns: list[str], 
                    y_column: str, 
                    train_size: int) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split an x, y dataset selected from a clean dataframe; return x_train, y_train, x_test, y_test"""
    return train_test_split(clean_df[x_columns], clean_df[[y_column]], train_size=train_size)   

In [2]:
raw_df = raw_taxi_df(filename="yellow_tripdata_2024-01.parquet")
clean_df = clean_taxi_df(raw_df=raw_df)
clean_df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,time_deltas,time_mins
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,...,1.0,0.5,0.00,0.0,1.0,22.70,2.5,0.00,0 days 00:19:48,19.647348
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.80,1.0,N,140,236,1,...,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.00,0 days 00:06:36,6.549116
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.70,1.0,N,236,79,1,...,3.5,0.5,3.00,0.0,1.0,31.30,2.5,0.00,0 days 00:17:55,17.778535
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.40,1.0,N,79,211,1,...,3.5,0.5,2.00,0.0,1.0,17.00,2.5,0.00,0 days 00:08:18,8.236010
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.80,1.0,N,211,148,1,...,3.5,0.5,3.20,0.0,1.0,16.10,2.5,0.00,0 days 00:06:06,6.052971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2824457,2,2024-01-31 23:10:28,2024-01-31 23:18:30,1.0,3.51,1.0,N,138,129,1,...,6.0,0.5,4.76,0.0,1.0,30.31,0.0,1.75,0 days 00:08:02,7.971399
2824458,2,2024-01-31 23:01:04,2024-01-31 23:17:35,1.0,3.36,1.0,N,162,261,1,...,1.0,0.5,5.85,0.0,1.0,29.25,2.5,0.00,0 days 00:16:31,16.389328
2824459,1,2024-01-31 23:08:13,2024-01-31 23:25:00,3.0,3.30,1.0,N,43,249,1,...,3.5,0.5,1.00,0.0,1.0,23.70,2.5,0.00,0 days 00:16:47,16.653939
2824460,1,2024-01-31 23:49:46,2024-01-31 23:53:10,0.0,0.40,1.0,N,142,163,2,...,3.5,0.5,0.00,0.0,1.0,10.10,2.5,0.00,0 days 00:03:24,3.373787


In [3]:
# Check some assumptions about the PU/DO locations
print(f"PU min: {clean_df[['PULocationID']].min()}")
print(f"PU max: {clean_df[['PULocationID']].max()}")
print(f"DO min: {clean_df[['DOLocationID']].min()}")
print(f"DO max: {clean_df[['DOLocationID']].max()}")

PU min: PULocationID    1
dtype: int32
PU max: PULocationID    265
dtype: int32
DO min: DOLocationID    1
dtype: int32
DO max: DOLocationID    265
dtype: int32


In [4]:
location_ids = ['PULocationID', 'DOLocationID']
X_train, X_test, y_train, y_test = split_taxi_data(clean_df=clean_df, 
                                                   x_columns=location_ids, 
                                                   y_column="fare_amount", 
                                                   train_size=500000)

In [5]:
X_train

Unnamed: 0,PULocationID,DOLocationID
1025027,48,148
2167974,230,42
1547620,246,186
1933536,163,263
1702133,143,97
...,...,...
2267168,68,230
2181935,161,113
1312969,100,246
1099323,263,239


In [8]:
y_train

Unnamed: 0,fare_amount
447365,27.5
651533,10.0
1754327,10.0
1382983,11.4
2666024,35.9
...,...
1351504,9.3
1887470,10.0
1582068,20.5
1359707,14.2


In [8]:
"""
NYC Taxi example adaptation of
https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-create-a-neural-network-for-regression-with-pytorch.md
"""

import torch
from torch import nn
from sklearn.preprocessing import OneHotEncoder

class NYCTaxiExampleDataset(torch.utils.data.Dataset):
    """Trainin data object for our nyc taxi data"""
    def __init__(self, X_train: pd.DataFrame, y_train: pd.DataFrame):
        self.X_train = X_train
        self.y_train = y_train
        self.one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
        self.X = torch.from_numpy(self._one_hot_X().toarray()) # potentially smarter ways to deal with sparse here
        self.y = torch.from_numpy(self.y_train.values)
        self.X_enc_shape = self.X.shape[-1]
        print(f"encoded shape is {self.X_enc_shape}")
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]
        
    def _one_hot_X(self):
        return self.one_hot_encoder.fit_transform(self.X_train)

class MLP(nn.Module):
    """Multilayer Perceptron for regression. """
    def __init__(self, encoded_shape):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(encoded_shape, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1))
    
    def forward(self, x):
        return self.layers(x)

def main():
    """Simple training loop"""
    # Set fixed random number seed
    torch.manual_seed(42)
  
    # load data
    raw_df = raw_taxi_df(filename="yellow_tripdata_2024-01.parquet")
    clean_df = clean_taxi_df(raw_df=raw_df)
    location_ids = ['PULocationID', 'DOLocationID']
    X_train, X_test, y_train, y_test = split_taxi_data(clean_df=clean_df, 
                                                   x_columns=location_ids, 
                                                   y_column="fare_amount", 
                                                   train_size=500000)

    # Pytorch
    dataset = NYCTaxiExampleDataset(X_train=X_train, y_train=y_train)
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True, num_workers=1)
  
    # Initialize the MLP
    mlp = MLP(encoded_shape=dataset.X_enc_shape)
  
    # Define the loss function and optimizer
    loss_function = nn.L1Loss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)
  
    # Run the training loop
    for epoch in range(0, 5): # 5 epochs at maximum
        print(f'Starting epoch {epoch+1}')
        current_loss = 0.0
    
        # Iterate over the DataLoader for training data
        for i, data in enumerate(trainloader, 0):
            # Get and prepare inputs
            inputs, targets = data
            inputs, targets = inputs.float(), targets.float()
            targets = targets.reshape((targets.shape[0], 1))
            
            # Zero the gradients
            optimizer.zero_grad()
            
            # Perform forward pass
            outputs = mlp(inputs)
            
            # Compute loss
            loss = loss_function(outputs, targets)
            
            # Perform backward pass
            loss.backward()
            
            # Perform optimization
            optimizer.step()
            
            # Print statistics
            current_loss += loss.item()
            if i % 10 == 0:
                print('Loss after mini-batch %5d: %.3f' % (i + 1, current_loss / 500))
            current_loss = 0.0
    # Process is complete.
    print('Training process has finished.')
    return X_train, X_test, y_train, y_test, data, mlp

In [None]:
main()

## One hot encoding scratch work

In [11]:
# One hot encoder example
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
X = [['Male', 1], ['Female', 3], ['Female', 2]]
enc.fit(X)
enc.categories_
enc.transform([['Female', 1], ['Male', 4]]).toarray()
enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
enc.get_feature_names_out(['gender', 'group'])

array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'],
      dtype=object)

In [12]:
# for the taxi dataset
enc_loc = OneHotEncoder(handle_unknown='ignore')
x_enc = enc_loc.fit_transform(X=X_train)

In [13]:
enc_loc.transform([[1, 129], [2, 32]]).toarray().shape



(2, 502)

In [14]:
torch.tensor(x_enc.toarray()).shape

torch.Size([500000, 502])

In [28]:
x_enc.shape[-1]

502