# Data Version Control with DVC

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

## 1. Initialize DVC

Before running this notebook, `git init` and `dvc init` were run so that both Git and DVC are already initialized.

In [2]:
# print only the first line of the `dvc version` report (the version number)
!dvc version | head -1

DVC version: 3.66.1


## 2. Load Raw Data

In [3]:
# Read the raw SMS file: tab-separated, no header row, so the two column
# names are supplied here.
# NOTE(review): pandas' default quote handling is in effect; if any message
# contains an unbalanced double quote, adjacent rows could be merged — confirm
# the row count below against the dataset's documented size.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
    encoding='latin-1',
)
print(f"Total messages loaded: {len(df)}")
print("\nFirst few rows:")
df.head()

Total messages loaded: 5572

First few rows:


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# number of messages per class, most frequent first
label_counts = df['label'].value_counts()
print("Label counts:")
print(label_counts)

Label counts:
label
ham     4825
spam     747
Name: count, dtype: int64


In [5]:
# convert labels to binary and save as raw_data.csv
# Build a new frame instead of overwriting df['label']: re-running this cell on
# an already-converted column would otherwise map 0/1 to NaN and silently
# write a file with empty labels.
label_to_int = {'ham': 0, 'spam': 1}
raw_df = df.assign(label=df['label'].map(label_to_int))

# Any value other than 'ham'/'spam' becomes NaN in the mapping above;
# stop here rather than save a corrupted label column.
if raw_df['label'].isna().any():
    raise ValueError("unexpected label value(s); expected only 'ham' or 'spam'")

raw_df.to_csv('raw_data.csv', index=False)
print(f"Saved raw_data.csv with {len(raw_df)} rows")

Saved raw_data.csv with 5572 rows


## 3. Track Raw Data with DVC

In [6]:
# start tracking raw_data.csv with DVC; the output below names the files
# (raw_data.csv.dvc and .gitignore) that then need to be staged in git
!dvc add raw_data.csv

To track the changes with git, run:

	git add raw_data.csv.dvc .gitignore

To enable auto staging, run:

	dvc config core.autostage true


In [7]:
# stage the two files listed by `dvc add`, then commit them to git
!git add raw_data.csv.dvc .gitignore
!git commit -m "track raw data with dvc"

[main ffd78c9] track raw data with dvc
 2 files changed, 6 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 raw_data.csv.dvc


## 4. Split Data - Version 1 (seed=42)

70% train, 15% validation, 15% test with stratified split.

In [8]:
def split_and_save(df, random_state, test_size=0.15, val_size=0.15, out_dir='.'):
    """
    Split data into train/val/test and save to csv files.

    Both splits are stratified on the 'label' column. With the default
    fractions this gives 70% train, 15% validation, 15% test.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; must contain a 'label' column.
    random_state : int
        Seed passed to both train_test_split calls.
    test_size : float, default 0.15
        Fraction of df held out as the test set.
    val_size : float, default 0.15
        Fraction of df (the whole frame, not the remainder) used for validation.
    out_dir : str, default '.'
        Directory that receives train.csv, validation.csv and test.csv.
        It is created if it does not exist.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, val, test)

    Raises
    ------
    ValueError
        If a fraction is outside (0, 1) or the two together leave no
        data for training.
    """
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must each be in (0, 1) and sum to less than 1; "
            f"got test_size={test_size}, val_size={val_size}"
        )

    # first separate out the test set
    train_val, test = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=df['label']
    )

    # then split the remainder into train and validation; val_size is a
    # fraction of the full data, so rescale it to the part that is left
    val_ratio = val_size / (1 - test_size)
    train, val = train_test_split(
        train_val, test_size=val_ratio, random_state=random_state, stratify=train_val['label']
    )

    os.makedirs(out_dir, exist_ok=True)
    train.to_csv(os.path.join(out_dir, 'train.csv'), index=False)
    val.to_csv(os.path.join(out_dir, 'validation.csv'), index=False)
    test.to_csv(os.path.join(out_dir, 'test.csv'), index=False)

    return train, val, test

In [9]:
# reload the saved raw data and create the first split
df = pd.read_csv('raw_data.csv')
train, val, test = split_and_save(df, random_state=42)
splits = {"Train": train, "Validation": val, "Test": test}

print("Version 1 split (seed=42):")
for name, part in splits.items():
    heading = f"{name}:"
    print(f"  {heading:<11} {len(part)} rows")
print()
print("Target distribution:")
for name, part in splits.items():
    n_zeros = (part['label'] == 0).sum()
    n_ones = (part['label'] == 1).sum()
    print(f"  {name:<10} - 0s: {n_zeros}, 1s: {n_ones}")

Version 1 split (seed=42):
  Train:      3900 rows
  Validation: 836 rows
  Test:       836 rows

Target distribution:
  Train      - 0s: 3377, 1s: 523
  Validation - 0s: 724, 1s: 112
  Test       - 0s: 724, 1s: 112


## 5. Track Split Data with DVC and Tag as v1

In [10]:
# track all three split files with DVC in one command
!dvc add train.csv validation.csv test.csv

To track the changes with git, run:

	git add validation.csv.dvc .gitignore train.csv.dvc test.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


In [11]:
# commit the .dvc files for the seed=42 split and tag that commit as v1,
# so this version of the data can be checked out by name later
# NOTE(review): `git tag v1` will presumably fail if the tag already exists — confirm before re-running this cell
!git add train.csv.dvc validation.csv.dvc test.csv.dvc .gitignore
!git commit -m "v1: data split with random seed 42"
!git tag v1

[main 3319314] v1: data split with random seed 42
 4 files changed, 18 insertions(+)
 create mode 100644 test.csv.dvc
 create mode 100644 train.csv.dvc
 create mode 100644 validation.csv.dvc


## 6. Update Split - Version 2 (seed=0)

Change the random seed so that different samples end up in each split.

In [12]:
# same data, different seed: overwrites the three csv files with the new split
train, val, test = split_and_save(df, random_state=0)
splits = {"Train": train, "Validation": val, "Test": test}

print("Version 2 split (seed=0):")
for name, part in splits.items():
    heading = f"{name}:"
    print(f"  {heading:<11} {len(part)} rows")
print()
print("Target distribution:")
for name, part in splits.items():
    n_zeros = (part['label'] == 0).sum()
    n_ones = (part['label'] == 1).sum()
    print(f"  {name:<10} - 0s: {n_zeros}, 1s: {n_ones}")

Version 2 split (seed=0):
  Train:      3900 rows
  Validation: 836 rows
  Test:       836 rows

Target distribution:
  Train      - 0s: 3377, 1s: 523
  Validation - 0s: 724, 1s: 112
  Test       - 0s: 724, 1s: 112


Counts are the same due to stratified splitting, but the actual rows in each split are different.

## 7. Track Updated Data and Tag as v2

In [13]:
# run `dvc add` again on the same paths so the existing .dvc files are
# updated to describe the seed=0 split
!dvc add train.csv validation.csv test.csv

To track the changes with git, run:

	git add validation.csv.dvc train.csv.dvc test.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


In [14]:
# commit the updated .dvc files and tag that commit as v2
# NOTE(review): `git tag v2` will presumably fail if the tag already exists — confirm before re-running this cell
!git add train.csv.dvc validation.csv.dvc test.csv.dvc
!git commit -m "v2: updated data split with random seed 0"
!git tag v2

[main 4a85bb5] v2: updated data split with random seed 0
 3 files changed, 6 insertions(+), 6 deletions(-)


## 8. Checkout Version 1 and Print Distribution

In [15]:
# checkout v1 .dvc files from git, then restore the actual data
# (git only restores the three .dvc files from tag v1; `dvc checkout` then
# updates the csv files to match them, reported as "M" in the output)
!git checkout v1 -- train.csv.dvc validation.csv.dvc test.csv.dvc
!dvc checkout

M	test.csv
M	train.csv
M	validation.csv


In [16]:
# reload the split files from disk so the counts reflect what is checked out
train, val, test = (
    pd.read_csv(path) for path in ('train.csv', 'validation.csv', 'test.csv')
)

print("Version 1 (seed=42) - Target distribution:")
for filename, part in (('train.csv', train), ('validation.csv', val), ('test.csv', test)):
    n_zeros = (part['label'] == 0).sum()
    n_ones = (part['label'] == 1).sum()
    print(f"  {filename:<14} - 0s: {n_zeros}, 1s: {n_ones}")

Version 1 (seed=42) - Target distribution:
  train.csv      - 0s: 3377, 1s: 523
  validation.csv - 0s: 724, 1s: 112
  test.csv       - 0s: 724, 1s: 112


## 9. Checkout Version 2 and Print Distribution

In [17]:
# checkout v2
# (restore the three .dvc files from tag v2 with git, then let `dvc checkout`
# update the csv files to match them)
!git checkout v2 -- train.csv.dvc validation.csv.dvc test.csv.dvc
!dvc checkout

M	test.csv
M	train.csv
M	validation.csv


In [18]:
# reload the split files from disk so the counts reflect what is checked out
train, val, test = (
    pd.read_csv(path) for path in ('train.csv', 'validation.csv', 'test.csv')
)

print("Version 2 (seed=0) - Target distribution:")
for filename, part in (('train.csv', train), ('validation.csv', val), ('test.csv', test)):
    n_zeros = (part['label'] == 0).sum()
    n_ones = (part['label'] == 1).sum()
    print(f"  {filename:<14} - 0s: {n_zeros}, 1s: {n_ones}")

Version 2 (seed=0) - Target distribution:
  train.csv      - 0s: 3377, 1s: 523
  validation.csv - 0s: 724, 1s: 112
  test.csv       - 0s: 724, 1s: 112


Same counts because of stratified split, but the actual samples are different. DVC restored the correct files for each version.

## 10. Bonus: Google Drive as Remote Storage

Using Google Drive as a DVC remote to decouple compute and storage.

In [19]:
!pip install dvc-gdrive -q

Add the Google Drive folder as a DVC remote:

In [None]:
# google drive folder ID from the URL
GDRIVE_FOLDER_ID = "128DkHX4kfsXdNVKr43zgZoy4vQMU4ZAc"

!dvc remote add -d gdrive gdrive://{GDRIVE_FOLDER_ID}

# to use our own GCP OAuth credentials (needed to avoid "app blocked" error)
# credentials removed before pushing to github
!dvc remote modify gdrive gdrive_client_id '<YOUR_CLIENT_ID>'
!dvc remote modify gdrive gdrive_client_secret '<YOUR_CLIENT_SECRET>'

In [21]:
# verify the remote is configured
# (the output shows each remote's name followed by its URL)
!dvc remote list

gdrive	gdrive://128DkHX4kfsXdNVKr43zgZoy4vQMU4ZAc


Push data to Google Drive:

In [22]:
# push data to google drive (opens browser for authentication on first run)
# no remote is named here, so the default remote is used
!dvc push

Authentication successful.
4 files pushed


The data is now on Google Drive and can be pulled on any machine with `dvc pull`.

In [23]:
# commit the remote config to git
# NOTE(review): .dvc/config may also contain the gdrive client id/secret set
# through `dvc remote modify` — check its contents before pushing this commit
!git add .dvc/config
!git commit -m "configure google drive as dvc remote storage"

[main 2364369] configure google drive as dvc remote storage
 1 file changed, 6 insertions(+)
