* track the versions of data using dvc
* load the raw data into raw_data.csv and save the split data into train.csv/validation.csv/test.csv
* update train/validation/test split by choosing different random seed
* check out the first version (before the update) using dvc and print the distribution of the target variable (number of 0s and number of 1s) in train.csv, validation.csv, and test.csv


In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Location of the raw SMS dataset, relative to the notebook's working directory.
FILE_PATH = 'sms_spam_collection/SMSSpamCollection'

In [4]:
# The raw file is tab-separated with no header row, so column names are supplied here.
# NOTE(review): with pandas' default quoting, a message containing an unbalanced
# double quote can swallow the following line(s) into one field — confirm the row
# count against the raw file, and consider quoting=csv.QUOTE_NONE if they differ.
df = pd.read_csv(FILE_PATH, sep='\t', names=['label', 'message'])

In [5]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [6]:
# Fetch the NLTK resources used by preprocess_text below: tokenizer models
# (punkt / punkt_tab), the stop-word lists, and the WordNet data for lemmatization.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/turing/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/turing/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /home/turing/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/turing/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Built once for the whole notebook instead of once per token / per message:
# stopwords.words() re-reads the corpus file on every call and returns a list,
# so the original per-token lookup was both an I/O hit and a linear scan.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one SMS message into a space-separated string of lemmas.

    Steps, in order: lowercase the text, remove every run of digits, strip the
    characters in ``string.punctuation``, tokenize with NLTK's ``word_tokenize``,
    drop English stop words, and lemmatize each remaining token with
    ``WordNetLemmatizer``.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The surviving lemmatized tokens joined by single spaces (an empty
        string when no token survives).
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(_PUNCTUATION_TABLE)
    # Set membership gives the same result as the original list lookup, only faster.
    tokens = [word for word in word_tokenize(text) if word not in _STOP_WORDS]
    return ' '.join(_LEMMATIZER.lemmatize(token) for token in tokens)

In [8]:
# Keep the raw text in 'message' and store the cleaned version in a new column.
df['preprocessed_message'] = df['message'].apply(preprocess_text)

In [9]:
# Spot-check raw vs. preprocessed text on 10 random rows (no random_state is
# passed, so a different set of rows is shown on each run).
df.sample(10)

Unnamed: 0,label,message,preprocessed_message
2869,ham,"Aight, tomorrow around &lt;#&gt; it is",aight tomorrow around ltgt
2643,ham,"They can try! They can get lost, in fact. Tee hee",try get lost fact tee hee
336,ham,"Ta-Daaaaa! I am home babe, are you still up ?",tadaaaaa home babe still
1168,ham,Lol now I'm after that hot air balloon!,lol im hot air balloon
1643,ham,Sleeping nt feeling well,sleeping nt feeling well
295,ham,I accidentally deleted the message. Resend ple...,accidentally deleted message resend please
1985,spam,Urgent! Please call 09061743810 from landline....,urgent please call landline abta complimentary...
5222,ham,5 nights...We nt staying at port step liao...T...,nightswe nt staying port step liaotoo ex
3503,ham,I will come to ur home now,come ur home
2566,ham,I told her I had a Dr appt next week. She thin...,told dr appt next week think im gon na die tol...


In [10]:
# Encode the target as integers: ham -> 0, spam -> 1.
# Guarded so the cell is safe to re-run: mapping an already-encoded column a
# second time finds no 'ham'/'spam' keys and would silently turn every label
# into NaN.
if not pd.api.types.is_integer_dtype(df['label']):
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [11]:
# One-time setup: initialise DVC in this directory. --subdir is used because the
# notebook's folder sits below the git repository root rather than at it.
!dvc init --subdir

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

## Version 1

In [12]:
from sklearn.model_selection import train_test_split

# Two-stage split into train/validation/test: hold out 20% of all rows for
# testing, then take 25% of the remaining 80% for validation, which gives
# roughly a 60/20/20 partition.
SPLIT_SEED_V1 = 42
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED_V1)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=SPLIT_SEED_V1)

In [14]:
# Fraction of each label (0 = ham, 1 = spam) within every split.
for heading, frame in (("Train : ", train_df), ("Validation : ", val_df), ("Test : ", test_df)):
    print(heading, frame['label'].value_counts() / len(frame))

Train :  label
0    0.866547
1    0.133453
Name: count, dtype: float64
Validation :  label
0    0.863677
1    0.136323
Name: count, dtype: float64
Test :  label
0    0.866368
1    0.133632
Name: count, dtype: float64


In [22]:
# store the splits at train.csv/validation.csv/test.csv
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise this cell fails with an OSError on a fresh
# checkout where processed_data/ is absent.
Path('processed_data').mkdir(parents=True, exist_ok=True)

# Tab-separated and without the index column, matching how the files are read back later.
train_df.to_csv('processed_data/train.csv', index=False, sep='\t')
val_df.to_csv('processed_data/validation.csv', index=False, sep='\t')
test_df.to_csv('processed_data/test.csv', index=False, sep='\t')

In [23]:
# Put each split under DVC control. Per the recorded output, every `dvc add`
# produces a `<file>.dvc` pointer (plus a processed_data/.gitignore entry) that
# is meant to be committed to git in place of the data itself.
!dvc add processed_data/train.csv
!dvc add processed_data/validation.csv
!dvc add processed_data/test.csv


[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in processed_data/train.csv |0.00 [00:00, [A
                                                                                [A
![A
  0% Checking cache in '/mnt/c/Users/saisa/AppliedMachineLearning/assignment_2/.[A
                                                                                [A
![A
  0%|          |Checking out /mnt/c/Users/saisa/Applie0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00,  6.43file/s][A

To track the changes with git, run:

	git add processed_data/train.csv.dvc processed_data/.gitignore

To enable auto staging, run:

	dvc config core.autostage true
[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                

In [24]:
# Commit the DVC pointer files (not the CSVs) and tag the commit so version 1
# can be checked out by name later.
# NOTE(review): the recorded output shows this commit landing on a detached
# HEAD, so it is reachable only through the tag — consider committing on a branch.
!git add processed_data/train.csv.dvc
!git add processed_data/validation.csv.dvc
!git add processed_data/test.csv.dvc
!git add processed_data/.gitignore
!git commit -m "version 1"
!git tag processed-data-v1


[detached HEAD 3ffe721] version 1
 4 files changed, 21 insertions(+)
 create mode 100644 assignment_2/processed_data/.gitignore
 create mode 100644 assignment_2/processed_data/test.csv.dvc
 create mode 100644 assignment_2/processed_data/train.csv.dvc
 create mode 100644 assignment_2/processed_data/validation.csv.dvc


In [27]:
# Move git to the version-1 tag, then have DVC sync the workspace data files
# with the .dvc pointers recorded at that commit.
!git checkout processed-data-v1
!dvc checkout

D	assignment_2/.python-version
M	assignment_2/prepare.ipynb
D	assignment_2/pyproject.toml
D	assignment_2/uv.lock
HEAD is now at 3ffe721 version 1
Building workspace index                              |8.00 [00:00,  150entry/s]
Comparing indexes                                    |9.00 [00:00, 2.96kentry/s]
Applying changes                                      |0.00 [00:00,     ?file/s]
[0m

In [28]:
# Read the splits back from disk to confirm the checked-out files match what was saved.
split_paths = ('processed_data/train.csv', 'processed_data/validation.csv', 'processed_data/test.csv')
train_data, val_data, test_data = (pd.read_csv(path, sep='\t') for path in split_paths)

# Fraction of each label (0 = ham, 1 = spam) within every split.
for heading, frame in (("Train : ", train_data), ("Validation : ", val_data), ("Test : ", test_data)):
    print(heading, frame['label'].value_counts() / len(frame))

Train :  label
0    0.866547
1    0.133453
Name: count, dtype: float64
Validation :  label
0    0.863677
1    0.136323
Name: count, dtype: float64
Test :  label
0    0.866368
1    0.133632
Name: count, dtype: float64


## Version 2

In [29]:
# Version 2: the same two-stage 60/20/20 split as version 1, re-drawn with a
# different random seed so the rows land in different partitions.
SPLIT_SEED_V2 = 43
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED_V2)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=SPLIT_SEED_V2)

# Fraction of each label (0 = ham, 1 = spam) within every new split.
for heading, frame in (("Train : ", train_df), ("Validation : ", val_df), ("Test : ", test_df)):
    print(heading, frame['label'].value_counts() / len(frame))

Train :  label
0    0.869539
1    0.130461
Name: count, dtype: float64
Validation :  label
0    0.867265
1    0.132735
Name: count, dtype: float64
Test :  label
0    0.853812
1    0.146188
Name: count, dtype: float64


In [30]:
# Overwrite the tracked CSVs with the version-2 splits (tab-separated, no index column).
for frame, csv_path in (
    (train_df, 'processed_data/train.csv'),
    (val_df, 'processed_data/validation.csv'),
    (test_df, 'processed_data/test.csv'),
):
    frame.to_csv(csv_path, index=False, sep='\t')

In [31]:
# Re-add the overwritten files so DVC records the new content and updates the
# existing .dvc pointer files.
!dvc add processed_data/train.csv
!dvc add processed_data/validation.csv
!dvc add processed_data/test.csv



[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in processed_data/train.csv |0.00 [00:00, [A
                                                                                [A
![A
  0% Checking cache in '/mnt/c/Users/saisa/AppliedMachineLearning/assignment_2/.[A
                                                                                [A
![A
  0%|          |Adding processed_data/train.csv to cac0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /mnt/c/Users/saisa/Applie0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00,  4.55file/s][A

To track the changes with git, run:

	git add processed_data/train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true
[?25l 

In [32]:
# Commit the updated pointer files and tag the commit as version 2.
# NOTE(review): as with version 1, the recorded output shows a detached HEAD,
# so this commit is reachable only through its tag.
!git add processed_data/train.csv.dvc
!git add processed_data/validation.csv.dvc
!git add processed_data/test.csv.dvc
!git add processed_data/.gitignore
!git commit -m "version 2"
!git tag processed-data-v2


[detached HEAD 8fea163] version 2
 3 files changed, 6 insertions(+), 6 deletions(-)


In [33]:
# Move git to the version-2 tag and let DVC sync the data files with its pointers.
!git checkout processed-data-v2
!dvc checkout

D	assignment_2/.python-version
M	assignment_2/prepare.ipynb
D	assignment_2/pyproject.toml
D	assignment_2/uv.lock
HEAD is now at 8fea163 version 2
Building workspace index                              |8.00 [00:00,  180entry/s]
Comparing indexes                                    |9.00 [00:00, 3.18kentry/s]
Applying changes                                      |0.00 [00:00,     ?file/s]
[0m

In [34]:
# Read the version-2 splits back from disk to confirm they match what was just saved.
split_paths = ('processed_data/train.csv', 'processed_data/validation.csv', 'processed_data/test.csv')
train_data, val_data, test_data = (pd.read_csv(path, sep='\t') for path in split_paths)

# Fraction of each label (0 = ham, 1 = spam) within every split.
for heading, frame in (("Train : ", train_data), ("Validation : ", val_data), ("Test : ", test_data)):
    print(heading, frame['label'].value_counts() / len(frame))

Train :  label
0    0.869539
1    0.130461
Name: count, dtype: float64
Validation :  label
0    0.867265
1    0.132735
Name: count, dtype: float64
Test :  label
0    0.853812
1    0.146188
Name: count, dtype: float64


## Load versions and check class distribution

### Version 1

In [35]:
# Go back to version 1: git restores the old .dvc pointers, and `dvc checkout`
# then replaces the three CSVs in the workspace (shown as modified in the output).
!git checkout processed-data-v1
!dvc checkout

D	assignment_2/.python-version
M	assignment_2/prepare.ipynb
D	assignment_2/pyproject.toml
D	assignment_2/uv.lock
Previous HEAD position was 8fea163 version 2
HEAD is now at 3ffe721 version 1
Building workspace index                              |8.00 [00:00, 88.2entry/s]
Comparing indexes                                    |9.00 [00:00, 1.62kentry/s]
Applying changes                                      |3.00 [00:00,  8.60file/s]
[33mM[0m       processed_data/validation.csv
[33mM[0m       processed_data/test.csv
[33mM[0m       processed_data/train.csv
[0m

In [36]:
# Load the version-1 splits restored by the checkout in the previous cell.
split_paths = ('processed_data/train.csv', 'processed_data/validation.csv', 'processed_data/test.csv')
train_data, val_data, test_data = (pd.read_csv(path, sep='\t') for path in split_paths)

# Fraction of each label (0 = ham, 1 = spam) within every split.
# NOTE(review): the task list at the top of the notebook asks for the number of
# 0s and 1s; this prints fractions. Drop the `/ len(frame)` if raw counts are required.
for heading, frame in (("Train : ", train_data), ("Validation : ", val_data), ("Test : ", test_data)):
    print(heading, frame['label'].value_counts() / len(frame))


Train :  label
0    0.866547
1    0.133453
Name: count, dtype: float64
Validation :  label
0    0.863677
1    0.136323
Name: count, dtype: float64
Test :  label
0    0.866368
1    0.133632
Name: count, dtype: float64


### Version 2

In [37]:
# Switch forward to version 2 again; `dvc checkout` swaps the three CSVs back
# to the seed-43 splits (shown as modified in the output).
!git checkout processed-data-v2
!dvc checkout

D	assignment_2/.python-version
M	assignment_2/prepare.ipynb
D	assignment_2/pyproject.toml
D	assignment_2/uv.lock
Previous HEAD position was 3ffe721 version 1
HEAD is now at 8fea163 version 2
Building workspace index                              |8.00 [00:00, 79.0entry/s]
Comparing indexes                                    |9.00 [00:00, 2.00kentry/s]
Applying changes                                      |3.00 [00:00,  6.33file/s]
[33mM[0m       processed_data/train.csv
[33mM[0m       processed_data/validation.csv
[33mM[0m       processed_data/test.csv
[0m

In [38]:
# Load the version-2 splits restored by the checkout in the previous cell.
split_paths = ('processed_data/train.csv', 'processed_data/validation.csv', 'processed_data/test.csv')
train_data, val_data, test_data = (pd.read_csv(path, sep='\t') for path in split_paths)

# Fraction of each label (0 = ham, 1 = spam) within every split.
for heading, frame in (("Train : ", train_data), ("Validation : ", val_data), ("Test : ", test_data)):
    print(heading, frame['label'].value_counts() / len(frame))

Train :  label
0    0.869539
1    0.130461
Name: count, dtype: float64
Validation :  label
0    0.867265
1    0.132735
Name: count, dtype: float64
Test :  label
0    0.853812
1    0.146188
Name: count, dtype: float64


In [40]:
# Show the commit history to confirm where the two processed-data tags point.
!git log --oneline

[33m8fea163[m[33m ([m[1;36mHEAD[m[33m, [m[1;33mtag: processed-data-v2[m[33m)[m version 2
[33m3ffe721[m[33m ([m[1;33mtag: processed-data-v1[m[33m)[m version 1
[33md2cbeb8[m[33m ([m[1;33mtag: data-v1[m[33m, [m[1;32mmain[m[33m)[m version 1
[33m7dc0db7[m version 1
[33m8819bd3[m[33m ([m[1;31morigin/main[m[33m, [m[1;31morigin/HEAD[m[33m)[m initial data
[33m7f0b1ac[m[33m ([m[1;33mtag: v2[m[33m)[m version 2
[33m8f75402[m[33m ([m[1;33mtag: v1[m[33m)[m version 1
[33mafb6c85[m version 2
[33mb70cab0[m version 1
[33m66394b3[m version 2
[33m9e0eac3[m version 1
[33m670633b[m version 2
[33ma1fc932[m version 1
[33mf960fd8[m Add data splits version 1 and 2
[33m580b185[m train and measure metrics on a bunch of models
[33m2ea28b1[m load and preprocess data
[33m139493d[m Initial commit


In [41]:
# NOTE(review): `git checkout` with no branch or tag only lists local changes
# (see the output) and leaves HEAD detached at processed-data-v2. Presumably
# `git checkout main` was intended here — confirm.
!git checkout

D	assignment_2/.python-version
M	assignment_2/prepare.ipynb
D	assignment_2/pyproject.toml
D	assignment_2/uv.lock
