In [None]:
#Importing the required functions
import matplotlib.pyplot as plt
import csv
import pandas as pd
import sklearn
import regex as re
import numpy as np
from zipfile import ZipFile
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 
import string
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
# English stop words kept as a set; lemmatize() tests each lower-cased token against it.
# NOTE(review): assumes the NLTK 'stopwords' corpus is already available locally — TODO confirm.
stop_words=set(stopwords.words('english'))

Loading the raw data

In [2]:
path = './data/SMSSpamCollection'


def read_csv(path):
    """Read the tab-separated SMS corpus into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the raw file; each line holds a label and a message
        separated by a tab.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``label`` and ``message``. The number of rows
        is printed as a side effect.
    """
    column_names = ["label", "message"]
    # QUOTE_NONE: quote characters inside a message are kept as literal text.
    sms_frame = pd.read_csv(path, names=column_names,
                            quoting=csv.QUOTE_NONE, sep='\t')
    print("No. of rows of data", sms_frame.shape[0])
    return sms_frame

In [3]:
# Load the raw SMS corpus; read_csv also prints the number of rows it read.
raw_data = read_csv(path)

No. of rows of data 5574


In [4]:
def split_into_tokens(data):
    """Tokenize every message into its runs of word characters.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``message`` column of strings.

    Returns
    -------
    list[list[str]]
        One token list per row, in row order.
    """
    word_pattern = r"\w+"
    # Iterate over the values rather than ``data.message[i]``: integer
    # positions are only valid labels while the frame still has its default
    # 0..n-1 index, so label lookups break on filtered or shuffled frames.
    return [re.findall(word_pattern, message) for message in data.message]

In [6]:
def lemmatize(data):
    """Drop stop words and punctuation tokens, then lemmatize what remains.

    Parameters
    ----------
    data : list[list[str]]
        Tokenized messages, one list of tokens per message.

    Returns
    -------
    list[list[str]]
        For each message, the lower-cased lemmas of the tokens that were kept.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []

    for tokens in data:
        kept = []
        for token in tokens:
            lowered = token.lower()
            # Skip stop words (compared case-insensitively) and tokens that
            # are a substring of string.punctuation.
            if lowered in stop_words or token in string.punctuation:
                continue
            # Lower-case BEFORE lemmatizing: the WordNet lookup is
            # case-sensitive, so a capitalised token such as "Cars" would
            # otherwise be returned unchanged and end up as "cars", not "car".
            kept.append(str(lemmatizer.lemmatize(lowered)))
        lemmatized_words.append(kept)

    return lemmatized_words

In [7]:
#Calling the required functions for pre-processing
# Step 1: split each raw message into word tokens.
token_words = split_into_tokens(raw_data)
# Step 2: drop stop words / punctuation tokens and lemmatize the rest.
processed_words = lemmatize(token_words)

# Keep the cleaned token lists next to the original text (adds a column to raw_data in place).
raw_data['processed_message'] = processed_words

In [8]:
# Numeric copy of the label: spam -> 1.0, ham -> 0.0; any other value stays NaN.
raw_data['Label'] = raw_data.label.map({'spam': 1.0, 'ham': 0.0})

In [9]:
def data_split(data, r_seed, test_size=0.1, val_size=0.1, out_dir='./data'):
    """Split the data into train/validation/test sets and save them as CSV files.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``processed_message`` (features) and
        ``label`` (target).
    r_seed : int
        Random seed passed to both ``train_test_split`` calls.
    test_size : float, default 0.1
        Share of ``data`` held out as the test set.
    val_size : float, default 0.1
        Share of the remaining (train + validation) rows held out for
        validation.
    out_dir : str, default './data'
        Directory that receives ``train.csv``, ``validation.csv``,
        ``test.csv`` and ``raw_data.csv``; it is created when missing.

    Returns
    -------
    None
        The function only writes files.
    """
    import os  # local import keeps this cell self-contained

    # First cut: hold out the test set.
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        data.processed_message, data.label,
        test_size=test_size, random_state=r_seed)

    # Second cut: carve the validation set out of what is left.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        test_size=val_size, random_state=r_seed)

    os.makedirs(out_dir, exist_ok=True)

    # File name -> frame to write; column names encode the split they belong to.
    split_frames = {
        'train.csv': pd.DataFrame({'X_train': X_train, 'y_train': y_train}),
        'validation.csv': pd.DataFrame({'X_val': X_val, 'y_val': y_val}),
        'test.csv': pd.DataFrame({'X_test': X_test, 'y_test': y_test}),
        # The full pre-processed frame is saved alongside the splits.
        'raw_data.csv': data,
    }
    for file_name, frame in split_frames.items():
        frame.to_csv(os.path.join(out_dir, file_name), index=False)

In [10]:
#First data split: random seed 42; data_split writes the CSV files to disk
data_split(raw_data, 42)

**Tracking the data splitting using dvc**

Initialising dvc

In [11]:
# `cd ..` first: the DVC repository is initialised one directory above this notebook's folder.
!cd .. && dvc init

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


Adding all three split CSV files to DVC

In [12]:
# Put each split file under DVC tracking (one `dvc add` call per file).
!dvc add ./data/train.csv
!dvc add ./data/validation.csv
!dvc add ./data/test.csv


To track the changes with git, run:

	git add 'data\.gitignore' train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

To track the changes with git, run:

	git add 'data\.gitignore' validation.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

To track the changes with git, run:

	git add test.csv.dvc 'data\.gitignore'

To enable auto staging, run:

	dvc config core.autostage true


In [13]:
# Turn on auto staging, as suggested in the `dvc add` output.
!dvc config core.autostage true

Adding a Google Drive folder as remote data storage

In [19]:
# Register the Google Drive folder as the remote `myremote` and make it the default (--default).
!cd .. && dvc remote add --default myremote gdrive://1WM_n-19W7nOe2Pyr-3yHzAUOFyvEgPOy

Setting 'myremote' as a default remote.


In [20]:
# gdrive_acknowledge_abuse: allow downloading files that Google Drive flags as potentially abusive.
!dvc remote modify myremote gdrive_acknowledge_abuse true

Pushing dvc tracked files to remote storage

In [21]:
# Upload the DVC-tracked split files to the default remote.
!dvc push

3 files pushed


In [22]:
#2nd data split: random seed 121; overwrites the CSV files written by the first split
data_split(raw_data, 121)

In [23]:
# Compare the files on disk with what the .dvc files record; the re-split files should show up as modified.
!dvc status

test.csv.dvc:
	changed outs:
		modified:           Assignment 2\data\test.csv
train.csv.dvc:
	changed outs:
		modified:           Assignment 2\data\train.csv
validation.csv.dvc:
	changed outs:
		modified:           Assignment 2\data\validation.csv


In [24]:
# Upload the second version of the split files to the remote.
# NOTE(review): the `dvc add` / `git commit` steps that record this version are not part of
# the notebook — presumably they were run in a terminal; confirm and add them for reproducibility.
!dvc push

3 files pushed


Checking out the different versions of the data split

In [25]:
# List the commits; the hashes of the two split commits are used for the checkouts below.
!git log

commit b4e31b9f1fb6fd99a4f0f5eaee62322c65d0393f
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Tue Feb 28 00:16:57 2023 +0530

    Second Split Random Seed 121

commit 250db97be7ad21504f968454f6cf9cd55cad6bd1
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Tue Feb 28 00:07:45 2023 +0530

    First Split Random Seed 42

commit efe6d6c31d44abd0961ce32546915d7381ae2d97
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Tue Feb 28 00:07:15 2023 +0530

    dvc remote added and pushed

commit 2683509f5b8a4d5735d52e8e6ef0dba594985f9d
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Tue Feb 28 00:02:41 2023 +0530

    dvc init

commit 32d0797cf3e7baea5831db53520785188d28fe9a
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Tue Feb 28 00:01:44 2023 +0530

    dvc deleted for fresh start

commit a8cd5cea8129ba08b71b89fe78f42aa63563b417
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Mon Feb 27 23:59:51 2023 +0530

    dvc deleted for fresh start

commi

Checkout for 1st version

In [26]:
# Move git to the commit "First Split Random Seed 42" (leaves the repository in detached HEAD state).
!git checkout 250db97be7ad21504f968454f6cf9cd55cad6bd1

Note: switching to '250db97be7ad21504f968454f6cf9cd55cad6bd1'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 250db97 First Split Random Seed 42


In [27]:
# Make the data files match the .dvc files of the commit that is now checked out.
!dvc checkout

M       Assignment 2\data\train.csv
M       Assignment 2\data\test.csv
M       Assignment 2\data\validation.csv


Function to get the distribution of SMS labels in the split data

In [43]:
def label_dist(path):
    """Print how many ham and spam labels a saved split contains.

    Parameters
    ----------
    path : str
        CSV file whose second column holds the labels.
    """
    labels = pd.read_csv(path).iloc[:, 1]
    # Count exact matches for each class name, in the order they are reported.
    totals = [int((labels == name).sum()) for name in ("ham", "spam")]
    print("Ham: {}, Spam: {}".format(*totals))

For 1st Split (Random Seed: 42)

In [45]:
train_path = './data/train.csv'
val_path = './data/validation.csv'
test_path = './data/test.csv'

print("First Split (Random Seed: 42)")
# Each entry pairs the heading to print with the split file to summarise.
for heading, split_path in (
    ("\nTraining dataset:", train_path),
    ("\nValidation dataset:", val_path),
    ("\nTesting dataset:", test_path),
):
    print(heading)
    label_dist(split_path)

First Split (Random Seed: 42)

Training dataset:
Ham: 3909, Spam: 605

Validation dataset:
Ham: 440, Spam: 62

Testing dataset:
Ham: 478, Spam: 80


Checkout for 2nd Split

In [47]:
# Move git to the commit "Second Split Random Seed 121".
!git checkout b4e31b9f1fb6fd99a4f0f5eaee62322c65d0393f

Previous HEAD position was 250db97 First Split Random Seed 42
HEAD is now at b4e31b9 Second Split Random Seed 121


In [48]:
# Make the data files match the .dvc files of the commit that is now checked out.
!dvc checkout

M       Assignment 2\data\train.csv
M       Assignment 2\data\validation.csv
M       Assignment 2\data\test.csv


For 2nd Split (Random Seed: 121)

In [49]:
print("2nd Split (Random Seed: 121)")
# Same report as for the first split; the path variables come from the earlier cell.
for heading, split_path in (
    ("\nTraining dataset:", train_path),
    ("\nValidation dataset:", val_path),
    ("\nTesting dataset:", test_path),
):
    print(heading)
    label_dist(split_path)

2nd Split (Random Seed: 121)

Training dataset:
Ham: 3910, Spam: 604

Validation dataset:
Ham: 430, Spam: 72

Testing dataset:
Ham: 487, Spam: 71
