In [1]:
#Importing the required functions
import matplotlib.pyplot as plt
import csv
import pandas as pd
import sklearn
import regex as re
import numpy as np
from zipfile import ZipFile
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 
import string
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

Loading the raw data

In [2]:
path = './SMSSpamCollection.txt'
def read_csv(path):
    """Load the tab-separated SMS file at ``path`` into a DataFrame.

    The file is read without a header row; the two columns are named
    ``label`` and ``message``. Quote characters are treated as ordinary
    text (``csv.QUOTE_NONE``), so quotes inside a message are kept as-is.

    Prints the number of rows that were read and returns the DataFrame.
    """
    column_names = ["label", "message"]
    frame = pd.read_csv(
        path,
        sep='\t',
        quoting=csv.QUOTE_NONE,
        names=column_names,
    )
    row_count = len(frame)
    print("No. of rows = ", row_count)
    return frame

In [3]:
raw_data = read_csv(path)

No. of rows =  5574


In [4]:
def split_into_tokens(data):
    """Tokenize every message in ``data`` into runs of word characters.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``message`` column holding the raw text.

    Returns
    -------
    list[list[str]]
        One token list per row, in row order. Tokens are the matches of
        ``\\w+`` (letters, digits and underscore); case is preserved.
        A missing message (None / NaN) yields an empty token list so the
        result stays aligned with the rows of ``data``.
    """
    regex = r"\w+"

    # Iterate over the values rather than indexing with ``data.message[i]``:
    # integer indexing on a Series is label-based, so it fails (or picks the
    # wrong row) whenever the frame does not carry a default 0..n-1 index,
    # e.g. after filtering or shuffling.
    return [
        [] if pd.isna(message) else re.findall(regex, message)
        for message in data.message
    ]

In [5]:
def lemmatize(data):
    """Drop stop words / punctuation and lemmatize the remaining tokens.

    Parameters
    ----------
    data : sequence of sequences of str
        Tokenized messages, one inner sequence per message.

    Returns
    -------
    list[list[str]]
        For every message, the lower-cased, lemmatized tokens that are
        neither English stop words (module-level ``stop_words``) nor made
        up solely of punctuation characters. Order is preserved and an
        empty message yields an empty list.
    """
    lemmatizer = WordNetLemmatizer()
    punctuation_chars = set(string.punctuation)
    lemmatized_words = []

    for tokens in data:
        kept = []

        for token in tokens:
            # Lower-case BEFORE lemmatizing: the WordNet lookup is
            # case-sensitive, so "Cars" would otherwise stay "cars" while
            # "cars" becomes "car", splitting one word into two features.
            lowered = token.lower()

            if lowered in stop_words:
                continue

            # Character-wise test instead of a substring test against
            # string.punctuation, so any all-punctuation token (and the
            # empty string) is skipped regardless of its length.
            if all(char in punctuation_chars for char in lowered):
                continue

            kept.append(str(lemmatizer.lemmatize(lowered)))

        lemmatized_words.append(kept)

    return lemmatized_words

In [6]:
# Tokenize every raw message, then remove stop words / punctuation and lemmatize.
token_words = split_into_tokens(raw_data)
processed_words = lemmatize(token_words)

# Store the cleaned token lists next to the raw text (one list per row).
raw_data['processed_message'] = processed_words

In [7]:
# Add a numeric target column 'Label': 1 for spam rows, 0 for ham rows.
# NOTE(review): data_split reads the lowercase string column `data.label`,
# so this capitalised 'Label' column is not what gets written to the split
# CSV files in this notebook — confirm whether it is still needed.
raw_data.loc[raw_data.label == 'spam', 'Label'] = 1
raw_data.loc[raw_data.label == 'ham', 'Label'] = 0

In [8]:
def data_split(data, r_seed, test_size=0.1, val_size=0.1, out_dir='.'):
    """Split ``data`` into train / validation / test sets and save them as CSV.

    The split is done in two steps with the same random seed: first the test
    set is carved out of the full data, then the validation set is carved out
    of what remains.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``processed_message`` column (features) and a ``label``
        column (targets).
    r_seed : int
        Seed passed as ``random_state`` to both ``train_test_split`` calls.
    test_size : float, default 0.1
        Share of ``data`` held out as the test set.
    val_size : float, default 0.1
        Share of the remaining (non-test) rows held out as the validation
        set, i.e. it is relative to the train+validation portion, not to
        the full data.
    out_dir : str, default '.'
        Directory that receives ``train.csv``, ``validation.csv`` and
        ``test.csv``. It is created if it does not exist. Existing files
        with these names are overwritten.

    Returns
    -------
    None
        The function is used for its side effect of writing the three files.
    """
    import os  # local import: only needed to build the output paths

    # Train-validation and test split
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        data.processed_message,
        data.label,
        test_size=test_size,
        random_state=r_seed,
    )

    # Train and validation split
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val,
        y_train_val,
        test_size=val_size,
        random_state=r_seed,
    )

    # Creating the split dataframes, keyed by the file they are written to
    split_frames = {
        'train.csv': pd.DataFrame({'X_train': X_train, 'y_train': y_train}),
        'validation.csv': pd.DataFrame({'X_val': X_val, 'y_val': y_val}),
        'test.csv': pd.DataFrame({'X_test': X_test, 'y_test': y_test}),
    }

    os.makedirs(out_dir, exist_ok=True)
    for file_name, frame in split_frames.items():
        frame.to_csv(os.path.join(out_dir, file_name), index=False)

In [9]:
# First data split (random seed 42); writes train.csv, validation.csv and
# test.csv into the current directory.
data_split(raw_data, 42)

**Tracking the data splits using DVC**

Initialising dvc

In [10]:
# !pip install dvc

In [10]:
!cd .. && dvc init

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


Adding all three split CSV files to DVC

In [39]:
# !git rm -r --cached 'C:/Users/lenovo/Desktop/Fourth Sem/AML/Applied-Machine-Learning/Assignment 2/train.csv'
# !git commit -m "stop tracking ./Assignment 2/train.csv"
!git rm -r --cached "C:/Users/lenovo/Desktop/Fourth Sem/AML/Applied-Machine-Learning"

rm '.dvc/.gitignore'
rm '.dvc/config'
rm '.dvcignore'
rm 'Assignment 1/SMSSpamCollection'
rm 'Assignment 1/prepare.ipynb'
rm 'Assignment 1/readme'
rm 'Assignment 1/test.csv'
rm 'Assignment 1/train.csv'
rm 'Assignment 1/train.ipynb'
rm 'Assignment 1/validation.csv'
rm 'Assignment 2/Applied-Machine-Learning'
rm 'Assignment 2/SMSSpamCollection.txt'
rm 'Assignment 2/mlruns/0/0bb59c9e978a4a34aa574156c1f77bb2/artifacts/confusion_matrix.json'
rm 'Assignment 2/mlruns/0/0bb59c9e978a4a34aa574156c1f77bb2/artifacts/estimator.html'
rm 'Assignment 2/mlruns/0/0bb59c9e978a4a34aa574156c1f77bb2/artifacts/model/MLmodel'
rm 'Assignment 2/mlruns/0/0bb59c9e978a4a34aa574156c1f77bb2/artifacts/model/conda.yaml'
rm 'Assignment 2/mlruns/0/0bb59c9e978a4a34aa574156c1f77bb2/artifacts/model/model.pkl'
rm 'Assignment 2/mlruns/0/0bb59c9e978a4a34aa574156c1f77bb2/artifacts/model/python_env.yaml'
rm 'Assignment 2/mlruns/0/0bb59c9e978a4a34aa574156c1f77bb2/artifacts/model/requirements.txt'
rm 'Assignment 2/mlruns/0/0bb59c9

In [11]:
!dvc add "./train.csv"
!dvc add ./validation.csv
!dvc add ./test.csv


To track the changes with git, run:

	git add train.csv.dvc .gitignore

To enable auto staging, run:

	dvc config core.autostage true

To track the changes with git, run:

	git add .gitignore validation.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

To track the changes with git, run:

	git add test.csv.dvc .gitignore

To enable auto staging, run:

	dvc config core.autostage true


In [12]:
!dvc config core.autostage true

Adding a Google Drive folder as remote data storage

In [13]:
!cd .. && dvc remote add --default myremote gdrive://1vdxgrHq4o9C2Ef3MyHf3DuU5Vhcs7cXr

Setting 'myremote' as a default remote.


In [14]:
!dvc remote modify myremote gdrive_acknowledge_abuse true

In [47]:
# !pip install dvc_gdrive

Collecting dvc_gdrive
  Downloading dvc_gdrive-2.19.1-py3-none-any.whl (11 kB)
Collecting pydrive2[fsspec]>=1.15.0
  Downloading PyDrive2-1.15.1-py3-none-any.whl (44 kB)
     ---------------------------------------- 44.6/44.6 kB 2.1 MB/s eta 0:00:00
Collecting google-api-python-client>=1.12.5
  Downloading google_api_python_client-2.81.0-py2.py3-none-any.whl (11.1 MB)
     ---------------------------------------- 11.1/11.1 MB 1.3 MB/s eta 0:00:00
Collecting pyOpenSSL>=19.1.0
  Downloading pyOpenSSL-23.0.0-py3-none-any.whl (57 kB)
     -------------------------------------- 57.3/57.3 kB 747.1 kB/s eta 0:00:00
Collecting oauth2client>=4.0.0
  Downloading oauth2client-4.1.3-py2.py3-none-any.whl (98 kB)
     ---------------------------------------- 98.2/98.2 kB 1.9 MB/s eta 0:00:00
Collecting uritemplate<5,>=3.0.1
  Downloading uritemplate-4.1.1-py2.py3-none-any.whl (10 kB)
Collecting google-auth<3.0.0dev,>=1.19.0
  Downloading google_auth-2.16.2-py2.py3-none-any.whl (177 kB)
     --------


[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Pushing DVC-tracked files to remote storage

In [15]:
!dvc push

Everything is up to date.


In [50]:
# Second data split (random seed 121). This overwrites the train.csv,
# validation.csv and test.csv files produced by the first split.
# NOTE(review): the recorded outputs below show `dvc status` reporting the
# files as modified and `dvc push` reporting "Everything is up to date" —
# presumably the new versions must be re-added (`dvc add`) and committed
# before pushing; confirm.
data_split(raw_data, 121)

In [51]:
!dvc status

test.csv.dvc:
	changed outs:
		modified:           Assignment 2\test.csv
train.csv.dvc:
	changed outs:
		modified:           Assignment 2\train.csv
validation.csv.dvc:
	changed outs:
		modified:           Assignment 2\validation.csv


In [52]:
!dvc push

Everything is up to date.


Checking out the different versions of the data split

In [53]:
!git log

commit 95e40dc40b800d852e3664c6f178331b14ecebd4
Author: Krishna Gupta <krishnagupta1602@gmail.com>
Date:   Tue Mar 21 14:40:34 2023 +0530

    stop tracking ./train.csv

commit 5c7193b63db8f2b07ae15a901ca1c0f971d11f4f
Author: Krishna Gupta <krishnagupta1602@gmail.com>
Date:   Tue Mar 21 14:31:37 2023 +0530

    commit

commit 58fb8476d8f180546c49aeae80b539ddda167bb1
Author: Krishna Gupta <krishnagupta1602@gmail.com>
Date:   Tue Mar 14 17:03:04 2023 +0530

    stop tracking Assignment 2\train.csv

commit 4c95985deca93fd26e159297da9185a73866e66f
Author: Krishna Gupta <krishnagupta1602@gmail.com>
Date:   Tue Mar 14 16:50:31 2023 +0530

    Assignment2_1st_commit

commit 0396b49fc34177868f25ea3cfb3ea9b02e0419a4
Author: krishnagupta1602 <111596704+krishnagupta1602@users.noreply.github.com>
Date:   Tue Mar 14 15:18:37 2023 +0530

    Delete Assignment 2 directory

commit 25f97b3a030eefa5be7e3e920a0b8f37a2312448
Author: krishnagupta1602 <111596704+krishnagupta1602@users.noreply.github.com>
Da

Checkout for 1st version

In [26]:
!git checkout 250db97be7ad21504f968454f6cf9cd55cad6bd1

Note: switching to '250db97be7ad21504f968454f6cf9cd55cad6bd1'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 250db97 First Split Random Seed 42


In [27]:
!dvc checkout

M       Assignment 2\data\train.csv
M       Assignment 2\data\test.csv
M       Assignment 2\data\validation.csv


Function to get the distribution of SMS labels in the split data

In [43]:
def label_dist(path):
    """Print how many ham and spam labels the CSV file at ``path`` contains.

    The labels are taken from the second column of the file (position 1),
    whatever its header is. Only exact matches of the strings "ham" and
    "spam" are counted. Nothing is returned; the counts are printed.
    """
    labels = pd.read_csv(path).iloc[:, 1]

    # Count exact matches per class with a boolean mask.
    totals = {name: int((labels == name).sum()) for name in ("ham", "spam")}

    print("Ham: {}, Spam: {}".format(totals["ham"], totals["spam"]))

For 1st Split (Random Seed: 42)

In [45]:
# Locations of the three split files that are inspected below.
# NOTE(review): data_split in this notebook writes to ./train.csv etc., while
# these paths point into ./data/ — presumably the checked-out commit keeps
# the files under data/; confirm the paths match the current layout.
train_path = './data/train.csv'
val_path = './data/validation.csv'
test_path = './data/test.csv'

# Report the ham/spam counts of each file for the checked-out version.
print("First Split (Random Seed: 42)")
print("\nTraining dataset:")
label_dist(train_path)
print("\nValidation dataset:")
label_dist(val_path)
print("\nTesting dataset:")
label_dist(test_path)

First Split (Random Seed: 42)

Training dataset:
Ham: 3909, Spam: 605

Validation dataset:
Ham: 440, Spam: 62

Testing dataset:
Ham: 478, Spam: 80


Checkout for 2nd Split

In [47]:
!git checkout b4e31b9f1fb6fd99a4f0f5eaee62322c65d0393f

Previous HEAD position was 250db97 First Split Random Seed 42
HEAD is now at b4e31b9 Second Split Random Seed 121


In [48]:
!dvc checkout

M       Assignment 2\data\train.csv
M       Assignment 2\data\validation.csv
M       Assignment 2\data\test.csv


For 2nd Split (Random Seed: 121)

In [49]:
# Report the ham/spam counts again after checking out the second version.
# Reuses label_dist and the train_path / val_path / test_path variables
# defined in earlier cells.
print("2nd Split (Random Seed: 121)")
print("\nTraining dataset:")
label_dist(train_path)
print("\nValidation dataset:")
label_dist(val_path)
print("\nTesting dataset:")
label_dist(test_path)

2nd Split (Random Seed: 121)

Training dataset:
Ham: 3910, Spam: 604

Validation dataset:
Ham: 430, Spam: 72

Testing dataset:
Ham: 487, Spam: 71
