# Lecture Plan

1. Data Prep
2. Vertex AI Training
3. Vertex AI Testing

---

In [None]:
! pip install pandas nltk

In [23]:
import pandas as pd

df = pd.read_csv('C:/TrainingMaterial/generative-ai/genai-material/training-data.csv')

df.head(10)

Unnamed: 0,Text,Category
0,I'm interested in upgrading my internet plan. ...,Sales
1,I'm having trouble with my internet connection...,Service
2,What are your internet speeds like? I'm consid...,General
3,I want to know more about your internet packag...,Sales
4,My internet bill seems higher than usual this ...,Service
5,I'm moving to a new apartment next month and n...,Sales
6,I'm experiencing slow speeds with my current i...,Service
7,What are your customer service hours? I need h...,Service
8,I'm interested in upgrading my internet plan. ...,Sales
9,My internet is down again. This is the third t...,Service


In [24]:
# lowercase
df['Text'] = df['Text'].str.lower()

df.head(10)

Unnamed: 0,Text,Category
0,i'm interested in upgrading my internet plan. ...,Sales
1,i'm having trouble with my internet connection...,Service
2,what are your internet speeds like? i'm consid...,General
3,i want to know more about your internet packag...,Sales
4,my internet bill seems higher than usual this ...,Service
5,i'm moving to a new apartment next month and n...,Sales
6,i'm experiencing slow speeds with my current i...,Service
7,what are your customer service hours? i need h...,Service
8,i'm interested in upgrading my internet plan. ...,Sales
9,my internet is down again. this is the third t...,Service


In [25]:
# remove whitespaces

def remove_whitespaces(text):
    return ' '.join(text.split())

df['Text'] = df['Text'].apply(remove_whitespaces)

df.head(10)

Unnamed: 0,Text,Category
0,i'm interested in upgrading my internet plan. ...,Sales
1,i'm having trouble with my internet connection...,Service
2,what are your internet speeds like? i'm consid...,General
3,i want to know more about your internet packag...,Sales
4,my internet bill seems higher than usual this ...,Service
5,i'm moving to a new apartment next month and n...,Sales
6,i'm experiencing slow speeds with my current i...,Service
7,what are your customer service hours? i need h...,Service
8,i'm interested in upgrading my internet plan. ...,Sales
9,my internet is down again. this is the third t...,Service


In [26]:
# tokenize
import nltk

nltk.download('punkt')

from nltk import word_tokenize

df['Text'] = df['Text'].apply(word_tokenize)

df.head(10)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KrishnaGopikaUrlagan\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Text,Category
0,"[i, 'm, interested, in, upgrading, my, interne...",Sales
1,"[i, 'm, having, trouble, with, my, internet, c...",Service
2,"[what, are, your, internet, speeds, like, ?, i...",General
3,"[i, want, to, know, more, about, your, interne...",Sales
4,"[my, internet, bill, seems, higher, than, usua...",Service
5,"[i, 'm, moving, to, a, new, apartment, next, m...",Sales
6,"[i, 'm, experiencing, slow, speeds, with, my, ...",Service
7,"[what, are, your, customer, service, hours, ?,...",Service
8,"[i, 'm, interested, in, upgrading, my, interne...",Sales
9,"[my, internet, is, down, again, ., this, is, t...",Service


In [27]:
# remove stopwords

nltk.download('stopwords')

from nltk.corpus import stopwords

en_stopwords = stopwords.words('english')

def remove_stopwords(text):

    result = []

    for token in text:
        if token not in en_stopwords:
            result.append(token)

    return result

df['Text'] = df['Text'].apply(lambda x: remove_stopwords(x))

df.head(10)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KrishnaGopikaUrlagan\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Text,Category
0,"['m, interested, upgrading, internet, plan, .,...",Sales
1,"['m, trouble, internet, connection, ., keeps, ...",Service
2,"[internet, speeds, like, ?, 'm, considering, s...",General
3,"[want, know, internet, packages, ., send, info...",Sales
4,"[internet, bill, seems, higher, usual, month, ...",Service
5,"['m, moving, new, apartment, next, month, need...",Sales
6,"['m, experiencing, slow, speeds, current, inte...",Service
7,"[customer, service, hours, ?, need, help, bill...",Service
8,"['m, interested, upgrading, internet, plan, .,...",Sales
9,"[internet, ., third, time, week, ., please, fi...",Service


In [28]:
# remove punctuation

from nltk.tokenize import RegexpTokenizer

def remove_puntuations(text):

    tokenizer = RegexpTokenizer(r'\w+')
    result = tokenizer.tokenize(' '.join(text))
    return result

df['Text'] = df['Text'].apply(remove_puntuations)

df.head(10)

Unnamed: 0,Text,Category
0,"[m, interested, upgrading, internet, plan, pro...",Sales
1,"[m, trouble, internet, connection, keeps, drop...",Service
2,"[internet, speeds, like, m, considering, switc...",General
3,"[want, know, internet, packages, send, informa...",Sales
4,"[internet, bill, seems, higher, usual, month, ...",Service
5,"[m, moving, new, apartment, next, month, need,...",Sales
6,"[m, experiencing, slow, speeds, current, inter...",Service
7,"[customer, service, hours, need, help, billing...",Service
8,"[m, interested, upgrading, internet, plan, pro...",Sales
9,"[internet, third, time, week, please, fix]",Service


In [29]:
# Lemmatize

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

def lemmatize_text(text):
    result = []

    wordnet_lem = WordNetLemmatizer()

    for token, pos in pos_tag(text):
        pos = pos[0].lower()
        if pos not in ['a','n','v','r']:
            pos = 'n'
        result.append(wordnet_lem.lemmatize(token,pos))
    return result

df['Text'] = df['Text'].apply(lemmatize_text)

df.head(10)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\KrishnaGopikaUrlagan\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KrishnaGopikaUrlagan\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\KrishnaGopikaUrlagan\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,Text,Category
0,"[m, interested, upgrading, internet, plan, pro...",Sales
1,"[m, trouble, internet, connection, keep, drop]",Service
2,"[internet, speed, like, m, consider, switch, p...",General
3,"[want, know, internet, package, send, informat...",Sales
4,"[internet, bill, seem, higher, usual, month, h...",Service
5,"[m, move, new, apartment, next, month, need, s...",Sales
6,"[m, experience, slow, speed, current, internet...",Service
7,"[customer, service, hour, need, help, bill, is...",Service
8,"[m, interested, upgrading, internet, plan, pro...",Sales
9,"[internet, third, time, week, please, fix]",Service


In [30]:
# remove words with len less than 1

def remove_words(text):
    result = []
    for word in text:
        if len(word)>1:
            result.append(word)
    return result

df['Text'] = df['Text'].apply(lambda x: remove_words(x))

df.head(10)


Unnamed: 0,Text,Category
0,"[interested, upgrading, internet, plan, provid...",Sales
1,"[trouble, internet, connection, keep, drop]",Service
2,"[internet, speed, like, consider, switch, prov...",General
3,"[want, know, internet, package, send, informat...",Sales
4,"[internet, bill, seem, higher, usual, month, h...",Service
5,"[move, new, apartment, next, month, need, set,...",Sales
6,"[experience, slow, speed, current, internet, p...",Service
7,"[customer, service, hour, need, help, bill, is...",Service
8,"[interested, upgrading, internet, plan, provid...",Sales
9,"[internet, third, time, week, please, fix]",Service


In [31]:
df['Text'] = [' '.join(map(str,token)) for token in df['Text']]

df.head(10)

Unnamed: 0,Text,Category
0,interested upgrading internet plan provide option,Sales
1,trouble internet connection keep drop,Service
2,internet speed like consider switch provider,General
3,want know internet package send information,Sales
4,internet bill seem higher usual month help und...,Service
5,move new apartment next month need set interne...,Sales
6,experience slow speed current internet plan an...,Service
7,customer service hour need help bill issue,Service
8,interested upgrading internet plan provide option,Sales
9,internet third time week please fix,Service


In [32]:
df.to_csv('./training_data_c.csv', index = False, encoding='utf-8')

### References

1. [nltk](https://www.nltk.org/)
2. [pandas](https://pandas.pydata.org/docs/user_guide/10min.html)