In [0]:
! /databricks/python/bin/pip install nltk

In [0]:
! /databricks/python/bin/python -m nltk.downloader stopwords

In [0]:
! /databricks/python/bin/python -m nltk.downloader punkt

In [0]:
! /databricks/python/bin/python -m nltk.downloader wordnet

In [0]:
dbutils.library.installPyPI("fuzzywuzzy")
dbutils.library.installPyPI("nltk")
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 
import nltk
from pyspark.sql.functions import *
from nltk.corpus import stopwords
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import re

In [0]:
# Reference table of customer-service categories ('Categroy') and example 'Questions'.
df_types = (
    spark.table("irumdb.cs_types_list")
    # drop rows that arrive without a question
    .filter("Questions IS NOT NULL")
    # turn the stray '14 Day trial ' entry in the Questions column into a null ...
    .na.replace(['14 Day trial '], [None], 'Questions')
    # ... and drop the rows that were just nulled
    .filter("Questions IS NOT NULL")
)

# Disabled alternative: one row per category with the set of its questions.
# df_types_agg = df_types.groupby('Categroy').agg(collect_set('Questions')).sort(['Categroy'],ascending=True)
# df_types_pd=df_types_agg.toPandas()
# df_types_pd.rename(columns={"collect_set(Questions)":"list_of_questions"},inplace=True)

df_types.show()

In [0]:
# Render the filtered category/question table as the cell's rich output.
display(df_types)

Categroy,Questions
14 day trial,Can I try before I buy?
14 day trial,How can I try the offers?
14 day trial,How can I try the app?
14 day trial,Can I try first?
14 day trial,Can I try an offer before I buy?
14 day trial,I want a free trial
14 day trial,I want to try first
14 day trial,Why can't I access the 14-day trial?
14 day trial,I cant access the free trial?
14 day trial,The free trial isn't working


In [0]:
# Characters that clean_text turns into a single space (punctuation, tab, newline).
_PUNCTUATION_FILTERS = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
# Built once at definition time instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(_PUNCTUATION_FILTERS, " " * len(_PUNCTUATION_FILTERS))


def clean_text(text):
    """
    Applies some pre-processing on the given text.

    Steps :
    - Removing HTML tags (anything between ``<`` and ``>`` on one line)
    - Deleting backslashes, single quotes and double quotes
    - Lowering text
    - Replacing the remaining punctuation, tabs and newlines with spaces
    - Collapsing runs of whitespace and trimming both ends

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        Lower-cased text whose words are separated by exactly one space,
        with no leading or trailing whitespace.
    """
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # remove the characters [\], ['] and ["] outright, so "can't" becomes "cant"
    text = re.sub(r"[\\'\"]", "", text)

    # lowercase, then replace punctuation characters with spaces
    text = text.lower().translate(_PUNCTUATION_TO_SPACE)

    # Normalise whitespace last: the punctuation replacement above introduces
    # spaces (e.g. for a trailing "?"), so trimming before it is not enough.
    return " ".join(text.split())

In [0]:
# Collect the Spark DataFrame into a local pandas DataFrame; the text
# processing in the following cells works on this pandas copy.
df_types_pd = df_types.toPandas()
# Preview the first rows.
df_types_pd.head()

Unnamed: 0,Categroy,Questions
0,14 day trial,Can I try before I buy?
1,14 day trial,How can I try the offers?
2,14 day trial,How can I try the app?
3,14 day trial,Can I try first?
4,14 day trial,Can I try an offer before I buy?


In [0]:
# Store a normalised copy of every question (see clean_text) in 'cleaned_text'.
df_types_pd['cleaned_text'] = df_types_pd['Questions'].apply(clean_text)
df_types_pd.head()

Unnamed: 0,Categroy,Questions,cleaned_text
0,14 day trial,Can I try before I buy?,can i try before i buy
1,14 day trial,How can I try the offers?,how can i try the offers
2,14 day trial,How can I try the app?,how can i try the app
3,14 day trial,Can I try first?,can i try first
4,14 day trial,Can I try an offer before I buy?,can i try an offer before i buy


In [0]:
stop = stopwords.words('english')
# Set copy used only for membership tests: looking a token up in a list scans
# the list for every token. `stop` itself is left as the original list.
_stop_lookup = frozenset(stop)


def remove_stopwords(text):
    """Return `text` without the whitespace-separated tokens that appear in `stop`.

    The kept tokens are re-joined with single spaces, in their original order.
    """
    return " ".join(token for token in text.split() if token not in _stop_lookup)


df_types_pd['stopwords_removed'] = df_types_pd['cleaned_text'].apply(remove_stopwords)
df_types_pd.head()

Unnamed: 0,Categroy,Questions,cleaned_text,stopwords_removed
0,14 day trial,Can I try before I buy?,can i try before i buy,try buy
1,14 day trial,How can I try the offers?,how can i try the offers,try offers
2,14 day trial,How can I try the app?,how can i try the app,try app
3,14 day trial,Can I try first?,can i try first,try first
4,14 day trial,Can I try an offer before I buy?,can i try an offer before i buy,try offer buy


In [0]:
from nltk.stem.wordnet import WordNetLemmatizer

# Small demo of verb lemmatization.
lem = WordNetLemmatizer()
word = "please I am including"
# `lemmatize` takes a single word, so the phrase is split on whitespace and
# each token is lemmatized as a verb ("v") before being joined back together.
print("Lemmatized Word:", " ".join(lem.lemmatize(token, "v") for token in word.split()))

In [0]:
# Shared helpers for the lemmatize_text / stem_text functions defined below:
# an NLTK whitespace tokenizer and a WordNet lemmatizer.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [0]:
def lemmatize_text(text):
    """Tokenize `text` with `w_tokenizer` and return the verb lemma of each token as a list."""
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, 'v'))
    return lemmas


# Lemmatize the stopword-free questions; every cell of the new column is a list of lemmas.
df_types_pd['text_lemmatized'] = df_types_pd['stopwords_removed'].apply(lemmatize_text)
df_types_pd.head()

Unnamed: 0,Categroy,Questions,cleaned_text,stopwords_removed,text_lemmatized,text_stemmed
0,14 day trial,Can I try before I buy?,can i try before i buy,try buy,"[try, buy]",tri buy
1,14 day trial,How can I try the offers?,how can i try the offers,try offers,"[try, offer]",tri offer
2,14 day trial,How can I try the app?,how can i try the app,try app,"[try, app]",tri app
3,14 day trial,Can I try first?,can i try first,try first,"[try, first]",tri first
4,14 day trial,Can I try an offer before I buy?,can i try an offer before i buy,try offer buy,"[try, offer, buy]",tri offer buy


In [0]:
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer, used by stem_text below.
stemmer = SnowballStemmer("english")
# Quick check of the stemmer on a single word.
print(stemmer.stem("Blessing"))

In [0]:
def stem_text(text):
    """Tokenize `text` with `w_tokenizer` and return the stem of each token as a list."""
    stems = []
    for token in w_tokenizer.tokenize(text):
        stems.append(stemmer.stem(token))
    return stems

In [0]:
# Stem the stopword-free questions; every cell of the new column is a list of stems.
df_types_pd['text_stemmed'] = df_types_pd['stopwords_removed'].apply(stem_text)
df_types_pd.head()

Unnamed: 0,Categroy,Questions,cleaned_text,stopwords_removed,text_lemmatized,text_stemmed
0,14 day trial,Can I try before I buy?,can i try before i buy,try buy,"[try, buy]","[tri, buy]"
1,14 day trial,How can I try the offers?,how can i try the offers,try offers,"[try, offer]","[tri, offer]"
2,14 day trial,How can I try the app?,how can i try the app,try app,"[try, app]","[tri, app]"
3,14 day trial,Can I try first?,can i try first,try first,"[try, first]","[tri, first]"
4,14 day trial,Can I try an offer before I buy?,can i try an offer before i buy,try offer buy,"[try, offer, buy]","[tri, offer, buy]"


In [0]:
#df_types_pd.tail(50)

In [0]:
def join_tokens(tokens):
    """Join an iterable of tokens into one space-separated string.

    A value that is already a string is returned unchanged, so re-running this
    cell does not insert a space between every character of text that was
    joined on a previous run.
    """
    if isinstance(tokens, str):
        return tokens
    return " ".join(tokens)


# Turn each list of lemmas back into a single string.
df_types_pd['text_lemmatized'] = df_types_pd['text_lemmatized'].apply(join_tokens)
#df_types_pd['text_stemmed']=df_types_pd['text_stemmed'].apply(lambda x: " ".join(a for a in x))
df_types_pd.head()

Unnamed: 0,Categroy,Questions,cleaned_text,stopwords_removed,text_lemmatized,text_stemmed
0,14 day trial,Can I try before I buy?,can i try before i buy,try buy,try buy,tri buy
1,14 day trial,How can I try the offers?,how can i try the offers,try offers,try offer,tri offer
2,14 day trial,How can I try the app?,how can i try the app,try app,try app,tri app
3,14 day trial,Can I try first?,can i try first,try first,try first,tri first
4,14 day trial,Can I try an offer before I buy?,can i try an offer before i buy,try offer buy,try offer buy,tri offer buy


In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# Corpus the vocabulary is learned from, and one raw sentence to vectorize as a demo.
training_texts = df_types_pd['text_lemmatized']
test_texts = ['how can i try the app']

# Bag-of-words vectorizer: every document is passed through clean_text and
# scikit-learn's built-in English stop words are skipped.
vectorizer = CountVectorizer(preprocessor=clean_text, stop_words="english")

# learn the vocabulary from the training corpus
vectorizer.fit(training_texts)

# vocabulary_ maps term -> column index; invert it and list the terms in column order
inv_vocab = {column: term for term, column in vectorizer.vocabulary_.items()}
vocabulary = [term for _, term in sorted(inv_vocab.items())]

# vectorization example: token counts of the test sentence, one column per vocabulary term
test_matrix = vectorizer.transform(test_texts).toarray()
pd.DataFrame(test_matrix, index=["test sentence"], columns=vocabulary)

Unnamed: 0,14,2019,2020,241,25,able,abu,accept,access,accident,account,acivate,activate,activation,active,add,address,adrenaline,advise,alcohol,allow,allowance,amend,answer,anymore,app,appear,apply,apps,arent,arrive,assist,available,availble,away,balance,beauty,benefiit,benefit,better,...,track,travel,trial,try,turn,typo,unable,unlock,update,url,use,user,username,valid,validation,validity,value,ve,vendor,verification,verify,version,view,vip,voucher,vouchers,vouchr,wan,want,wasn,way,ways,website,whats,wiating,wifes,wifi,work,wrong,year
test sentence,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# Dense token-count matrix for the training texts, produced by the fitted vectorizer.
data=vectorizer.transform(training_texts).toarray()
# Show the array as the cell output.
data