In [0]:
! /databricks/python/bin/pip install nltk

In [0]:
! /databricks/python/bin/python -m nltk.downloader stopwords

In [0]:
! /databricks/python/bin/python -m nltk.downloader punkt

In [0]:
dbutils.library.installPyPI("fuzzywuzzy")
dbutils.library.installPyPI("nltk")
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 
import nltk
from pyspark.sql.functions import *
from nltk.corpus import stopwords
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [0]:
def compute_match(x,list_of_questions):
    if(len(list_of_questions)>0):
        result_list=[(fuzz.token_set_ratio(each,x),each) for each in list_of_questions]
        best_value=sorted(result_list, key=lambda x: x[0],reverse=True)
        return best_value[0]

In [0]:
df_types = spark.table("irumdb.cs_types_list")
df_types=df_types.filter(df_types.Questions.isNotNull())
df_types=df_types.na.replace(['14 Day trial '], [None], 'Questions')
df_types=df_types.filter(df_types.Questions.isNotNull())

# df_types_agg = df_types.groupby('Categroy').agg(collect_set('Questions')).sort(['Categroy'],ascending=True)
# df_types_pd=df_types_agg.toPandas()
# df_types_pd.rename(columns={"collect_set(Questions)":"list_of_questions"},inplace=True)

df_types.show()

In [0]:
df_types_pd = df_types.toPandas()
df_types_pd['lower_questions'] = df_types_pd['Questions'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df_types_pd['lower_questions'] = df_types_pd['lower_questions'].str.replace('[^\w\s]','')
stop = stopwords.words('english')
df_types_pd['stopwords_removed'] = df_types_pd['lower_questions'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df_types_pd['word_tokens'] = df_types_pd['stopwords_removed'].apply(lambda x: (word_tokenize(x)))
df_types_pd.head()

Unnamed: 0,Categroy,Questions,lower_questions,stopwords_removed,word_tokens
0,14 day trial,Can I try before I buy?,can i try before i buy,try buy,"[try, buy]"
1,14 day trial,How can I try the offers?,how can i try the offers,try offers,"[try, offers]"
2,14 day trial,How can I try the app?,how can i try the app,try app,"[try, app]"
3,14 day trial,Can I try first?,can i try first,try first,"[try, first]"
4,14 day trial,Can I try an offer before I buy?,can i try an offer before i buy,try offer buy,"[try, offer, buy]"


In [0]:
df_types_sp=spark.createDataFrame(df_types_pd)
df_types_agg = df_types_sp.groupby('Categroy').agg(collect_set('stopwords_removed')).sort(['Categroy'],ascending=True)
df_types_agg.show()

In [0]:
# retrieve Chats
df_chats= spark.table("live_admin_chat.chatmessages")
df_chats_pd=df_chats.toPandas()

# taking first 100
new_df=df_chats_pd[["ChatId","MessageText","MessageBy"]][:100].groupby(["ChatId","MessageBy"],as_index=False).aggregate(lambda x: list(x))
new_df.head()

Unnamed: 0,ChatId,MessageBy,MessageText
0,51902,Cheryl,[Do I still have you with me? ]
1,51903,Cheryl,"[Hi, welcome to the ENTERTAINER Live Chat. How..."
2,51903,Haroon,[قبل ان نبدأ، هل لي بمعرفة اسمك الكامل وعنوان ...
3,51903,Visitor2178963,"[انا نسيت كلمه السر, حمد الخروصي, Fox-hamoody@..."
4,51903,Visitor2249-2178963CID51903,[[NAVIGATEURL] Visitor browsing : https://hub....


In [0]:
new_df['messages_split']=new_df['MessageText'].apply(lambda x: " ".join(a for a in x))
new_df=new_df[~(new_df.messages_split.str.contains('Hi, welcome to the ENTERTAINER Live Chat'))]
new_df=new_df[~(new_df.messages_split.str.contains('Your patience is much appreciated'))]
new_df=new_df[~(new_df.messages_split.str.contains('NAVIGATEURL'))]
new_df.head()

Unnamed: 0,ChatId,MessageBy,MessageText,messages_split
0,51902,Cheryl,[Do I still have you with me? ],Do I still have you with me?
2,51903,Haroon,[قبل ان نبدأ، هل لي بمعرفة اسمك الكامل وعنوان ...,قبل ان نبدأ، هل لي بمعرفة اسمك الكامل وعنوان ...
3,51903,Visitor2178963,"[انا نسيت كلمه السر, حمد الخروصي, Fox-hamoody@...",انا نسيت كلمه السر حمد الخروصي Fox-hamoody@hot...
6,51906,Taher Al-Sayeh,"[[NICKCHANGE], The app is not reflecting the f...",[NICKCHANGE] The app is not reflecting the fac...
7,51906,Visitor2185679,"[I was just talking to Harry, He requested scr...",I was just talking to Harry He requested scree...


In [0]:
new_df.reset_index(drop=True,inplace = True) 
new_df_sp = spark.createDataFrame(new_df)
display(new_df_sp)

ChatId,MessageBy,MessageText,messages_split
51902,Cheryl,List(Do I still have you with me? ),Do I still have you with me?
51903,Haroon,"List(قبل ان نبدأ، هل لي بمعرفة اسمك الكامل وعنوان بريدك الإلكتروني من فضلك؟ هذا سيساعد في متابعة استفسارك ومساعدتك بشكل أفضل, شكراً لمشاركة بريدك الإلكتروني معنا , يمكنك تغيير كلمة السر الخاصة بحسابك عن طريق الرابط التالي https://www.theentertainerme.com/default/forgotpassword, هل هناك شيء آخر يمكنني مساعدتك به؟)",قبل ان نبدأ، هل لي بمعرفة اسمك الكامل وعنوان بريدك الإلكتروني من فضلك؟ هذا سيساعد في متابعة استفسارك ومساعدتك بشكل أفضل شكراً لمشاركة بريدك الإلكتروني معنا يمكنك تغيير كلمة السر الخاصة بحسابك عن طريق الرابط التالي https://www.theentertainerme.com/default/forgotpassword هل هناك شيء آخر يمكنني مساعدتك به؟
51903,Visitor2178963,"List(انا نسيت كلمه السر, حمد الخروصي, Fox-hamoody@hotmail.com)",انا نسيت كلمه السر حمد الخروصي Fox-hamoody@hotmail.com
51906,Taher Al-Sayeh,"List([NICKCHANGE], The app is not reflecting the fact I purchased QATAR 2021, http://files.thelivechatsoftware.com/5303BC77_DA39_4CB5_8A16_B9EB350219F0vMp8Zwi9T(557.46kb).png, http://files.thelivechatsoftware.com/A785218B_0558_4841_8680_157B7A6B5B7FMskw0P7PM(801.46kb).png, I already deleted and reinstalled the app twice, just an FYI, Then why is my home page not reflecting the promotions, And why does it still prompt me to purchase Qatar 2021, Can you please follow up on my inquiry and update me via email, This is taking longer than expected and I am sleepy, Please update me via email. I would like to head to bed please.)","[NICKCHANGE] The app is not reflecting the fact I purchased QATAR 2021 http://files.thelivechatsoftware.com/5303BC77_DA39_4CB5_8A16_B9EB350219F0vMp8Zwi9T(557.46kb).png http://files.thelivechatsoftware.com/A785218B_0558_4841_8680_157B7A6B5B7FMskw0P7PM(801.46kb).png I already deleted and reinstalled the app twice, just an FYI Then why is my home page not reflecting the promotions And why does it still prompt me to purchase Qatar 2021 Can you please follow up on my inquiry and update me via email This is taking longer than expected and I am sleepy Please update me via email. I would like to head to bed please."
51906,Visitor2185679,"List(I was just talking to Harry, He requested screenshots but the chat box closed, It’s Taher Al-Sayeh, email: Sayeh.taher@gmail.com)","I was just talking to Harry He requested screenshots but the chat box closed It’s Taher Al-Sayeh, email: Sayeh.taher@gmail.com"
51927,Visitor2186357,"List(لماذا لا توجد فنادق في مسقط, حمد سالم, Fox-hamoody@hotmail.com, احتاج فنادق في مسقط, انا عريس جديد اود ان احجز عده فنادق, ولكن لا اجد اي فندق في سلطان عمان, طيب, اغلب الاشياء اختفت من البرنامج لماذا, لماذا لا تتوفر, في اي فندق اجد تخفيصات في عمان, لا تضهر, طيب)",لماذا لا توجد فنادق في مسقط حمد سالم Fox-hamoody@hotmail.com احتاج فنادق في مسقط انا عريس جديد اود ان احجز عده فنادق ولكن لا اجد اي فندق في سلطان عمان طيب اغلب الاشياء اختفت من البرنامج لماذا لماذا لا تتوفر في اي فندق اجد تخفيصات في عمان لا تضهر طيب
51954,Afraa Deeb,"List(Hi, Afraa Deeb, afraa74@yahoo.com, Before a few minutes I talk to you, For change my offer, I want to change from Dubai classic to Dubai gourmet)",Hi Afraa Deeb afraa74@yahoo.com Before a few minutes I talk to you For change my offer I want to change from Dubai classic to Dubai gourmet
51979,Afraa Deeb,"List(Omg, This 3rd time I talk to you, I want change from Dubai classic to Dubai gourmet, Afraa Deeb, afraa74@yahoo.com, Ok, Ok, OK, Okay)",Omg This 3rd time I talk to you I want change from Dubai classic to Dubai gourmet Afraa Deeb afraa74@yahoo.com Ok Ok OK Okay
51979,Carl Smith,List([QcRatedChat] Carl Smith gave rating 3 to Harry's chat on theentertainerme.com/en /LA),[QcRatedChat] Carl Smith gave rating 3 to Harry's chat on theentertainerme.com/en /LA
52009,7491,List([TRANSFERCHAT]_Transferred By Cheryl to Harry (English (en)) (LA)),[TRANSFERCHAT]_Transferred By Cheryl to Harry (English (en)) (LA)


In [0]:
new_df_sp = new_df_sp.select('ChatId','messages_split').groupby('ChatId').agg(collect_set('messages_split')).sort(['ChatId'],ascending=True)
new_df_sp.show()

In [0]:
display(new_df_sp)

ChatId,collect_set(messages_split)
51902,List(Do I still have you with me? )
51903,"List(انا نسيت كلمه السر حمد الخروصي Fox-hamoody@hotmail.com, قبل ان نبدأ، هل لي بمعرفة اسمك الكامل وعنوان بريدك الإلكتروني من فضلك؟ هذا سيساعد في متابعة استفسارك ومساعدتك بشكل أفضل شكراً لمشاركة بريدك الإلكتروني معنا يمكنك تغيير كلمة السر الخاصة بحسابك عن طريق الرابط التالي https://www.theentertainerme.com/default/forgotpassword هل هناك شيء آخر يمكنني مساعدتك به؟)"
51906,"List([NICKCHANGE] The app is not reflecting the fact I purchased QATAR 2021 http://files.thelivechatsoftware.com/5303BC77_DA39_4CB5_8A16_B9EB350219F0vMp8Zwi9T(557.46kb).png http://files.thelivechatsoftware.com/A785218B_0558_4841_8680_157B7A6B5B7FMskw0P7PM(801.46kb).png I already deleted and reinstalled the app twice, just an FYI Then why is my home page not reflecting the promotions And why does it still prompt me to purchase Qatar 2021 Can you please follow up on my inquiry and update me via email This is taking longer than expected and I am sleepy Please update me via email. I would like to head to bed please., I was just talking to Harry He requested screenshots but the chat box closed It’s Taher Al-Sayeh, email: Sayeh.taher@gmail.com)"
51927,List(لماذا لا توجد فنادق في مسقط حمد سالم Fox-hamoody@hotmail.com احتاج فنادق في مسقط انا عريس جديد اود ان احجز عده فنادق ولكن لا اجد اي فندق في سلطان عمان طيب اغلب الاشياء اختفت من البرنامج لماذا لماذا لا تتوفر في اي فندق اجد تخفيصات في عمان لا تضهر طيب)
51954,List(Hi Afraa Deeb afraa74@yahoo.com Before a few minutes I talk to you For change my offer I want to change from Dubai classic to Dubai gourmet)
51979,"List([QcRatedChat] Carl Smith gave rating 3 to Harry's chat on theentertainerme.com/en /LA, Omg This 3rd time I talk to you I want change from Dubai classic to Dubai gourmet Afraa Deeb afraa74@yahoo.com Ok Ok OK Okay)"
52009,"List([TRANSFERCHAT]_Transferred By Cheryl to Harry (English (en)) (LA), How can I see the offers for hsbc entertainer 2021? Full name: Goh Chiew Yee Constance and email address: constancegoh1992@gmail.com, Welcome to ENTERTAINER with HSBC chat support. How may I assist you? May I have your full name and email address, please? This helps us in keeping a track of your inquiry and assisting you better., Thank you for sharing your details, makes it easy for me to work on your inquiry!, [NICKCHANGE])"


In [0]:
new_df_pd=new_df_sp.toPandas()
new_df_pd.rename(columns={"collect_set(messages_split)":"list_of_chats"},inplace=True)
new_df_pd['list_of_chats']=new_df_pd['list_of_chats'].apply(lambda x: " ".join(a for a in x))
new_df_pd['lower_chats'] = new_df_pd['list_of_chats'].apply(lambda x: " ".join(x.lower() for x in x.split()))
stop = stopwords.words('english')
new_df_pd['stopwords_removed'] = new_df_pd['lower_chats'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
new_df_pd['stopwords_removed'] = new_df_pd['stopwords_removed'].str.replace('[^\w\s]','')
new_df_pd.head()

Unnamed: 0,ChatId,list_of_chats,lower_chats,stopwords_removed
0,51902,Do I still have you with me?,do i still have you with me?,still me
1,51903,انا نسيت كلمه السر حمد الخروصي Fox-hamoody@hot...,انا نسيت كلمه السر حمد الخروصي fox-hamoody@hot...,انا نسيت كلمه السر حمد الخروصي foxhamoodyhotma...
2,51906,[NICKCHANGE] The app is not reflecting the fac...,[nickchange] the app is not reflecting the fac...,nickchange app reflecting fact purchased qatar...
3,51927,لماذا لا توجد فنادق في مسقط حمد سالم Fox-hamoo...,لماذا لا توجد فنادق في مسقط حمد سالم fox-hamoo...,لماذا لا توجد فنادق في مسقط حمد سالم foxhamood...
4,51954,Hi Afraa Deeb afraa74@yahoo.com Before a few m...,hi afraa deeb afraa74@yahoo.com before a few m...,hi afraa deeb afraa74yahoocom minutes talk cha...


In [0]:
df_chats=spark.createDataFrame(new_df_pd)
df_chats=df_chats.select('ChatId','stopwords_removed')
df_chats.show()

In [0]:
df_types_agg = df_types_agg.withColumnRenamed("collect_set(stopwords_removed)","list_of_questions")
df_types_agg.show()

In [0]:
df_types_agg_pd=df_types_agg.toPandas()
df_types_agg_pd.head()

Unnamed: 0,Categroy,list_of_questions
0,14 day trial,"[cant access 14day trial, try app, extend 14 d..."
1,25% Offers UAE,"[rules conditions user 25 cant, case allowed u..."
2,Account Questions,[purchased app gift didnt receive activation k...
3,Adrenaline,"[want redeem adrenaline voucher, access adrena..."
4,Cheers,"[find cheers offers, many cheers offers use di..."


In [0]:
df_chats_pd=df_chats.toPandas()
df_chats_pd.head()

Unnamed: 0,ChatId,stopwords_removed
0,51902,still me
1,51903,انا نسيت كلمه السر حمد الخروصي foxhamoodyhotma...
2,51906,nickchange app reflecting fact purchased qatar...
3,51927,لماذا لا توجد فنادق في مسقط حمد سالم foxhamood...
4,51954,hi afraa deeb afraa74yahoocom minutes talk cha...


In [0]:
df_types_agg_pd['message_1']=df_chats_pd.iloc[2].stopwords_removed
df_types_agg_pd.head()

Unnamed: 0,Categroy,list_of_questions,message_1
0,14 day trial,"[cant access 14day trial, try app, extend 14 d...",nickchange app reflecting fact purchased qatar...
1,25% Offers UAE,"[rules conditions user 25 cant, case allowed u...",nickchange app reflecting fact purchased qatar...
2,Account Questions,[purchased app gift didnt receive activation k...,nickchange app reflecting fact purchased qatar...
3,Adrenaline,"[want redeem adrenaline voucher, access adrena...",nickchange app reflecting fact purchased qatar...
4,Cheers,"[find cheers offers, many cheers offers use di...",nickchange app reflecting fact purchased qatar...


In [0]:
df_types_agg_pd['list_of_questions']=df_types_agg_pd['list_of_questions'].apply(lambda x: " ".join(a for a in x))
df_types_agg_pd.head()

Unnamed: 0,Categroy,list_of_questions,message_1
0,14 day trial,cant access 14day trial try app extend 14 day ...,nickchange app reflecting fact purchased qatar...
1,25% Offers UAE,rules conditions user 25 cant case allowed use...,nickchange app reflecting fact purchased qatar...
2,Account Questions,purchased app gift didnt receive activation ke...,nickchange app reflecting fact purchased qatar...
3,Adrenaline,want redeem adrenaline voucher access adrenali...,nickchange app reflecting fact purchased qatar...
4,Cheers,find cheers offers many cheers offers use dini...,nickchange app reflecting fact purchased qatar...


In [0]:
df_types_agg_pd.shape

In [0]:
tfidf = TfidfVectorizer(stop_words='english')
df_types_agg_pd['list_of_questions'] = df_types_agg_pd['list_of_questions'].fillna('')
tfidf_matrix_q = tfidf.fit_transform(df_types_agg_pd['list_of_questions'])
print(tfidf_matrix_q.shape)
df_types_agg_pd['message_1'] = df_types_agg_pd['message_1'].fillna('')
tfidf_matrix_c = tfidf.fit_transform(df_types_agg_pd['message_1'])
print(tfidf_matrix_c.shape)

In [0]:
cosine_sim = linear_kernel(tfidf_matrix_q, tfidf_matrix_c)
cosine_sim