In [0]:
dbutils.library.installPyPI("fuzzywuzzy")
# dbutils.library.installPyPI("Keras")
# dbutils.library.installPyPI("nltk")
# dbutils.library.installPyPI("numpy")

In [0]:
! /databricks/python/bin/pip install --upgrade pip #pip install googletrans
! /databricks/python/bin/pip install nltk
! /databricks/python/bin/python -m nltk.downloader stopwords
! /databricks/python/bin/python -m nltk.downloader punkt
! /databricks/python/bin/python -m nltk.downloader wordnet

In [0]:
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 
import nltk
from pyspark.sql.functions import *
from nltk.corpus import stopwords
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import re
from keras.preprocessing.text import Tokenizer
import tensorflow as tf

## Building a text summarizer to summarize chat messages: keep the most important parts of the chat and discard the irrelevant parts, which add no meaning.

### Proof of Concept for Text Summarizer

In [0]:
# Input text - to summarize  
text = """There are many techniques available to generate extractive summarization to keep it simple, I will be using an unsupervised learning approach to find the sentences similarity and rank them. Summarization can be defined as a task of producing a concise and fluent summary while preserving key information and overall meaning. One benefit of this will be, you don’t need to train and build a model prior start using it for your project. It’s good to understand Cosine similarity to make the best use of the code you are going to see. Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. Its measures cosine of the angle between vectors. The angle will be 0 if sentences are similar."""
   

In [0]:
# Tokenizing the text 
stopWords = set(stopwords.words("english")) 
words = word_tokenize(text)
words

In [0]:
sentences = sent_tokenize(text) 
sentences

In [0]:
# Word-frequency table: lower-cased token -> number of occurrences,
# with English stop words left out.
freqTable = {}
for token in words:
    token = token.lower()
    if token not in stopWords:
        freqTable[token] = freqTable.get(token, 0) + 1

In [0]:
freqTable

In [0]:
len(freqTable)

In [0]:
# Score every sentence: add up the frequency of each table entry whose
# text occurs (as a substring) in the lower-cased sentence.
sentences = sent_tokenize(text)
sentenceValue = {}

for sentence in sentences:
    lowered = sentence.lower()
    for word, freq in freqTable.items():
        if word in lowered:
            sentenceValue[sentence] = sentenceValue.get(sentence, 0) + freq

In [0]:
sentenceValue

In [0]:
len(sentenceValue)

In [0]:
# Total score over all scored sentences.
sumValues = sum(sentenceValue.values())

In [0]:
sumValues

In [0]:
# Average value of a sentence from the original text 
average = int(sumValues / len(sentenceValue)) 
average

In [0]:
1.2 * average

In [0]:
# Build the summary from the sentences whose score is above 1.2x the
# average, keeping them in their original order (each prefixed by a space).
cutoff = 1.2 * average
summary = ''.join(
    " " + sentence
    for sentence in sentences
    if sentence in sentenceValue and sentenceValue[sentence] > cutoff
)
print(summary)

In [0]:
def chat_summerizer(text, threshold=1.2):
  """Return an extractive summary of ``text`` based on word frequencies.

  Steps:
  1. Tokenize ``text`` into words and sentences with NLTK.
  2. Count every lower-cased, non-stop-word token.
  3. Score each sentence by summing the counts of all tokens that occur in it.
  4. Keep the sentences whose score is greater than ``threshold`` times the
     (integer-truncated) average sentence score.

  Parameters
  ----------
  text : str
      Text to summarize.
  threshold : float, optional
      Multiplier applied to the average sentence score; only sentences
      scoring strictly above ``threshold * average`` are kept. Defaults to
      1.2, the value that was previously hard-coded.

  Returns
  -------
  str
      The selected sentences in their original order, each prefixed with a
      single space. An empty string is returned when no sentence qualifies
      or when nothing could be scored (for example empty input), instead of
      raising ``ZeroDivisionError`` as before.
  """
  stopWords = set(stopwords.words("english"))
  words = word_tokenize(text)
  sentences = sent_tokenize(text)

  # Frequency table: lower-cased token -> occurrences (stop words skipped).
  freqTable = dict()
  for word in words:
    word = word.lower()
    if word in stopWords:
      continue
    freqTable[word] = freqTable.get(word, 0) + 1

  # Sentence scores, keyed by the sentence text itself.
  # NOTE(review): `word in lowered` is a substring test, so a token such as
  # "day" also scores inside "today". Kept unchanged to preserve the scores
  # the rest of the notebook was built on.
  sentenceValue = dict()
  for sentence in sentences:
    lowered = sentence.lower()
    for word, freq in freqTable.items():
      if word in lowered:
        sentenceValue[sentence] = sentenceValue.get(sentence, 0) + freq

  # Nothing scored (empty text, or no token matched): there is no average
  # to compute, so return an empty summary rather than dividing by zero.
  if not sentenceValue:
    return ''

  # Average value of a sentence from the original text (truncated to int).
  average = int(sum(sentenceValue.values()) / len(sentenceValue))
  cutoff = threshold * average

  # Storing sentences into our summary
  summary = ''
  for sentence in sentences:
    if (sentence in sentenceValue) and (sentenceValue[sentence] > cutoff):
      summary += " " + sentence
  return summary

In [0]:
def clean_text(text):
    """
    Normalise a raw string before tokenisation.

    In order:
    - strips anything that looks like an HTML tag
    - deletes backslashes, single quotes and double quotes outright
    - trims surrounding whitespace and lower-cases the text
    - turns every remaining punctuation character (plus tab and newline)
      into a single space

    Returns the transformed string.
    """
    # Drop HTML-style tags.
    cleaned = re.sub(r'<.*?>', '', text)

    # Delete [\], ['] and ["] so that e.g. "don't" becomes "dont".
    for pattern in (r"\\", r"\'", r"\""):
        cleaned = re.sub(pattern, "", cleaned)

    # Trim first, then lower-case.
    cleaned = cleaned.strip().lower()

    # Map each punctuation character to a space in one translate() pass.
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    return cleaned.translate(str.maketrans(filters, " " * len(filters)))
  
def lemmatize_text(text):
    """Split ``text`` on whitespace and lemmatize each token with POS tag 'v'.

    Returns the lemmas as a list, one entry per whitespace-separated token.
    """
    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token, 'v'))
    return lemmas

def pre_processing(df,col_name):
  """Add cleaned, stop-word-free and lemmatized text columns to ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the raw text. It is modified in place.
  col_name : str
      Name of the column in ``df`` that contains the raw text.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object with three added (or overwritten) columns:
      ``cleaned_text`` (output of ``clean_text`` with any remaining
      non-word, non-whitespace characters removed), ``stopwords_removed``
      (``cleaned_text`` without English stop words) and ``text_lemmatized``
      (``stopwords_removed`` with every token lemmatized, re-joined with
      single spaces).
  """
  # A set gives O(1) membership tests inside the per-row comprehension.
  stop = set(stopwords.words('english'))

  df['cleaned_text'] = df[col_name].apply(clean_text)
  # regex=True is explicit: pandas >= 2.0 treats the pattern as a literal
  # string by default, which would silently turn this step into a no-op.
  df['cleaned_text'] = df['cleaned_text'].str.replace(r'[^\w\s]', '', regex=True)

  df['stopwords_removed'] = df['cleaned_text'].apply(
      lambda s: " ".join(w for w in s.split() if w not in stop))
  df['text_lemmatized'] = df['stopwords_removed'].apply(
      lambda s: " ".join(lemmatize_text(s)))
  return df

def prep_train_data():
  """Load the labelled questions and return them pre-processed as pandas.

  Reads the Spark table ``irumdb.cs_types_list``, discards rows whose
  ``Questions`` value is null or exactly ``'14 Day trial '``, converts the
  remainder to a pandas DataFrame and runs ``pre_processing`` on the
  ``Questions`` column.
  """
  questions = spark.table("irumdb.cs_types_list")
  questions = questions.filter(questions.Questions.isNotNull())

  # Turn the '14 Day trial ' entry into a null so the next filter removes it.
  questions = questions.na.replace(['14 Day trial '], [None], 'Questions')
  questions = questions.filter(questions.Questions.isNotNull())

  return pre_processing(questions.toPandas(), col_name='Questions')
  
def prep_chat_data(row_start=100, row_stop=200):
  """Load chat messages, merge them per chat, summarize and pre-process them.

  Parameters
  ----------
  row_start, row_stop : int, optional
      Positional slice ``[row_start:row_stop]`` of *message rows* (not
      chats) taken from ``live_admin_chat.chatmessages`` for this proof of
      concept. Defaults reproduce the previously hard-coded ``[100:200]``.

  Returns
  -------
  pandas.DataFrame
      One row per ``ChatId`` with the columns ``list_of_chats``,
      ``lower_chats``, ``summarized_text``, ``stopwords_removed``,
      ``most_freq_removed`` plus the columns added by ``pre_processing``
      (``cleaned_text``, ``text_lemmatized``).
  """
  df_chats = spark.table("live_admin_chat.chatmessages")
  df_chats_pd = df_chats.toPandas()

  # Proof of concept: work on a small positional slice of message rows and
  # collect the messages of each (chat, author) pair into one string.
  sample = df_chats_pd[["ChatId","MessageText","MessageBy"]][row_start:row_stop]
  new_df = sample.groupby(["ChatId","MessageBy"],as_index=False).aggregate(lambda x: list(x))
  new_df['messages_split'] = new_df['MessageText'].apply(lambda x: " ".join(a for a in x))

  # Drop (chat, author) groups that contain these fixed phrases. They are
  # plain text, so match them literally rather than as regular expressions.
  for phrase in ('Hi, welcome to the ENTERTAINER Live Chat',
                 'Your patience is much appreciated',
                 'NAVIGATEURL'):
    new_df = new_df[~(new_df.messages_split.str.contains(phrase, regex=False))]
  new_df.reset_index(drop=True,inplace = True)

  # Round-trip through Spark to gather the distinct message strings per chat.
  new_df_sp = spark.createDataFrame(new_df)
  new_df_sp = new_df_sp.select('ChatId','messages_split').groupby('ChatId').agg(collect_set('messages_split')).sort(['ChatId'],ascending=True)

  new_df_pd = new_df_sp.toPandas()
  new_df_pd.rename(columns={"collect_set(messages_split)":"list_of_chats"},inplace=True)
  new_df_pd['list_of_chats'] = new_df_pd['list_of_chats'].apply(lambda x: " ".join(a for a in x))

  new_df_pd['lower_chats'] = new_df_pd['list_of_chats'].apply(lambda x: " ".join(x.lower() for x in x.split()))

  # Summarize first, then clean the summary.
  new_df_pd['summarized_text'] = new_df_pd['lower_chats'].apply(lambda x: (chat_summerizer(x)))

  # Sets give O(1) membership tests inside the per-row comprehensions.
  stop = set(stopwords.words('english'))
  new_df_pd['stopwords_removed'] = new_df_pd['summarized_text'].apply(lambda x: " ".join(w for w in x.split() if w not in stop))
  # regex=True is explicit: pandas >= 2.0 treats the pattern as a literal
  # string by default, which would silently leave all punctuation in place.
  new_df_pd['stopwords_removed'] = new_df_pd['stopwords_removed'].str.replace(r'[^\w\s]', '', regex=True)

  # Frequent chat filler words to discard.
  # NOTE(review): 'thank you' contains a space and can never equal a single
  # whitespace-split token, so that entry currently has no effect.
  freq = {'nickchange','u','ok','okay','hi','thx','hello','entertainer','thanks','thankyou','thank you'}
  new_df_pd['most_freq_removed'] = new_df_pd['stopwords_removed'].apply(lambda x: " ".join(w for w in x.split() if w not in freq))

  df_chats = pre_processing(new_df_pd,col_name='most_freq_removed')
  return df_chats
  

In [0]:
df_quest = prep_train_data()
df_quest.head()

Unnamed: 0,Categroy,Questions,cleaned_text,stopwords_removed,text_lemmatized
0,14 day trial,Can I try before I buy?,can i try before i buy,try buy,try buy
1,14 day trial,How can I try the offers?,how can i try the offers,try offers,try offer
2,14 day trial,How can I try the app?,how can i try the app,try app,try app
3,14 day trial,Can I try first?,can i try first,try first,try first
4,14 day trial,Can I try an offer before I buy?,can i try an offer before i buy,try offer buy,try offer buy


In [0]:
df_chats = prep_chat_data()
df_chats.head()

Unnamed: 0,ChatId,list_of_chats,lower_chats,summarized_text,stopwords_removed,most_freq_removed,cleaned_text,text_lemmatized
0,54137,Hi I spoke to Stella earlier regarding my rece...,hi i spoke to stella earlier regarding my rece...,hi i spoke to stella earlier regarding my rec...,spoke stella earlier regarding recent gourmet ...,spoke stella earlier regarding recent gourmet ...,spoke stella earlier regarding recent gourmet ...,speak stella earlier regard recent gourmet 202...
1,54141,Hello. This is my 3rd time on the chat I want ...,hello. this is my 3rd time on the chat i want ...,this is my 3rd time on the chat i want to cha...,3rd time chat want change package disconnected...,3rd time chat want change package disconnected...,3rd time chat want change package disconnected...,3rd time chat want change package disconnect e...
2,54143,"Hello there, Do i get repeated offers from sam...","hello there, do i get repeated offers from sam...",,,,,
3,54144,مرحبًا بك معنا مجددًا كان لديك 770 ابتسامة وا...,مرحبًا بك معنا مجددًا كان لديك 770 ابتسامة وان...,هل هناك أي شيء آخر يمكنني مساعدتك به ؟ [trans...,هل هناك أي شيء آخر يمكنني مساعدتك به transferc...,هل هناك أي شيء آخر يمكنني مساعدتك به transferc...,هل هناك أي شيء آخر يمكنني مساعدتك به transferc...,هل هناك أي شيء آخر يمكنني مساعدتك به transferc...
4,54145,Hi I was chatting with Greta How do I call you...,hi i was chatting with greta how do i call you...,hi i was chatting with greta how do i call yo...,chatting greta call might easier purchase 2021...,chatting greta call might easier purchase 2021...,chatting greta call might easier purchase 2021...,chat greta call might easier purchase 2021 bun...


In [0]:
df_chats['text_lemmatized'].values

In [0]:
# FIRST TRY WITH JUST SUMMERIZER
df_chats = prep_chat_data()
df_chats.head(10)

Unnamed: 0,ChatId,list_of_chats,lower_chats,summarized_text,stopwords_removed,most_freq_removed,cleaned_text,text_lemmatized
0,54137,Hi I spoke to Stella earlier regarding my rece...,hi i spoke to stella earlier regarding my rece...,hi i spoke to stella earlier regarding my rec...,spoke stella earlier regarding recent gourmet ...,spoke stella earlier regarding recent gourmet ...,spoke stella earlier regarding recent gourmet ...,speak stella earlier regard recent gourmet 202...
1,54141,Hello. This is my 3rd time on the chat I want ...,hello. this is my 3rd time on the chat i want ...,this is my 3rd time on the chat i want to cha...,3rd time chat want change package disconnected...,3rd time chat want change package disconnected...,3rd time chat want change package disconnected...,3rd time chat want change package disconnect e...
2,54143,"Hello there, Do i get repeated offers from sam...","hello there, do i get repeated offers from sam...",,,,,
3,54144,مرحبًا بك معنا مجددًا كان لديك 770 ابتسامة وا...,مرحبًا بك معنا مجددًا كان لديك 770 ابتسامة وان...,هل هناك أي شيء آخر يمكنني مساعدتك به ؟ [trans...,هل هناك أي شيء آخر يمكنني مساعدتك به transferc...,هل هناك أي شيء آخر يمكنني مساعدتك به transferc...,هل هناك أي شيء آخر يمكنني مساعدتك به transferc...,هل هناك أي شيء آخر يمكنني مساعدتك به transferc...
4,54145,Hi I was chatting with Greta How do I call you...,hi i was chatting with greta how do i call you...,hi i was chatting with greta how do i call yo...,chatting greta call might easier purchase 2021...,chatting greta call might easier purchase 2021...,chatting greta call might easier purchase 2021...,chat greta call might easier purchase 2021 bun...
5,54147,Hi I cannot see any offer in Food buy 1 get 1 ...,hi i cannot see any offer in food buy 1 get 1 ...,,,,,
6,54148,"Hello, I bought Entertainer dubai classic yest...","hello, i bought entertainer dubai classic yest...",jeeyeon an/ ajycom1120@gmail.com and my order...,jeeyeon ajycom1120gmailcom order number 101387...,jeeyeon an ajycom1120gmailcom order number 101...,jeeyeon an ajycom1120gmailcom order number 101...,jeeyeon ajycom1120gmailcom order number 101387...


In [0]:
df_chats['lower_chats'].values[1]

In [0]:
df_chats['summarized_text'].values[1]

In [0]:
df_chats['lower_chats'].values[2]

In [0]:
df_chats['summarized_text'].values[2]

In [0]:
df_chats['lower_chats'].values[3]

In [0]:
df_chats['summarized_text'].values[3]

In [0]:
df_chats['lower_chats'].values[4]

In [0]:
df_chats['summarized_text'].values[4]

In [0]:
# # SECOND TRY
# df_chats = prep_chat_data()
# df_chats.head(10)

In [0]:
# Drop the rows the classifier cannot use. The original cell removed the
# hard-coded positions [2, 3, 5]: in the frame displayed above, rows 2 and 5
# have an empty feature column and row 3 is an Arabic chat (presumably dropped
# because the training questions are English; confirm).
# Selecting by content removes those same rows and keeps the cell idempotent:
# re-running it no longer raises IndexError or drops unrelated rows.
is_empty = df_chats['text_lemmatized'].fillna('').str.strip().eq('')
has_arabic = df_chats['text_lemmatized'].str.contains(r'[\u0600-\u06FF]', regex=True, na=False)
df_chats = df_chats[~(is_empty | has_arabic)].reset_index(drop=True)
df_chats.head()

Unnamed: 0,ChatId,list_of_chats,lower_chats,summarized_text,stopwords_removed,most_freq_removed,cleaned_text,text_lemmatized
0,54137,Hi I spoke to Stella earlier regarding my rece...,hi i spoke to stella earlier regarding my rece...,hi i spoke to stella earlier regarding my rec...,spoke stella earlier regarding recent gourmet ...,spoke stella earlier regarding recent gourmet ...,spoke stella earlier regarding recent gourmet ...,speak stella earlier regard recent gourmet 202...
1,54141,Hello. This is my 3rd time on the chat I want ...,hello. this is my 3rd time on the chat i want ...,this is my 3rd time on the chat i want to cha...,3rd time chat want change package disconnected...,3rd time chat want change package disconnected...,3rd time chat want change package disconnected...,3rd time chat want change package disconnect e...
2,54145,Hi I was chatting with Greta How do I call you...,hi i was chatting with greta how do i call you...,hi i was chatting with greta how do i call yo...,chatting greta call might easier purchase 2021...,chatting greta call might easier purchase 2021...,chatting greta call might easier purchase 2021...,chat greta call might easier purchase 2021 bun...
3,54148,"Hello, I bought Entertainer dubai classic yest...","hello, i bought entertainer dubai classic yest...",jeeyeon an/ ajycom1120@gmail.com and my order...,jeeyeon ajycom1120gmailcom order number 101387...,jeeyeon an ajycom1120gmailcom order number 101...,jeeyeon an ajycom1120gmailcom order number 101...,jeeyeon ajycom1120gmailcom order number 101387...


In [0]:
# Remove any character that is neither a word character nor whitespace.
# The pattern is a raw string and regex=True is explicit: pandas >= 2.0 treats
# the pattern as a literal string by default, which would make this a no-op.
df_quest['cleaned_text'] = df_quest['cleaned_text'].str.replace(r'[^\w\s]', '', regex=True)

In [0]:
df_quest.head()

Unnamed: 0,Categroy,Questions,cleaned_text,stopwords_removed,text_lemmatized
0,14 day trial,Can I try before I buy?,can i try before i buy,try buy,try buy
1,14 day trial,How can I try the offers?,how can i try the offers,try offers,try offer
2,14 day trial,How can I try the app?,how can i try the app,try app,try app
3,14 day trial,Can I try first?,can i try first,try first,try first
4,14 day trial,Can I try an offer before I buy?,can i try an offer before i buy,try offer buy,try offer buy


In [0]:
# COUNT OF QUESTIONS IN EACH CATEGORIES
df_quest.groupby(['Categroy'], sort = False).count() 

Unnamed: 0_level_0,Questions,cleaned_text,stopwords_removed,text_lemmatized
Categroy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14 day trial,33,33,33,33
General App Questions,100,100,100,100
Cheers,17,17,17,17
Account Questions,41,41,41,41
Complaints,12,12,12,12
Family Accounts,134,134,134,134
General,70,70,70,70
Getaways,67,67,67,67
Monthly Offers,11,11,11,11
Pings,67,67,67,67


In [0]:
# KEEP FIRST ENTRY FROM EACH GROUP FOR TESTING
# groupby(...).first() returns, per category, the first non-null value of
# each column, sorted by category; reset_index() turns 'Categroy' back into
# a regular column. The result has one row per category.
df_test=df_quest.groupby('Categroy').first().reset_index()
df_test.head()

Unnamed: 0,Categroy,Questions,cleaned_text,stopwords_removed,text_lemmatized
0,14 day trial,Can I try before I buy?,can i try before i buy,try buy,try buy
1,25% Offers UAE,How do I use my 25% off offers?,how do i use my 25 off offers,use 25 offers,use 25 offer
2,Account Questions,"I need to change my email address, please advi...",i need to change my email address please advi...,need change email address please advise,need change email address please advise
3,Adrenaline,How do I access Adrenaline offers?,how do i access adrenaline offers,access adrenaline offers,access adrenaline offer
4,Cheers,How do I access my Cheers offers?,how do i access my cheers offers,access cheers offers,access cheer offer


In [0]:
from sklearn.preprocessing import LabelEncoder
# creating instance of labelencoder
labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column.
# The encoder is fitted on df_test's categories only; df_test holds one row
# per category, so every category present there receives a code.
df_test['labels'] = labelencoder.fit_transform(df_test['Categroy'])
df_test.head()

Unnamed: 0,Categroy,Questions,cleaned_text,stopwords_removed,text_lemmatized,labels
0,14 day trial,Can I try before I buy?,can i try before i buy,try buy,try buy,0
1,25% Offers UAE,How do I use my 25% off offers?,how do i use my 25 off offers,use 25 offers,use 25 offer,1
2,Account Questions,"I need to change my email address, please advi...",i need to change my email address please advi...,need change email address please advise,need change email address please advise,2
3,Adrenaline,How do I access Adrenaline offers?,how do i access adrenaline offers,access adrenaline offers,access adrenaline offer,3
4,Cheers,How do I access my Cheers offers?,how do i access my cheers offers,access cheers offers,access cheer offer,4


In [0]:
# DROP FIRST ENTRY FROM EACH GROUP AND KEEP THE REST FOR TRAINING
# groupby(...).head(1) selects the first row of every category; dropping
# those index labels from df_quest leaves the remaining questions for training.
df_train=df_quest.drop(df_quest.groupby(['Categroy']).head(1).index, axis=0)
df_train.head()

Unnamed: 0,Categroy,Questions,cleaned_text,stopwords_removed,text_lemmatized
1,14 day trial,How can I try the offers?,how can i try the offers,try offers,try offer
2,14 day trial,How can I try the app?,how can i try the app,try app,try app
3,14 day trial,Can I try first?,can i try first,try first,try first
4,14 day trial,Can I try an offer before I buy?,can i try an offer before i buy,try offer buy,try offer buy
5,14 day trial,I want a free trial,i want a free trial,want free trial,want free trial


In [0]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on every category in df_quest instead of on df_train alone.
# df_test was encoded from one row per category, so its integer codes cover
# all categories. A category with a single question is absent from df_train,
# and fitting on df_train only would then shift the codes so that
# df_train['labels'] and df_test['labels'] no longer refer to the same
# categories (and the accuracy computed later would be meaningless).
labelencoder = LabelEncoder()
labelencoder.fit(df_quest['Categroy'])
# Assigning numerical values and storing in another column
df_train['labels'] = labelencoder.transform(df_train['Categroy'])
df_train.head()

Unnamed: 0,Categroy,Questions,cleaned_text,stopwords_removed,text_lemmatized,labels
1,14 day trial,How can I try the offers?,how can i try the offers,try offers,try offer,0
2,14 day trial,How can I try the app?,how can i try the app,try app,try app,0
3,14 day trial,Can I try first?,can i try first,try first,try first,0
4,14 day trial,Can I try an offer before I buy?,can i try an offer before i buy,try offer buy,try offer buy,0
5,14 day trial,I want a free trial,i want a free trial,want free trial,want free trial,0


In [0]:
# Train & Test subsets
# Positional selection: column 2 is taken as the feature and column 0 as the
# target (reshaped to a single column). In the frames displayed above those
# positions are 'cleaned_text' and 'Categroy'.
# NOTE(review): none of the model cells visible in this notebook read
# X_train / y_train / X_test / y_test; they use df_train / df_test columns
# directly. Confirm whether these arrays are still needed.
X_train, y_train = df_train.iloc[:, 2].values, df_train.iloc[:, 0].values.reshape(-1, 1)
X_test, y_test = df_test.iloc[:, 2].values, df_test.iloc[:, 0].values.reshape(-1, 1)

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer


# Transform each text into a vector of word counts
vectorizer = CountVectorizer(stop_words="english",
                             preprocessor=clean_text)

# Fit the vocabulary on the training questions only, then reuse it for the test set.
training_features = vectorizer.fit_transform(df_train['text_lemmatized'])
test_features = vectorizer.transform(df_test['text_lemmatized'])

# Training (random_state fixed so repeated runs give the same model)
model = LinearSVC(random_state=0)
model.fit(training_features, df_train['labels'])
y_pred = model.predict(test_features)

# Evaluation
acc = accuracy_score(df_test['labels'], y_pred)

# The previous message named the IMDB dataset, which is not used here; the
# score is measured on df_test (one held-out question per category).
print("Accuracy on the held-out questions: {:.2f}%".format(acc*100))

In [0]:
test_features = vectorizer.transform(df_chats['text_lemmatized'])
y_pred = model.predict(test_features)
y_pred

In [0]:
labelencoder.inverse_transform(y_pred)

In [0]:
df_chats['prediction']=labelencoder.inverse_transform(y_pred)

In [0]:
df_chats.head()

Unnamed: 0,ChatId,list_of_chats,lower_chats,summarized_text,stopwords_removed,most_freq_removed,cleaned_text,text_lemmatized,prediction
0,54137,Hi I spoke to Stella earlier regarding my rece...,hi i spoke to stella earlier regarding my rece...,hi i spoke to stella earlier regarding my rec...,spoke stella earlier regarding recent gourmet ...,spoke stella earlier regarding recent gourmet ...,spoke stella earlier regarding recent gourmet ...,speak stella earlier regard recent gourmet 202...,Pre Purchase
1,54141,Hello. This is my 3rd time on the chat I want ...,hello. this is my 3rd time on the chat i want ...,this is my 3rd time on the chat i want to cha...,3rd time chat want change package disconnected...,3rd time chat want change package disconnected...,3rd time chat want change package disconnected...,3rd time chat want change package disconnect e...,14 day trial
2,54145,Hi I was chatting with Greta How do I call you...,hi i was chatting with greta how do i call you...,hi i was chatting with greta how do i call yo...,chatting greta call might easier purchase 2021...,chatting greta call might easier purchase 2021...,chatting greta call might easier purchase 2021...,chat greta call might easier purchase 2021 bun...,Pre Purchase
3,54148,"Hello, I bought Entertainer dubai classic yest...","hello, i bought entertainer dubai classic yest...",jeeyeon an/ ajycom1120@gmail.com and my order...,jeeyeon ajycom1120gmailcom order number 101387...,jeeyeon an ajycom1120gmailcom order number 101...,jeeyeon an ajycom1120gmailcom order number 101...,jeeyeon ajycom1120gmailcom order number 101387...,Orders


In [0]:
df_chats['list_of_chats'].values[0] # Pre Purchase

In [0]:
df_chats['list_of_chats'].values[1] # 14 day trial

In [0]:
df_chats['list_of_chats'].values[2] # Pre Purchase

In [0]:
df_chats['list_of_chats'].values[3] # Order

In [0]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


# Transform each text into a vector of TF-IDF weights over unigrams and bigrams
vectorizer = TfidfVectorizer(stop_words="english",
                             preprocessor=clean_text,
                             ngram_range=(1, 2))

# Fit the vocabulary on the training questions only, then reuse it for the test set.
training_features = vectorizer.fit_transform(df_train['text_lemmatized'])
test_features = vectorizer.transform(df_test['text_lemmatized'])

# Training (random_state fixed so repeated runs give the same model)
model = LinearSVC(random_state=0)
model.fit(training_features, df_train["labels"])
y_pred = model.predict(test_features)

# Evaluation
acc = accuracy_score(df_test["labels"], y_pred)

# The previous message named the IMDB dataset, which is not used here; the
# score is measured on df_test (one held-out question per category).
print("Accuracy on the held-out questions: {:.2f}%".format(acc*100))

In [0]:
test_features = vectorizer.transform(df_chats['text_lemmatized'])
y_pred = model.predict(test_features)
y_pred

In [0]:
df_chats['prediction']=labelencoder.inverse_transform(y_pred)

In [0]:
df_chats.head()

Unnamed: 0,ChatId,list_of_chats,lower_chats,summarized_text,stopwords_removed,most_freq_removed,cleaned_text,text_lemmatized,prediction
0,54137,Hi I spoke to Stella earlier regarding my rece...,hi i spoke to stella earlier regarding my rece...,hi i spoke to stella earlier regarding my rec...,spoke stella earlier regarding recent gourmet ...,spoke stella earlier regarding recent gourmet ...,spoke stella earlier regarding recent gourmet ...,speak stella earlier regard recent gourmet 202...,Pre Purchase
1,54141,Hello. This is my 3rd time on the chat I want ...,hello. this is my 3rd time on the chat i want ...,this is my 3rd time on the chat i want to cha...,3rd time chat want change package disconnected...,3rd time chat want change package disconnected...,3rd time chat want change package disconnected...,3rd time chat want change package disconnect e...,14 day trial
2,54145,Hi I was chatting with Greta How do I call you...,hi i was chatting with greta how do i call you...,hi i was chatting with greta how do i call yo...,chatting greta call might easier purchase 2021...,chatting greta call might easier purchase 2021...,chatting greta call might easier purchase 2021...,chat greta call might easier purchase 2021 bun...,Pre Purchase
3,54148,"Hello, I bought Entertainer dubai classic yest...","hello, i bought entertainer dubai classic yest...",jeeyeon an/ ajycom1120@gmail.com and my order...,jeeyeon ajycom1120gmailcom order number 101387...,jeeyeon an ajycom1120gmailcom order number 101...,jeeyeon an ajycom1120gmailcom order number 101...,jeeyeon ajycom1120gmailcom order number 101387...,"Delivery Dubai, Abu Dhabi, Qatar"
