In [0]:
# Install fuzzywuzzy (imported below as `fuzz` / `process`) through the Databricks library utility.
dbutils.library.installPyPI("fuzzywuzzy")
# Disabled alternatives, kept for reference: nltk and Keras are installed with pip in later cells.
# dbutils.library.installPyPI("Keras")
# dbutils.library.installPyPI("nltk")
# dbutils.library.installPyPI("numpy")

In [0]:
# Shell escapes: upgrade pip and install nltk with the interpreter under /databricks/python/bin.
! /databricks/python/bin/pip install --upgrade pip
! /databricks/python/bin/pip install nltk
# Download the NLTK resources used later: stopwords (stopword filtering),
# punkt (word_tokenize / sent_tokenize) and wordnet (WordNetLemmatizer).
! /databricks/python/bin/python -m nltk.downloader stopwords
! /databricks/python/bin/python -m nltk.downloader punkt
! /databricks/python/bin/python -m nltk.downloader wordnet

In [0]:
# Install the deep-learning stack (keras and tensorflow are both imported in the import cell).
# NOTE(review): versions are not pinned, so a re-run may pick up different releases.
! /databricks/python/bin/pip install keras
! /databricks/python/bin/pip install tensorflow

In [0]:
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 
import nltk
from pyspark.sql.functions import *
from nltk.corpus import stopwords
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import re
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [0]:
def chat_summerizer(text):
  """Build an extractive summary of ``text`` from its highest-scoring sentences.

  Every token that is not an English stopword is counted case-insensitively.
  A sentence's score is the sum of the counts of all counted words that occur
  in the lower-cased sentence (substring match). Sentences whose score exceeds
  1.2 times the truncated mean sentence score are kept, in original order.

  Args:
    text: The text to summarise.

  Returns:
    The selected sentences, each preceded by a single space. An empty string
    is returned when no sentence qualifies, including when ``text`` is empty
    or contains nothing but stopwords.
  """
  stop_words = set(stopwords.words("english"))
  sentences = sent_tokenize(text)

  # Frequency of every non-stopword token.
  freq_table = {}
  for word in word_tokenize(text):
    word = word.lower()
    if word not in stop_words:
      freq_table[word] = freq_table.get(word, 0) + 1

  # Score of each sentence; a sentence that occurs twice accumulates twice.
  sentence_value = {}
  for sentence in sentences:
    lowered = sentence.lower()
    for word, freq in freq_table.items():
      if word in lowered:
        sentence_value[sentence] = sentence_value.get(sentence, 0) + freq

  # Nothing was scored (empty or stopword-only text): return early instead of
  # dividing by zero when computing the average below.
  if not sentence_value:
    return ''

  # Average value of a sentence from the original text (truncated to an int).
  average = int(sum(sentence_value.values()) / len(sentence_value))
  threshold = 1.2 * average

  return ''.join(
      " " + sentence
      for sentence in sentences
      if sentence in sentence_value and sentence_value[sentence] > threshold
  )

In [0]:
def clean_text(text):
    """
    Normalise raw text before tokenisation.

    In order:
    - strip HTML-style tags
    - delete backslashes, single quotes and double quotes
    - trim surrounding whitespace and lower-case
    - turn every remaining punctuation mark, tab and newline into a space
    """
    # Non-greedy match, so each <...> tag is removed on its own.
    stripped = re.sub(r'<.*?>', '', text)

    # The characters [\], ['] and ["] are deleted outright, not replaced by spaces.
    for pattern in (r"\\", r"\'", r"\""):
        stripped = re.sub(pattern, "", stripped)

    lowered = stripped.strip().lower()

    # Map each listed character to a single space.
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    return lowered.translate(str.maketrans(filters, " " * len(filters)))
  
def lemmatize_text(text):
    """Split ``text`` on whitespace and return the list of tokens lemmatized with pos 'v'."""
    lemmatize = nltk.stem.WordNetLemmatizer().lemmatize
    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatize(token, 'v'))
    return lemmas

def pre_processing(df,col_name):
  """Add cleaned, stopword-free and lemmatized versions of a text column.

  ``df`` is modified in place and also returned. Columns written:
    - cleaned_text: clean_text() applied to ``df[col_name]``, with any remaining
      character that is neither a word character nor whitespace removed.
    - stopwords_removed: cleaned_text without NLTK English stopwords.
    - text_lemmatized: stopwords_removed run through lemmatize_text() and
      re-joined with single spaces.

  Args:
    df: pandas DataFrame holding the text column.
    col_name: Name of the column with the raw text (string values).

  Returns:
    The same DataFrame object with the three columns added.
  """
  # regex=True is spelled out: recent pandas releases treat the pattern as a
  # literal string by default, which would silently turn this into a no-op.
  df['cleaned_text'] = df[col_name].apply(clean_text).str.replace(r'[^\w\s]', '', regex=True)

  # A set gives constant-time membership tests for every token.
  stop = set(stopwords.words('english'))
  df['stopwords_removed'] = df['cleaned_text'].apply(
      lambda text: " ".join(word for word in text.split() if word not in stop))
  df['text_lemmatized'] = df['stopwords_removed'].apply(
      lambda text: " ".join(lemmatize_text(text)))
  return df

def prep_train_data():
  """Load the question list from ``irumdb.cs_types_list`` and pre-process it.

  Rows with a null ``Questions`` value are discarded, as are rows whose question
  is exactly '14 Day trial '. The remaining rows are converted to pandas and
  passed through pre_processing() on the 'Questions' column.
  """
  questions = spark.table("irumdb.cs_types_list")
  questions = questions.filter(questions.Questions.isNotNull())

  # Turn the unwanted entry into a null, then filter nulls again to drop it.
  questions = questions.na.replace(['14 Day trial '], [None], 'Questions')
  questions = questions.filter(questions.Questions.isNotNull())

  return pre_processing(questions.toPandas(), col_name='Questions')
  
def prep_chat_data():
  """Load live-chat messages and build one pre-processed text row per chat.

  Steps:
    1. Read ``live_admin_chat.chatmessages`` into pandas and keep message rows
       100-199 only (proof-of-concept sample).
    2. Concatenate the messages of each (ChatId, MessageBy) pair and discard
       pairs whose text contains one of three boilerplate phrases.
    3. Merge the remaining texts per ChatId with Spark ``collect_set``
       (duplicates collapse; the merge order is not guaranteed).
    4. Lower-case, summarise with chat_summerizer(), remove stopwords,
       punctuation and frequent filler words, then run pre_processing().

  Returns:
    pandas DataFrame with one row per ChatId and the columns list_of_chats,
    lower_chats, summarized_text, stopwords_removed, most_freq_removed plus
    the columns added by pre_processing().
  """
  df_chats = spark.table("live_admin_chat.chatmessages")
  df_chats_pd = df_chats.toPandas()

  # Proof of concept: only message rows 100-199 are used. This is a slice of
  # message rows, not of whole chats.
  new_df = (df_chats_pd[["ChatId","MessageText","MessageBy"]][100:200]
            .groupby(["ChatId","MessageBy"], as_index=False)
            .aggregate(lambda x: list(x)))

  # Null message texts are skipped so the join cannot fail on a non-string.
  new_df['messages_split'] = new_df['MessageText'].apply(
      lambda messages: " ".join(m for m in messages if isinstance(m, str)))

  # Drop (chat, author) rows that contain boilerplate; literal matching.
  for phrase in ('Hi, welcome to the ENTERTAINER Live Chat',
                 'Your patience is much appreciated',
                 'NAVIGATEURL'):
    new_df = new_df[~new_df.messages_split.str.contains(phrase, regex=False)]
  new_df = new_df.reset_index(drop=True)

  # Only the two columns needed are sent to Spark. The aggregate is aliased so
  # the result does not depend on Spark's auto-generated column name.
  new_df_sp = spark.createDataFrame(new_df[['ChatId','messages_split']])
  new_df_sp = (new_df_sp.groupby('ChatId')
               .agg(collect_set('messages_split').alias('list_of_chats'))
               .sort(['ChatId'], ascending=True))

  new_df_pd = new_df_sp.toPandas()
  new_df_pd['list_of_chats'] = new_df_pd['list_of_chats'].apply(lambda parts: " ".join(a for a in parts))

  # Lower-casing token by token also collapses runs of whitespace.
  new_df_pd['lower_chats'] = new_df_pd['list_of_chats'].apply(
      lambda chat: " ".join(token.lower() for token in chat.split()))

  # Summarise before filtering words.
  new_df_pd['summarized_text'] = new_df_pd['lower_chats'].apply(chat_summerizer)

  stop = set(stopwords.words('english'))
  new_df_pd['stopwords_removed'] = new_df_pd['summarized_text'].apply(
      lambda chat: " ".join(token for token in chat.split() if token not in stop))
  # regex=True is spelled out: recent pandas releases treat the pattern as a
  # literal string by default, which would leave punctuation in place.
  new_df_pd['stopwords_removed'] = new_df_pd['stopwords_removed'].str.replace(r'[^\w\s]', '', regex=True)

  # Frequent filler words. 'thank you' can never match a single
  # whitespace-separated token; it is kept so the list stays unchanged.
  freq = {'nickchange','u','ok','okay','hi','thx','hello','entertainer','thanks','thankyou','thank you'}
  new_df_pd['most_freq_removed'] = new_df_pd['stopwords_removed'].apply(
      lambda chat: " ".join(token for token in chat.split() if token not in freq))

  df_chats = pre_processing(new_df_pd,col_name='most_freq_removed')
  return df_chats
  

In [0]:
df_quest = prep_train_data()
# pre_processing() applies the same replacement; repeated here as a safeguard.
# regex=True is spelled out because recent pandas releases treat the pattern as
# a literal string by default, which would make this line a no-op.
df_quest['cleaned_text'] = df_quest['cleaned_text'].str.replace(r'[^\w\s]', '', regex=True)
df_quest.head()

In [0]:
# Build the pre-processed chat table (one row per ChatId) and preview it.
df_chats = prep_chat_data()
df_chats.head()

In [0]:
# Drop any rows whose processed text came out empty. Rows are selected by
# content rather than by hard-coded positions, so the cell gives the same
# result on other data and when it is re-run.
# NOTE(review): 'text_lemmatized' is assumed to be the feature column — confirm.
df_chats = df_chats[df_chats['text_lemmatized'].str.strip() != ''].reset_index(drop=True)
df_chats.head()

In [0]:
# KEEP FIRST ENTRY FROM EACH GROUP FOR TESTING
# One row per 'Categroy' value; reset_index() turns the group key back into a column.
# NOTE(review): GroupBy.first() picks the first non-null value per column, which can differ
# from the first row of a group when that row contains nulls — confirm this is acceptable.
df_test=df_quest.groupby('Categroy').first().reset_index()
df_test.head()

In [0]:
# Create a LabelEncoder instance
labelencoder = LabelEncoder()
# Fit on the test categories and store the resulting integer codes in a new 'labels' column
df_test['labels'] = labelencoder.fit_transform(df_test['Categroy'])
df_test.head()

In [0]:
# Training set: every row of df_quest except the first row of each 'Categroy' group.
held_out_index = df_quest.groupby(['Categroy']).head(1).index
df_train = df_quest.drop(index=held_out_index)
df_train.head()

In [0]:
# Create a LabelEncoder instance
labelencoder = LabelEncoder()
# Fit on every category in df_quest, not only those left in the training rows.
# A category with a single example exists only in the held-out rows; fitting on
# df_train alone would then shift the integer codes relative to the test labels.
labelencoder.fit(df_quest['Categroy'])
# Store the integer codes in a new 'labels' column
df_train['labels'] = labelencoder.transform(df_train['Categroy'])
df_train.head()

In [0]:
# Train & Test subsets
# The target is selected by name: 'Categroy' is column 0 of df_test (it comes
# from reset_index after the groupby), but nothing guarantees it is column 0 of
# df_train, so a positional lookup could pick a different column there.
# NOTE(review): the feature column is still taken by position (column 2) —
# confirm it is the same column in both frames.
X_train = df_train.iloc[:, 2].values
y_train = df_train['Categroy'].values.reshape(-1, 1)
X_test = df_test.iloc[:, 2].values
y_test = df_test['Categroy'].values.reshape(-1, 1)

In [0]:
# OHE the Categorical column
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.compose import ColumnTransformer

# sparse_threshold = 0 makes the transformer always return a dense array.
# With the default (0.3) the result becomes a sparse matrix once there are
# more than three categories, which is what the earlier `.todense()` attempts
# were working around.
ct = ColumnTransformer(transformers = [('one_hot_encoder', ohe(categories = 'auto'), [0])],
                       remainder = 'passthrough',
                       sparse_threshold = 0)

# Fit the encoding on the training targets only and reuse it for the test targets.
# NOTE: y_train / y_test are overwritten, so re-create them before re-running this cell.
y_train = ct.fit_transform(y_train)
y_test = ct.transform(y_test)

In [0]:
# Setting some parameters
# vocab_size is passed to the Keras Tokenizer (num_words) and to the Embedding layer (input_dim).
vocab_size = 2000
# sequence_length is the padded length of every question (pad_sequences maxlen, Embedding input_length).
sequence_length = 100

In [0]:
# Tokenization with Keras
from keras.preprocessing.text import Tokenizer

# The vocabulary is learned from the training texts only.
tk = Tokenizer(num_words = vocab_size)
tk.fit_on_texts(X_train)

# The token-id sequences get their own names so that X_train / X_test keep the
# raw texts and this cell can be re-run without fitting the tokenizer on ids.
X_train_tokens = tk.texts_to_sequences(X_train)
X_test_tokens = tk.texts_to_sequences(X_test)

# Padding all questions with zeros
from keras.preprocessing.sequence import pad_sequences

X_train_seq = pad_sequences(X_train_tokens, maxlen = sequence_length, padding = 'post')
X_test_seq = pad_sequences(X_test_tokens, maxlen = sequence_length, padding = 'post')

In [0]:
# Training the Embedding Layer & the Neural Network
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten

# One softmax unit per one-hot encoded category. The width is read from the
# target matrix instead of being hard-coded, so the model keeps matching the
# data when the number of categories changes.
num_classes = y_train.shape[1]

model = Sequential()
# Learn a 5-dimensional embedding for each of the vocab_size token ids.
model.add(Embedding(input_dim = vocab_size, output_dim = 5, input_length = sequence_length))
model.add(Flatten())

model.add(Dense(units = num_classes, activation = 'softmax'))

model.compile(loss = 'categorical_crossentropy',
              optimizer = 'rmsprop',
              metrics = ['accuracy'])

model.summary()

history = model.fit(X_train_seq, y_train, epochs = 20, batch_size = 512, verbose = 1)

# Save model once done training
#model.save("model.h5")