# Libraries and Data

In [1]:
#downloading and installing needed libraries

import pandas as pd
import re #regular expressions
import nltk #natural language processing 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
nltk.download('wordnet') 
nltk.download('stopwords')
nltk.download('punkt')

#Data
texts = pd.read_csv("quality_for_ai_output.csv")

[nltk_data] Downloading package wordnet to /Users/Jakub/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/Jakub/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/Jakub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Analysis 

In [3]:
#selecting only important variables of the dataset
texts = texts[["MSG_BODY","EMPATHY_RECODED"]]

#changing variable empathy_recoded into categorical variable
texts["EMPATHY_RECODED"] = texts["EMPATHY_RECODED"].astype("category")

#Droping all rows with missing observations (after we found there are NAs in EMPATHY_RECODED)
texts.dropna(inplace = True)
texts.reset_index(inplace = True,drop = True)
#texts = texts.dropna()
#counting number of observ. for each factor
print(texts["EMPATHY_RECODED"].value_counts())
#only 7 percent fails

#storing modified datafile
texts_or = texts.copy() #musi byt copy, lebo je to inak len pointer!!!

Pass    5419
Fail     405
Name: EMPATHY_RECODED, dtype: int64


# Text Preprocessing

In [4]:
texts_var = texts["MSG_BODY"].copy()

#reduce inflectional forms and sometimes derivationally related forms of a word to a common base form.
lemmatizer = WordNetLemmatizer()

for text in range(0,len(texts_var)):
    texts_var.iloc[text] = texts_var.iloc[text].lower()
    texts_var.iloc[text] = re.sub(r'[\n\xa0]','',str(texts_var.iloc[text])) #get rid of all \n and \xa0 
    #first is dear, then normally there's mr./mrs. or sthm like that and continues with thanking and possibly kiwi.com
    texts_var.iloc[text] = re.sub(r'dear[\w\s]*\.*[\w\s]*,*[\w\s]*[\.com]*\.*','',str(texts_var.iloc[text])) #look how to shorten the regex
    #last sentence (or potentially last two are always kind regards and name)
    texts_var.iloc[text] = re.sub(r'\.[\w\s,]*\.?[\w\s,]*travel consultant[\w\s\.]*','.',str(texts_var.iloc[text]))
    
    #Remove all the special characters  ... zatial nepouzivam lebo neviem co s bodkami
    #texts_var.iloc[text] = re.sub(r'\W', ' ', str(texts_var.iloc[text]))
    
    # remove all single characters
    texts_var.iloc[text] = re.sub(r'\s+[a-zA-Z]\s+', ' ', str(texts_var.iloc[text]))
    # Substituting multiple spaces with single space
    texts_var.iloc[text]= re.sub(r'\s+', ' ', str(texts_var.iloc[text]), flags=re.I)
    
    #najpv zrus emailove adresy, hyperlinky, kiwi.com, ()
    texts_var.iloc[text] = re.sub(r'\w*?.?\w*@','', str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'https://.*manage/','', str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'kiwi.com','', str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'\(.*?\)','', str(texts_var.iloc[text]))
    
    #remove all remainig ",",".",etc.
    texts_var.iloc[text] = re.sub(r'([^\s\w]|_)+', ' ', str(texts_var.iloc[text]))
    
    #following is deletion of digits... currently not being used
    #texts_var.iloc[text] = re.sub(r'[\d]+', ' ', str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'[\s\s]+', ' ', str(texts_var.iloc[text]))
    
    #randomly found problems - solved 
    #dividing of e-ticket
    texts_var.iloc[text] = re.sub(r'e ticket', 'eticket', str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'e mail', 'email', str(texts_var.iloc[text]))
    
    #Following is tried to help the performance of final model
    #20 eur issue, etc.
    texts_var.iloc[text] = re.sub(r'20 eur', '20eur', str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'20 euro', '20eur',str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'24 hours', '24hours', str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'48 hours', '48hours', str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'30 days', '30days', str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'24 7', 'nonstop', str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'5 star', '5star', str(texts_var.iloc[text]))
    texts_var.iloc[text] = re.sub(r'12 24hours', '12to24hours', str(texts_var.iloc[text])) #24hours bcs it is already modified above
    texts_var.iloc[text] = re.sub(r'36 hours', '36hours', str(texts_var.iloc[text]))
    
    #Stopwords (a, the, personal pronouns...) moc sa mi to nezda lebo sa tam stracaju nejake vyznamove prejenia
    stop_words = set(stopwords.words('english'))
    texts_var.iloc[text] = word_tokenize(texts_var.iloc[text]) 
    filtered_sentences = [w for w in texts_var.iloc[text] if not w in stop_words] 
    texts_var.iloc[text] = []
    for w in filtered_sentences: 
        if w not in stop_words: 
            texts_var.iloc[text].append(w) 
    
#lemmatizer gives totally bad results

In [6]:
# how many words have texts max
max_memory = 1
index = 1
lengths = [0] * len(texts_var)
for i in range(len(texts_var)):
    lengths[i] = len(texts_var[i])
    if lengths[i] > max_memory:
        max_memory = lengths[i]
        index = i
print("Max number of words is", max_memory)
print("The max number of words has word number", index)

pd.DataFrame(lengths).describe()

#deleting too long texts ... mostly texts in different language through automatic translations 

#saving dataframe of word lengths
lengths = pd.DataFrame(lengths)

#naming the only variable in the dataframe
lengths.columns = ['words']

#filtering only observations less than ...
indexes = lengths.loc[lengths['words'] < 300].index.values 

#updating texts_var
texts_or = texts_or.iloc[indexes,:]
texts_var = texts_var[indexes]

Max number of words is 904
The max number of words has word number 5297


# Extracting all fail empathy texts

In [1]:
file = open("fail_texts.txt","w")
fail_texts_indexes = texts_or[texts_or["EMPATHY_RECODED"]=="Fail"].index.values
for i in fail_texts_indexes:
    file.write(texts_or.iloc[i,0])
file.close() 


## Model

In [8]:
#Import libraries for tokenizing
import keras
from keras.preprocessing.text import Tokenizer

#assign numbers to each word from texts (max 5000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts_var)

#split the dataset into train and test
from sklearn.model_selection import train_test_split
train, test = train_test_split(texts_var, train_size = 0.7)

#tokenize train and test (lists are created)

X_train = tokenizer.texts_to_sequences(train)
X_test = tokenizer.texts_to_sequences(test)


vocab_size = len(tokenizer.word_index) + 1

#>>> print(sentences_train[2])
#>>> print(X_train[2])
#Of all the dishes, the salmon was the best, but all were great.
#[11, 43, 1, 171, 1, 283, 3, 1, 47, 26, 43, 24, 22]



# Exporting all the files used in further modelling

In [9]:
#import csv
#train.to_csv("train.csv",header = False,quoting = csv.QUOTE_NONNUMERIC)
#test.to_csv("test.csv",header = False,quoting = csv.QUOTE_NONNUMERIC)
#texts_or.to_csv("texts_or_shorter.csv",header = True,quoting = csv.QUOTE_NONNUMERIC)

train.to_pickle("train2.pkl")
test.to_pickle("test2.pkl")
texts_or.to_pickle("texts_or_shorter2.pkl")

In [None]:
texts_or.iloc[1295,:]

In [None]:
texts_var[1295]

In [8]:
#create labels to the test and train set
Y_train = texts_or["EMPATHY_RECODED"][train.index]

Y_test = texts_or["EMPATHY_RECODED"][test.index]
Y_test.head()

4810    Pass
4613    Fail
2478    Pass
5059    Pass
4137    Pass
Name: EMPATHY_RECODED, dtype: category
Categories (2, object): [Fail, Pass]

In [None]:
len(texts_or)

In [None]:
len(train)+len(test)

In [None]:
#Find out how distributed is empathy pass/fail in the train/test sample

#save train as dataframe
train = pd.DataFrame(train)

#store the original indexes from texts_or as a variable
train.reset_index(inplace = True)

#select only the indexes of train and count the #of observ. for each level
texts_or["EMPATHY_RECODED"][train.index].value_counts()


##the distribution is similar to the whole sample, analogically we tested the test file - same applies 

In [None]:
125/1856

In [9]:
#Setting maximum length and padding of tokenized texts (texts in form of numbers)
from keras.preprocessing.sequence import pad_sequences
maxlen = 100 #originally 300
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


In [None]:
#Help functions - tokenized corpus

#tokenizer.word_index
#print('{}: {}'.format('pocet', tokenizer.word_index["flight"]))

In [None]:
#Auxiliary functions

#checking how many sentences with thank you there are
x = 0
without_thankyou = []
for text in range(0,len(texts_var)):
    if re.search(r'thank you',str(texts_var.iloc[text])) != None :
        x = x+1
    else:
        if re.search(r'thanks',str(texts_var.iloc[text])) != None :
            x = x+1
        else:
            without_thankyou.append(text) 

print(len(without_thankyou)/len(texts_var))

In [None]:
storage_firstlines = []
#Check of all the first lines... whether the original regex is correct... seems ok
for text in range(0,len(texts_emphatic)):
    pattern = re.compile(r'[\w\s]*\.')
    storage_firstlines.append(pattern.search(texts_emphatic.iloc[text]))
    #search does not do what i wanted (showing the whole first lines), but serves the purpose (incorrect regex probably)
storage_firstlines

# Predictions

In [10]:
#We need Y_train and Y_test as binaries
#first create dictionary for Pass and FAil
convert = {"Pass":1, "Fail":0}
#now use method replace
Y_train.replace(convert,inplace = True)
Y_test.replace(convert,inplace = True)


In [11]:
from keras.models import Sequential
from keras import layers
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.callbacks import History 

input_dim = X_train.shape[1]  # Number of features

Y_test_bin = pd.DataFrame(to_categorical(Y_test))
Y_train_bin = pd.DataFrame(to_categorical(Y_train))

model = Sequential()
model.add(layers.Dense(900, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(300, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', 
               optimizer='adam', 
               metrics=['accuracy'])

model.summary()

history = model.fit(X_train, Y_train_bin,
                   batch_size = 10,
                   epochs=1000,
                   verbose=True,
                   validation_data=(X_test, Y_test_bin),
                   callbacks=[EarlyStopping(patience = 15, #restore_best_weights=True can be added
                                      monitor='acc')]
                   )








_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 900)               90900     
_________________________________________________________________
dense_2 (Dense)              (None, 300)               270300    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 602       
Total params: 361,802
Trainable params: 361,802
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 4014 samples, validate on 1721 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
 710/4014 [====>.........................] - ETA: 7s - loss: 0.99

KeyboardInterrupt: 

In [80]:
#Acc is around 93% which is however cause by unbalanced dataset --> all the targets are assigned 1s
#to check
#predict X_test once again
predictions = model.predict_classes(X_test)

#number of predicted observations
print(len(predictions))
#sum of predicted observations
print(sum(predictions))
#sum of original targets
print(sum(Y_test))

1748
0
1625


# Solving the problem of unbalanced dataset (with oversampling)

In [15]:
from keras.utils import to_categorical
Y_test_bin = pd.DataFrame(to_categorical(Y_test))
Y_train_bin = pd.DataFrame(to_categorical(Y_train))

In [16]:
#load function resample, which we use
from sklearn.utils import resample

# concatenate our training data back together
Y_train.reset_index(inplace = True,drop = True)
X_train_full = pd.DataFrame(X_train)
X_train_full["EMPATHY_RECODED"] = Y_train

#separate minority and majority classes
emp_pass = X_train_full[X_train_full.EMPATHY_RECODED == 1]
emp_fail = X_train_full[X_train_full.EMPATHY_RECODED == 0]

# upsample minority
emp_fail_upsampled = resample(emp_fail,
                          replace=True, # sample with replacement
                          n_samples=len(emp_pass), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([emp_pass, emp_fail_upsampled])

# check new class counts
print(upsampled.EMPATHY_RECODED.value_counts())

# Model

In [18]:
Y_train = pd.DataFrame(to_categorical(Y_train))

In [None]:
from keras.models import Sequential
from keras import layers
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.callbacks import History 

input_dim = X_train.shape[1]  # Number of features

Y_train = upsampled.EMPATHY_RECODED
X_train = upsampled.drop('EMPATHY_RECODED', axis=1)
Y_train = pd.DataFrame(to_categorical(Y_train))

model = Sequential()
model.add(layers.Dense(50000, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1000, input_dim=input_dim, activation='relu'))
#model.add(layers.Dense(50, input_dim=input_dim, activation='relu'))
#model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', 
               optimizer='adam', 
               metrics=['accuracy'])

model.summary()

history = model.fit(X_train, Y_train,
                   batch_size = 20,
                   epochs=1000,
                   verbose=True,
                   validation_data=(X_test, Y_test_bin),
                   callbacks=[EarlyStopping(patience = 5, #restore_best_weights=True can be added
                                      monitor='acc')]
                   )

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50000)             5050000   
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              50001000  
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 2002      
Total params: 55,053,002
Trainable params: 55,053,002
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 5668 samples, validate on 1322 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
 740/5668 [==>...........................] - ETA: 8:23 - loss: 7.8412 - acc: 0.5135

In [85]:
#Acc is around 93% which is however cause by unbalanced dataset --> all the targets are assigned 1s
#to check
#predict X_test once again
predictions2 = model.predict_classes(X_test)

#number of predicted observations
print(len(predictions2))
#sum of predicted observations
print(sum(predictions2))
#sum of original targets
print(sum(Y_test))

1374
1374
1264
