##### https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

In [36]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from spellchecker import SpellChecker
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import pandas as pd 
import nltk as nltk
import  xgboost, numpy, string
import datetime as dt
#from keras.preprocessing import text, sequence
#from keras import layers, models, optimizers

##### Functions

In [37]:
# Timer to check execution timing for each function call # 
def timer(start_time=None):
    if not start_time:
        start_time = dt.datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((dt.datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

# Spelling checker # 
def spell_correct(array):
    spell = SpellChecker()
    for i in range(len(array)):
        array[i] = spell.correction(array[i])
    return array
    
def stem(array):
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(w) for w in array]

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    
    # Fit the training dataset onto classifier #
    classifier.fit(feature_vector_train, label)
    
    # Predict the labels on validation dataset #
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

##### Data Prep 

In [38]:
# Data Prep# 
df = pd.read_csv("data/cleaned_hotelreviews.csv")

# Drop rows with null comments # 
df = df.dropna(how='any', axis=0)

# Make words case-insensitive # 
df = df.apply(lambda x: x.astype(str).str.lower())

# Remove punctuations if any # 
df["words_only"] = df['reviews'].str.replace('[^\w\s]','')

# Tokenization with NLTK # 
start_time = timer(None)
df['tokenized'] = df['words_only'].apply(nltk.word_tokenize)
print("\nTokenizer: ",end="")
timer(start_time)

# Spelling checker # : Replace incorrect words with correct words 
#start_time = timer(None)
#df['corrected'] = df['tokenized'].apply(spell_correct)
#print("\nSpelling Correction: ",end="")
#timer(start_time)

# Stemming with NLTK # 
start_time = timer(None)
df['stemmed'] = df['tokenized'].apply(stem)
print("\nStemming: ",end="")
timer(start_time)

# Turn arrays for each row in df['stemmed'] into a string #: Needed to run SkLearn Lib
df['stemmed'] = df['stemmed'].apply(" ".join)

# Train - Test Split # 
start_time = timer(None)
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['stemmed'], df['class'])
print("\nTrain-Test Split: ",end="")
timer(start_time)

# Label encode target variable to run ML models # 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)


Tokenizer: 
 Time taken: 0 hours 1 minutes and 38.58 seconds.

Stemming: 
 Time taken: 0 hours 4 minutes and 45.6 seconds.

Train-Test Split: 
 Time taken: 0 hours 0 minutes and 0.24 seconds.


##### Count vectorisation : Create vectors as features 
    # Every row represents a review 
    # Every column represents a term from the corpus 
    # Every cell represents the frequency count of the particular term in the particular review 

In [39]:
start_time = timer(None)

# Create count vectoriser object # 
count_vector = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vector.fit(df['stemmed'])

# Transform training and validation data # 
xtrain_count = count_vector.transform(train_x)
xvalid_count = count_vector.transform(valid_x)

print("Count Vectors:", end="")
timer(start_time)

Count Vectors:
 Time taken: 0 hours 0 minutes and 23.82 seconds.


### Convert to TF-IDF Vectors 


In [40]:
start_time = timer(None)

# Word Level TF-IDF #: Matrix represents tf-idf scores of every term in each review 
tfidf_word = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_word.fit(df['stemmed'])
xtrain_tfidf_word = tfidf_word.transform(train_x)
xvalid_tfidf_word = tfidf_word.transform(valid_x)

print("Word Level TF-IDF", end="")
timer(start_time)

Word Level TF-IDF
 Time taken: 0 hours 0 minutes and 24.14 seconds.


In [41]:
start_time = timer(None)

# Unigram Level TF-IDF #: Matrix represents tf-idf scores of unigram (all terms are separate)
tfidf_unigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,1), max_features=5000)
tfidf_unigram.fit(df['stemmed'])
xtrain_tfidf_unigram = tfidf_unigram.transform(train_x)
xvalid_tfidf_unigram = tfidf_unigram.transform(valid_x)

# Bigram Level TF-IDF #: Terms are grouped together by twos 
tfidf_bigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2), max_features=5000)
tfidf_bigram.fit(df['stemmed'])
xtrain_tfidf_bigram = tfidf_bigram.transform(train_x)
xvalid_tfidf_bigram = tfidf_bigram.transform(valid_x)

# Trigram Level TF-IDF #: Terms are grouped together in threes 
tfidf_trigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(3,3), max_features=5000)
tfidf_trigram.fit(df['stemmed'])
xtrain_tfidf_trigram = tfidf_trigram.transform(train_x)
xvalid_tfidf_trigram = tfidf_trigram.transform(valid_x)

print("Unigram, Bigram, Trigram TF-IDF:", end="")
timer(start_time)

Unigram, Bigram, Trigram TF-IDF:
 Time taken: 0 hours 1 minutes and 59.54 seconds.


In [42]:
start_time = timer(None)

# Character Level TF-IDF #: Matrix represents tf-idf scores of character level uni, bi & tri-gram of all reviews
tfidf_char = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(1,3), max_features=5000)
tfidf_char.fit(df['stemmed'])
xtrain_tfidf_char =  tfidf_char.transform(train_x) 
xvalid_tfidf_char =  tfidf_char.transform(valid_x) 

print("Character Level TF-IDF:", end="")
timer(start_time)

Character Level TF-IDF:
 Time taken: 0 hours 3 minutes and 7.4 seconds.


### LDA Model 

In [16]:
# Train LDA Model # 
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vector.get_feature_names()

# View Topic Models # 
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

topic_summaries

['a bit expens date it but worth imperson is bad',
 'facil reason attract accompani via more approach immacul extend defin',
 'beauti hotel and a with staff veri wonder friendli servic',
 'a in extrem park locat great far area help kept',
 'control temperatur condit arrog effort unwelcom idea climb wors shout',
 'understand bang stabl picadelli stylish citi worh heigh pr explor',
 'bit expens a with no valu should have includ price',
 'oak panel loung relax beauti a in bathroom linen bath',
 'dark sm each greenwich thame gbp kettl head luggag advantag',
 'plug couldnt pm housekeep joke suppli home a to move',
 'properli due doesn tricki sheet thick refil unless privaci characterist',
 'regular updat smartest key basi tuck thoroughli obvious hustl bustl',
 'vou ca deplacez va un voitur peu vill si en',
 'onsit a nice in would and better have been bath',
 'complaint newspap languag post give valu bad a english experi',
 'were queue promptli clinic everyon sever comput western fruit repla

In [17]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    
    # Fit the training dataset onto classifier #
    classifier.fit(feature_vector_train, label)
    
    # Predict the labels on validation dataset #
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

### NAIVE BAYES 

In [None]:
# Assumes indepedence among predictors # 
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print ("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_word, train_y, xvalid_tfidf_word)
print ("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Unigram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram)
print ("NB, Uni-Gram Vectors: ", accuracy)

# Naive Bayes on Bigram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram)
print ("NB, Bi-Gram Vectors: ", accuracy)

# Naive Bayes on Trigram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_trigram, train_y, xvalid_tfidf_trigram)
print ("NB, Tri-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_char, train_y, xvalid_tfidf_char)
print ("NB, CharLevel Vectors: ", accuracy)

## Linear Classifier (Logistic Regression) 

In [None]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print ("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_word, train_y, xvalid_tfidf_word)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Unigram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram)
print ("LR, Uni-Gram Vectors: ", accuracy)

# Linear Classifier on Bigram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram)
print ("LR, Bi-Gram Vectors: ", accuracy)

# Linear Classifier on Trigram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_trigram, train_y, xvalid_tfidf_trigram)
print ("LR, Tri-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_char, train_y, xvalid_tfidf_char)
print ("LR, CharLevel Vectors: ", accuracy)

## SVM Model

In [None]:
# SVM on Unigram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram)
print ("SVM, Uni-Gram Vectors: ", accuracy)

In [20]:
# Supervised ML Algo that extracts best possible hyper-plane/ line that segregates the two classes #

# SVM on Word Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_word, train_y, xvalid_tfidf_word)
print ("SVM, Word Level Vectors: ", accuracy)

# SVM on Unigram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram)
print ("SVM, Uni-Gram Vectors: ", accuracy)

# SVM on Bigram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram)
print ("SVM, Bi-Gram Vectors: ", accuracy)

# SVM on Trigram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_trigram, train_y, xvalid_tfidf_trigram)
print ("SVM, Tri-Gram Vectors: ", accuracy)
       
# SVM on Char Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_char, train_y, xvalid_tfidf_char)
print ("SVM, CharLevel Vectors: ", accuracy)



SVM, Word Level Vectors:  0.816
SVM, Uni-Gram Vectors:  0.816
SVM, Bi-Gram Vectors:  0.816
SVM, Tri-Gram Vectors:  0.816
SVM, CharLevel Vectors:  0.816


## Random Forest (Bagging)

In [None]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print ("RF, Count Vectors: ", accuracy)

# RF on Word Level Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_word, train_y, xvalid_tfidf_word)
print ("RF, Word Level Vectors: ", accuracy)

# RF on Unigram Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram)
print ("RF, Uni-gram TF-IDF: ", accuracy)

# RF on Bigram Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram)
print ("RF, Bi-gram TF-IDF: ", accuracy)

# RF on Trigram Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_trigram, train_y, xvalid_tfidf_trigram)
print ("RF, Tri-gram TF-IDF: ", accuracy)

# RF on Char Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_char, train_y, xvalid_tfidf_char)
print ("RF, CharLevel TF-IDF: ", accuracy)

## XGBoost (Boosting)

In [43]:
# Ran this with full data due to resource constraint :)))))))) #
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)

Xgb, Count Vectors:  0.9287622226378359


In [None]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_word.tocsc(), train_y, xvalid_tfidf_word.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Unigram Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_unigram.tocsc(), train_y, xvalid_tfidf_unigram.tocsc())
print ("Xgb, Unigram TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Bigram Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_bigram.tocsc(), train_y, xvalid_tfidf_bigram.tocsc())
print ("Xgb, Bigram TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Trigram Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_trigram.tocsc(), train_y, xvalid_tfidf_trigram.tocsc())
print ("Xgb, Trigram TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_char.tocsc(), train_y, xvalid_tfidf_char.tocsc())
print ("Xgb, CharLevel Vectors: ", accuracy)

##  Feature Importance XGB 
###### Can't plot important features if we only have one train column 

In [None]:
# Additional code for future reference # 
# Plot graph showing importance features, max = 50 features # 
importance = xgboost.XGBClassifier().feature_importances_ 
importance = pd.Series(importance, index=xtrain_count.columns)
importance.nlargest(50).plot(kind='barh')

# Select important features #
importance.sort_values(axis=0,ascending=False, inplace=True)
selected_features = importance.index[0:30].tolist()

In [44]:
# XGB Model # 
model = xgboost.XGBClassifier(max_depth=7,
                           min_child_weight=1,
                           learning_rate=0.2,
                           n_estimators=500,
                           silent=True,
                           objective='binary:logistic',
                           gamma=0,
                           max_delta_step=0,
                           subsample=1,
                           colsample_bytree=1,
                           colsample_bylevel=1,
                           reg_alpha=0,
                           reg_lambda=0,
                           scale_pos_weight=1,
                           seed=1,
                           missing=None,
                           tree_method='exact',
                           nthread=4)

# Params for hyperparameter grid search # 
params = {
        'max_depth': [5,7],
        'min_child_weight': [1, 5],
        'gamma': [0.5, 1]
        }

## Hyperparameter Grid Search with 3 Fold Validation

In [None]:
start_time = timer(None)

# 3 fold validation with hyperparameter grid search #
folds = 3
param_comb = 3

kf = KFold(n_splits=folds, shuffle = False, random_state = None)

grid = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', n_jobs=3, cv=kf.split(xtrain_count,train_y), 
                    verbose=3 )

start_time = timer(None)

grid.fit(xtrain_count,train_y)

print('\n Best estimator:')
print(grid.best_estimator_)
print('\n Best score:')
print(grid.best_score_)
print('\n Best parameters:')
print(grid.best_params_)

timer(start_time)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
