
## **Transformer-based models Implementation**
โมเดลหลักที่ใช้ในงานวิจัย\
BERT, ALBERT, RoBERTa, TF-IDF



#**Import Libraries**

In [None]:
# require GPU to run transformer model
import matplotlib
print(matplotlib.__version__)
!pip -q install torch==1.5.0 torchtext==0.4.0 torchvision==0.6.0
!pip -q install transformers==3.5.0

from transformers import (AutoTokenizer, AutoModel, pipeline, AutoModelForSequenceClassification)

In [None]:
# import necessary libraries
import pandas as pd
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)
import numpy as np
import tensorflow as tf

# SET PATH TO DATA FOLDER
path= "/content/drive/My Drive/Colab Notebooks/NLP_ITM/Research/"

import torch
# check GPU available?
torch.cuda.is_available()

In [None]:
# import the necessary libraries for dataset preparation, feature engineering, model training
from sklearn import model_selection, preprocessing, metrics, linear_model, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from imblearn.over_sampling import BorderlineSMOTE, SMOTE, ADASYN, SMOTENC, RandomOverSampler
from imblearn.under_sampling import (RandomUnderSampler, 
                                    NearMiss, 
                                    InstanceHardnessThreshold,
                                    CondensedNearestNeighbour,
                                    EditedNearestNeighbours,
                                    RepeatedEditedNearestNeighbours,
                                    AllKNN,
                                    NeighbourhoodCleaningRule,
                                    OneSidedSelection,
                                    TomekLinks)
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import make_pipeline
import pandas as pd, numpy, string
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer

# Remove Special Charactors
import re
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier

# import dataframe libraries
!pip install pyspark
!pip install koalas
import databricks.koalas as ks
from pyspark.sql import SparkSession
import seaborn as sns

# **Data Visualization**

In [None]:
ks_df = ks.from_pandas(pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP_ITM/Research/ratings_and_sentiments UTF-8.csv', encoding = 'utf8'))
ks_df.head()

In [None]:
# Word cloud Visualization
import matplotlib.pyplot as plt
import re
import gc
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
from gensim.models import KeyedVectors
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers, losses, optimizers

In [2]:
#Text preprocessing - Data cleaning
# English stop words, loaded from NLTK on first use and then reused.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word list as a set, reading it only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def text_preprocessing(text, for_vec_models=False):
    """Clean one review string.

    Both modes lower-case the text and replace every run of characters
    other than a-z with a single space.

    Args:
        text: raw review text.
        for_vec_models: when True, stop after the basic clean-up above.
            When False (default), additionally drop English stop words and
            reduce the remaining words to their Porter stems.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    text = re.sub('[^a-z]+', ' ', text.lower())

    if not for_vec_models:
        # Previously the stop-word corpus was re-read and a new stemmer was
        # built for every single word; both are now set up once per call
        # (and the stop-word set once per session).
        stop_words = _english_stop_words()
        stemmer = PorterStemmer()
        text = ' '.join(
            stemmer.stem(word) for word in text.split() if word not in stop_words
        )

    return text.strip()

In [None]:
#remove stop words and process text
import nltk
# The stop-word corpus must be present before text_preprocessing is called.
nltk.download('stopwords')

texts = ks_df.review_text.apply(text_preprocessing)

# Clean every review and store the result as a new column.
texts_new = [text_preprocessing(review) for review in ks_df.review_text]

ks_df['text_clean'] = texts_new
ks_df.head()

In [None]:
# Number of whitespace-separated words in each cleaned review.
text_len = [len(cleaned.split()) for cleaned in ks_df.text_clean]

# Store the word counts as a new column.
ks_df['text_len'] = text_len

In [None]:
# word cloud and word len plotting
# High Sentiment class

# Select the HIGH-rated reviews once and reuse the subset for both plots.
high_reviews = ks_df[ks_df["cat_rating"]=="HIGH"]

# Histogram of cleaned-review lengths plus its summary statistics.
plt.figure(figsize=(14,7))
sns.histplot(high_reviews["text_len"],color="salmon")
plt.title("Distribution of Review text length for HIGH")
display(high_reviews["text_len"].describe())


from wordcloud import WordCloud
# Word cloud built from all cleaned HIGH reviews joined into one string.
high_corpus = " ".join(high_reviews.text_clean)
plt.figure(figsize=(20,20))
wc = WordCloud(max_words=2000,min_font_size=10, height=800,width=1600,
               background_color="white").generate(high_corpus)
plt.imshow(wc)



In [None]:
# word cloud and word len plotting
# Low Sentiment class

# Select the LOW-rated reviews once and reuse the subset for both plots.
low_reviews = ks_df[ks_df["cat_rating"]=="LOW"]

# Histogram of cleaned-review lengths plus its summary statistics.
plt.figure(figsize=(14,7))
sns.histplot(low_reviews["text_len"],color="salmon")
plt.title("Distribution of Review text length for LOW")
display(low_reviews["text_len"].describe())

from wordcloud import WordCloud
# Word cloud built from all cleaned LOW reviews joined into one string.
low_corpus = " ".join(low_reviews.text_clean)
plt.figure(figsize=(20,20))
wc = WordCloud(max_words=2000,min_font_size=10, height=800,width=1600,
               background_color="white").generate(low_corpus)
plt.imshow(wc)

In [None]:
# word cloud and word len plotting
# Overall dataset

from wordcloud import WordCloud

# Word cloud over every cleaned review, regardless of class.
full_corpus = " ".join(ks_df.text_clean)
plt.figure(figsize=(20,20))
wc = WordCloud(max_words=2000,min_font_size=10, height=800,width=1600,
               background_color="white").generate(full_corpus)
plt.imshow(wc)

#**Data Preparation**

In [None]:
# import necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# load the data set
data = pd.read_csv(path + '/ratings_and_sentiments UTF-8.csv')

# DataFrame.info() writes its summary itself and returns None, so it is
# called directly; wrapping it in print() added a stray "None" line.
data.info()

In [None]:
# Class balance of the raw data.
# The two bar charts used to be drawn onto the same implicit axes, so the
# second one was painted over the first; each now gets its own subplot.
fig, (ax_binary, ax_stars) = plt.subplots(1, 2, figsize=(12, 4))

# High and Low classes
data['bool_HIGH'].value_counts().plot.bar(ax=ax_binary, title='bool_HIGH')

# 1-5 star classes
data['num_rating'].value_counts().plot.bar(ax=ax_stars, title='num_rating')

In [None]:
# Imbalanced-data handling: random over-sampling of the minority class.

# Each sample is a (review_text, coffee_shop_name) pair.
data_zip = list(zip(data['review_text'], data['coffee_shop_name']))

# `sampling_strategy=` with no value was a syntax error; 'auto' is the
# library default for this sampler.
ros = RandomOverSampler(random_state=0, sampling_strategy='auto')
ros_x, ros_y = ros.fit_resample(data_zip, data['bool_HIGH'])

# Keep only the review text (first element of each resampled pair).
ros_data2 = [row[0] for row in ros_x]

ros_data = pd.DataFrame(list(zip(ros_data2, ros_y)),
               columns =['review_text', 'bool_HIGH'])

# Check the class balance after resampling.
ros_data['bool_HIGH'].value_counts().plot.bar()
# info() prints its own summary and returns None, so it is not wrapped in print().
ros_data.info()

#**Preprocessing and Feature Extraction**

## **BERT**

In [None]:
#Import BERT model and tokenizer

from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch
import gc

# Use the GPU when one is present; otherwise fall back to the CPU instead of
# failing on the hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
  # Make newly created tensors live on the GPU, as the cells below expect.
  torch.set_default_tensor_type('torch.cuda.FloatTensor')

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# output_hidden_states=True makes the model return every layer's hidden states.
bert_model = BertModel.from_pretrained("bert-base-multilingual-cased", output_hidden_states=True)
bert_model = bert_model.to(device)

In [None]:
# Kept so that any other cell still reading these module-level tensors keeps
# working; adjust_encoded_input itself no longer depends on them.
a = torch.Tensor([102, 101]).long()
c = torch.Tensor([1] * 2).long()


def adjust_encoded_input(encoded_input, chunk_size=510, pad_token_id=0):
  """Re-pack one tokenised text into fixed-length rows for the model.

  The first and last token of the encoded text (the special tokens added by
  the tokenizer) are removed, the remaining tokens are cut into pieces of at
  most ``chunk_size``, and every piece is wrapped in those same two special
  tokens again and right-padded to ``chunk_size + 2`` positions.

  Args:
    encoded_input: mapping with 'input_ids' and 'attention_mask' tensors;
      only the first row of each is used.
    chunk_size: content tokens per row (510 plus two special tokens gives
      the 512-position rows used throughout this notebook).
    pad_token_id: id written into the unused tail of a row (0, the value
      the original code padded with).

  Returns:
    dict with 'input_ids' (int64) and 'attention_mask' (int32), each shaped
    (number_of_chunks, chunk_size + 2).
  """
  ids = encoded_input['input_ids'][0]
  mask = encoded_input['attention_mask'][0]

  # Reuse the special tokens the tokenizer produced; slicing (not indexing)
  # keeps them as 1-element tensors on the same device as the input.
  first_token, last_token = ids[:1], ids[-1:]
  row_len = chunk_size + 2

  id_rows, mask_rows = [], []
  for id_chunk, mask_chunk in zip(ids[1:-1].split(chunk_size),
                                  mask[1:-1].split(chunk_size)):
    pad_len = row_len - (id_chunk.shape[0] + 2)

    # new_full / new_ones / new_zeros inherit dtype and device from the
    # chunk, so this works whatever the global default tensor type is.
    id_rows.append(torch.cat([
        first_token,
        id_chunk,
        last_token,
        id_chunk.new_full((pad_len,), pad_token_id),
    ]))
    mask_rows.append(torch.cat([
        mask_chunk.new_ones(1),       # leading special token
        mask_chunk,
        mask_chunk.new_ones(1),       # trailing special token
        mask_chunk.new_zeros(pad_len),
    ]))

  return {
      'input_ids': torch.stack(id_rows).long(),
      'attention_mask': torch.stack(mask_rows).int(),
  }

Tokens are converted to tensors, and padding is then applied so that every chunk is exactly 512 tokens long.

In [None]:
#Padding sample

text = "11/13/2016 Beautiful eccentric coffee shop with a library of peculiar books.  Swings, couches, and pillow corners for cuddle puddles.  Coffee with mint and ways you've never imagined coffee could be made. Try the matcha green tea with soy, creamiest matcha I've ever had.  First time here and already my favorite coffee bar so far. See all photos from Vicki Y. for The Factory - Cafe With a Soul"

encoded_input = bert_tokenizer(text, return_tensors='pt').to(device)
encoded_input = adjust_encoded_input(encoded_input)
encoded_input['input_ids'], encoded_input['input_ids'].size()

In [None]:
import copy

def extract_last_four_with_bert(input_text, feature_extractor, mask_padding=True):
  """Embed one text with the last four hidden layers of a BERT model.

  The text is tokenised, packed into 512-position chunks by
  adjust_encoded_input, and run through ``feature_extractor``. For every
  chunk the last four hidden layers are concatenated along the feature axis
  and averaged over the token axis; the per-chunk vectors are then summed.

  Args:
    input_text: raw text to embed.
    feature_extractor: model created with output_hidden_states=True.
    mask_padding: when True (default) the token average ignores padded
      positions, using the attention mask. Pass False to reproduce the
      earlier behaviour, which averaged over all positions of a chunk,
      padding included.

  Returns:
    1-D float64 numpy array (4 x hidden size of the model).
  """
  encoded_input = bert_tokenizer(input_text, return_tensors='pt').to(device)
  encoded_input = adjust_encoded_input(encoded_input)

  # Inference only: skip autograd bookkeeping to save memory.
  with torch.no_grad():
    outputs = feature_extractor(**encoded_input)
  # Positional access instead of unpacking exactly three values.
  hidden_states = outputs[2]

  # Concatenate the last four layers along the feature axis.
  cat_hidden_states = torch.cat([hidden_states[i] for i in (-1, -2, -3, -4)], dim=-1)

  if mask_padding:
    mask = encoded_input['attention_mask'].unsqueeze(-1).to(cat_hidden_states.dtype)
    # Average over real tokens only; clamp guards against an all-zero mask.
    chunk_embeddings = (cat_hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
  else:
    chunk_embeddings = cat_hidden_states.mean(dim=1)

  # Summing over the chunk axis covers one chunk and many chunks alike,
  # replacing the old squeeze() + `shape[0] != 3072` test.
  doc_embedding = chunk_embeddings.sum(dim=0)

  return doc_embedding.cpu().numpy().astype('float64')

In [None]:
text = "11/13/2016 Beautiful eccentric coffee shop with a library of peculiar books.  Swings, couches, and pillow corners for cuddle puddles.  Coffee with mint and ways you've never imagined coffee could be made. Try the matcha green tea with soy, creamiest matcha I've ever had.  First time here and already my favorite coffee bar so far. See all photos from Vicki Y. for The Factory - Cafe With a Soul"

t_2 = extract_last_four_with_bert(text, bert_model)
t_2

In [None]:
 len(t_2)

## **RoBERTa**

In [None]:
#Import Roberta model and tokenizer
#active GPU

# Imports come first so the cell no longer uses `torch` before importing it.
from transformers import RobertaTokenizer, RobertaModel
import torch
import gc

# Use the GPU when one is present; otherwise fall back to the CPU instead of
# failing on the hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
  # Make newly created tensors live on the GPU, as the cells below expect.
  torch.set_default_tensor_type('torch.cuda.FloatTensor')

Roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# output_hidden_states=True makes the model return every layer's hidden states.
Roberta_model = RobertaModel.from_pretrained("roberta-base", output_hidden_states=True)
Roberta_model = Roberta_model.to(device)

In [None]:
def adjust_encoded_input_roberta(encoded_input, chunk_size=510, pad_token_id=1):
  """Re-pack one RoBERTa-tokenised text into fixed-length rows for the model.

  The first and last token of the encoded text (the special tokens added by
  the tokenizer) are removed, the remaining tokens are cut into pieces of at
  most ``chunk_size``, and every piece is wrapped in those same two special
  tokens again and right-padded to ``chunk_size + 2`` positions.

  The previous version wrapped every chunk with ids 101 / 102 and padded
  with 0. Those are the BERT special tokens, not the ones the RoBERTa
  tokenizer emits, so the model saw ordinary vocabulary items where it
  expects its start and end markers.

  Args:
    encoded_input: mapping with 'input_ids' and 'attention_mask' tensors;
      only the first row of each is used.
    chunk_size: content tokens per row (510 plus two special tokens gives
      the 512-position rows used throughout this notebook).
    pad_token_id: id written into the unused tail of a row; 1 is the
      padding id of the roberta-base vocabulary.

  Returns:
    dict with 'input_ids' (int64) and 'attention_mask' (int32), each shaped
    (number_of_chunks, chunk_size + 2).
  """
  ids = encoded_input['input_ids'][0]
  mask = encoded_input['attention_mask'][0]

  # Reuse the special tokens the tokenizer produced; slicing (not indexing)
  # keeps them as 1-element tensors on the same device as the input.
  first_token, last_token = ids[:1], ids[-1:]
  row_len = chunk_size + 2

  id_rows, mask_rows = [], []
  for id_chunk, mask_chunk in zip(ids[1:-1].split(chunk_size),
                                  mask[1:-1].split(chunk_size)):
    pad_len = row_len - (id_chunk.shape[0] + 2)

    # new_full / new_ones / new_zeros inherit dtype and device from the
    # chunk, so this works whatever the global default tensor type is.
    id_rows.append(torch.cat([
        first_token,
        id_chunk,
        last_token,
        id_chunk.new_full((pad_len,), pad_token_id),
    ]))
    mask_rows.append(torch.cat([
        mask_chunk.new_ones(1),       # leading special token
        mask_chunk,
        mask_chunk.new_ones(1),       # trailing special token
        mask_chunk.new_zeros(pad_len),
    ]))

  return {
      'input_ids': torch.stack(id_rows).long(),
      'attention_mask': torch.stack(mask_rows).int(),
  }

In [None]:
text = "11/13/2016 Beautiful eccentric coffee shop with a library of peculiar books.  Swings, couches, and pillow corners for cuddle puddles.  Coffee with mint and ways you've never imagined coffee could be made. Try the matcha green tea with soy, creamiest matcha I've ever had.  First time here and already my favorite coffee bar so far. See all photos from Vicki Y. for The Factory - Cafe With a Soul"

encoded_input = Roberta_tokenizer(text, return_tensors='pt').to(device)
encoded_input = adjust_encoded_input_roberta(encoded_input)
encoded_input['input_ids'], encoded_input['input_ids'].size()

In [None]:
import copy

def extract_last_four_with_roberta(input_text, feature_extractor, mask_padding=True):
  """Embed one text with the last four hidden layers of a RoBERTa model.

  The text is tokenised, packed into 512-position chunks by
  adjust_encoded_input_roberta, and run through ``feature_extractor``. For
  every chunk the last four hidden layers are concatenated along the
  feature axis and averaged over the token axis; the per-chunk vectors are
  then summed.

  Args:
    input_text: raw text to embed.
    feature_extractor: model created with output_hidden_states=True.
    mask_padding: when True (default) the token average ignores padded
      positions, using the attention mask. Pass False to reproduce the
      earlier behaviour, which averaged over all positions of a chunk,
      padding included.

  Returns:
    1-D float64 numpy array (4 x hidden size of the model).
  """
  encoded_input = Roberta_tokenizer(input_text, return_tensors='pt').to(device)
  encoded_input = adjust_encoded_input_roberta(encoded_input)

  # Inference only: skip autograd bookkeeping to save memory.
  with torch.no_grad():
    outputs = feature_extractor(**encoded_input)
  # Positional access instead of unpacking exactly three values.
  hidden_states = outputs[2]

  # Concatenate the last four layers along the feature axis.
  cat_hidden_states = torch.cat([hidden_states[i] for i in (-1, -2, -3, -4)], dim=-1)

  if mask_padding:
    mask = encoded_input['attention_mask'].unsqueeze(-1).to(cat_hidden_states.dtype)
    # Average over real tokens only; clamp guards against an all-zero mask.
    chunk_embeddings = (cat_hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
  else:
    chunk_embeddings = cat_hidden_states.mean(dim=1)

  # Summing over the chunk axis covers one chunk and many chunks alike,
  # replacing the old squeeze() + `shape[0] != 3072` test.
  doc_embedding = chunk_embeddings.sum(dim=0)

  return doc_embedding.cpu().numpy().astype('float64')

In [None]:
text = "11/13/2016 Beautiful eccentric coffee shop with a library of peculiar books.  Swings, couches, and pillow corners for cuddle puddles.  Coffee with mint and ways you've never imagined coffee could be made. Try the matcha green tea with soy, creamiest matcha I've ever had.  First time here and already my favorite coffee bar so far. See all photos from Vicki Y. for The Factory - Cafe With a Soul"

t_2 = extract_last_four_with_roberta(text, Roberta_model)
t_2

In [None]:
len(t_2)

## **ALBERT**

In [None]:
# Import ALBERT model and tokenizer
# active GPU
# Imports come first so the cell no longer uses `torch` before importing it.
from transformers import AlbertTokenizer, AlbertModel, AlbertConfig
import torch
import gc

# Use the GPU when one is present; otherwise fall back to the CPU instead of
# failing on the hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
  # Make newly created tensors live on the GPU, as the cells below expect.
  torch.set_default_tensor_type('torch.cuda.FloatTensor')

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
# output_hidden_states=True makes the model return every layer's hidden states.
albert_model = AlbertModel.from_pretrained('albert-base-v2', output_hidden_states=True)
albert_model = albert_model.to(device)

In [None]:
def adjust_encoded_input_albert(encoded_input, chunk_size=510, pad_token_id=0):
  """Re-pack one ALBERT-tokenised text into fixed-length rows for the model.

  The first and last token of the encoded text (the special tokens added by
  the tokenizer) are removed, the remaining tokens are cut into pieces of at
  most ``chunk_size``, and every piece is wrapped in those same two special
  tokens again and right-padded to ``chunk_size + 2`` positions.

  The previous version wrapped every chunk with ids 101 / 102. Those are
  the BERT special tokens, not the ones the ALBERT tokenizer emits, so the
  model saw ordinary vocabulary items where it expects its start and end
  markers.

  Args:
    encoded_input: mapping with 'input_ids' and 'attention_mask' tensors;
      only the first row of each is used.
    chunk_size: content tokens per row (510 plus two special tokens gives
      the 512-position rows used throughout this notebook).
    pad_token_id: id written into the unused tail of a row (0, the value
      the original code padded with).

  Returns:
    dict with 'input_ids' (int64) and 'attention_mask' (int32), each shaped
    (number_of_chunks, chunk_size + 2).
  """
  ids = encoded_input['input_ids'][0]
  mask = encoded_input['attention_mask'][0]

  # Reuse the special tokens the tokenizer produced; slicing (not indexing)
  # keeps them as 1-element tensors on the same device as the input.
  first_token, last_token = ids[:1], ids[-1:]
  row_len = chunk_size + 2

  id_rows, mask_rows = [], []
  for id_chunk, mask_chunk in zip(ids[1:-1].split(chunk_size),
                                  mask[1:-1].split(chunk_size)):
    pad_len = row_len - (id_chunk.shape[0] + 2)

    # new_full / new_ones / new_zeros inherit dtype and device from the
    # chunk, so this works whatever the global default tensor type is.
    id_rows.append(torch.cat([
        first_token,
        id_chunk,
        last_token,
        id_chunk.new_full((pad_len,), pad_token_id),
    ]))
    mask_rows.append(torch.cat([
        mask_chunk.new_ones(1),       # leading special token
        mask_chunk,
        mask_chunk.new_ones(1),       # trailing special token
        mask_chunk.new_zeros(pad_len),
    ]))

  return {
      'input_ids': torch.stack(id_rows).long(),
      'attention_mask': torch.stack(mask_rows).int(),
  }

In [None]:
text = "11/13/2016 Beautiful eccentric coffee shop with a library of peculiar books.  Swings, couches, and pillow corners for cuddle puddles.  Coffee with mint and ways you've never imagined coffee could be made. Try the matcha green tea with soy, creamiest matcha I've ever had.  First time here and already my favorite coffee bar so far. See all photos from Vicki Y. for The Factory - Cafe With a Soul"

encoded_input = albert_tokenizer(text, return_tensors='pt').to(device)
encoded_input = adjust_encoded_input_albert(encoded_input)
encoded_input['input_ids'], encoded_input['input_ids'].size()

In [None]:
import copy

def extract_last_four_with_albert(input_text, feature_extractor, mask_padding=True):
  """Embed one text with the last four hidden layers of an ALBERT model.

  The text is tokenised, packed into 512-position chunks by
  adjust_encoded_input_albert, and run through ``feature_extractor``. For
  every chunk the last four hidden layers are concatenated along the
  feature axis and averaged over the token axis; the per-chunk vectors are
  then summed.

  Args:
    input_text: raw text to embed.
    feature_extractor: model created with output_hidden_states=True.
    mask_padding: when True (default) the token average ignores padded
      positions, using the attention mask. Pass False to reproduce the
      earlier behaviour, which averaged over all positions of a chunk,
      padding included.

  Returns:
    1-D float64 numpy array (4 x hidden size of the model).
  """
  encoded_input = albert_tokenizer(input_text, return_tensors='pt').to(device)
  encoded_input = adjust_encoded_input_albert(encoded_input)

  # Inference only: skip autograd bookkeeping to save memory.
  with torch.no_grad():
    outputs = feature_extractor(**encoded_input)
  # Positional access instead of unpacking exactly three values.
  hidden_states = outputs[2]

  # Concatenate the last four layers along the feature axis.
  cat_hidden_states = torch.cat([hidden_states[i] for i in (-1, -2, -3, -4)], dim=-1)

  if mask_padding:
    mask = encoded_input['attention_mask'].unsqueeze(-1).to(cat_hidden_states.dtype)
    # Average over real tokens only; clamp guards against an all-zero mask.
    chunk_embeddings = (cat_hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
  else:
    chunk_embeddings = cat_hidden_states.mean(dim=1)

  # Summing over the chunk axis covers one chunk and many chunks alike,
  # replacing the old squeeze() + `shape[0] != 3072` test.
  doc_embedding = chunk_embeddings.sum(dim=0)

  return doc_embedding.cpu().numpy().astype('float64')

In [None]:
text = "11/13/2016 Beautiful eccentric coffee shop with a library of peculiar books.  Swings, couches, and pillow corners for cuddle puddles.  Coffee with mint and ways you've never imagined coffee could be made. Try the matcha green tea with soy, creamiest matcha I've ever had.  First time here and already my favorite coffee bar so far. See all photos from Vicki Y. for The Factory - Cafe With a Soul"

t_2 = extract_last_four_with_albert(text, albert_model)
t_2

Declare a new DataFrame

In [None]:
ks_da_df = pd.read_csv(path + '/ratings_and_sentiments UTF-8.csv')
#ks_df = pd.read_csv(path + '/sentiments_by_shop.csv')

# **Model output extraction** - Extract last four layers

In [None]:
albert_vectors = []
bert_vectors = []
roberta_vectors = []

# ros_data comes from random over-sampling, so the same review text can
# occur many times. Each distinct text is embedded once and the result is
# reused, instead of running three transformer passes per duplicate row.
embedding_cache = {}

for idx, row in ros_data.iterrows():

  text = row['review_text']

  if text not in embedding_cache:
    embedding_cache[text] = (
        extract_last_four_with_albert(text, albert_model),
        extract_last_four_with_bert(text, bert_model),
        extract_last_four_with_roberta(text, Roberta_model),
    )
  albert_vector, bert_vector, roberta_vector = embedding_cache[text]

  # Copy so that duplicate rows do not share one array object.
  albert_vectors.append(albert_vector.copy())
  bert_vectors.append(bert_vector.copy())
  roberta_vectors.append(roberta_vector.copy())

  # Progress indicator: index label of the row just processed.
  print(idx)


ros_data['content_bert_vector'] = bert_vectors
ros_data['content_albert_vector'] = albert_vectors
ros_data['content_roberta_vector'] = roberta_vectors

# Persist the frame with its embedding columns for later cells.
ros_data.to_pickle("/content/drive/MyDrive/Colab Notebooks/NLP_ITM/Research/ratings_and_sentiments UTF-8.pkl")

The last four layers are extracted once and saved to a pickle file, so the embeddings are ready to use later without spending the time to run the transformer models again.

# **Model Evaluation**

# Import saved file from pickle

In [None]:
import pickle

# Open through a context manager so the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook.
with open("/content/drive/MyDrive/Colab Notebooks/NLP_ITM/Research/ratings_and_sentiments UTF-8.pkl", "rb") as pickle_file:
  ros_data = pickle.load(pickle_file)

# **Logistic Regression**




In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Work on the oversampled frame that carries the embedding columns.
ks_selected_df = ros_data

# 80/20 train/test split, seeded so it is repeatable.
# NOTE(review): ros_data was oversampled before this split, so copies of the
# same review may land in both partitions - confirm this is intended.
ks_df_train, ks_df_test, ks_df_y_train, ks_df_y_test = train_test_split(
    ks_selected_df,
    list(ks_selected_df['bool_HIGH']),
    test_size=0.2,
    random_state=0,
)

# One entry per language model: training frame, embedding column, report label.
task = {
    key: {'data': ks_df_train, 'col': column, 'language_model': label}
    for key, column, label in (
        ("ks_df", 'content_bert_vector', 'BERT (LR)'),
        ("kh_df", 'content_albert_vector', 'ALBERT (LR)'),
        ("ka_df", 'content_roberta_vector', 'RoBERTa (LR)'),
    )
}

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report

is_manual = True

# Fit and score one logistic-regression model per embedding column in `task`.
for i in task:

  col = task[i]['col']
  X = list(task[i]['data'][col])
  y = list(task[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520

  if is_manual:

    logreg_model = LogisticRegression(max_iter=max_num_iter, random_state=0,multi_class='multinomial')

    logreg_model.fit(X, y)
    y_pred = logreg_model.predict(X_test)
    print(task[i]['language_model'])
    # classification_report takes (y_true, y_pred). The arguments used to be
    # swapped, which exchanges precision and recall in the printed table.
    print(classification_report(y_test, y_pred, digits = 4))

    print('\n')

## **Logistic Regression HPOs**

Grid search - LR

In [None]:
LR_parameters ={
    'C': [10,20,30]} 

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
from sklearn.model_selection import GridSearchCV

is_manual = True

# Grid-search LR_parameters separately for each embedding column in `task`.
for i in task:

  col = task[i]['col']
  X = list(task[i]['data'][col])
  y = list(task[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520

  if is_manual:
    # max_num_iter was assigned but never passed on, so the tuned models ran
    # with the library's default iteration limit, unlike the baseline cell.
    clf = LogisticRegression(max_iter=max_num_iter)
    model = GridSearchCV(clf, param_grid=LR_parameters, cv=4, scoring='accuracy',error_score=0, n_jobs=-1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)

    # Label each report and show its best parameters here: after the loop,
    # `model` only refers to the last language model.
    print(task[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))

In [None]:
model.best_params_

Random Search - LR

In [None]:
 %%time
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
from sklearn.model_selection import RandomizedSearchCV

is_manual = True

# Randomised search over LR_parameters for each embedding column in `task`.
for i in task:

  col = task[i]['col']
  X = list(task[i]['data'][col])
  y = list(task[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520

  if is_manual:
    # max_num_iter was assigned but never passed on, so the tuned models ran
    # with the library's default iteration limit, unlike the baseline cell.
    clf = LogisticRegression(max_iter=max_num_iter)
    model = RandomizedSearchCV(clf, LR_parameters, n_iter = 10, cv = 4, verbose= 1, random_state= 101, n_jobs = -1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)

    # Label each report and show its best parameters here: after the loop,
    # `model` only refers to the last language model.
    print(task[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))

In [None]:
model.best_params_

# **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Work on the oversampled frame that carries the embedding columns.
ks_selected_df = ros_data

# 80/20 train/test split, seeded so it is repeatable.
ks_df_train, ks_df_test, ks_df_y_train, ks_df_y_test = train_test_split(
    ks_selected_df,
    list(ks_selected_df['bool_HIGH']),
    test_size=0.2,
    random_state=0,
)

# One entry per language model: training frame, embedding column, report label.
task2 = {
    key: {'data': ks_df_train, 'col': column, 'language_model': label}
    for key, column, label in (
        ("ks_df", 'content_bert_vector', 'BERT (RF)'),
        ("kh_df", 'content_albert_vector', 'ALBERT (RF)'),
        ("ka_df", 'content_roberta_vector', 'RoBERTa (RF)'),
    )
}

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
from sklearn.model_selection import GridSearchCV

is_manual = True

# Fit and score one random forest per embedding column in `task2`.
for i in task2:

  col = task2[i]['col']
  X = list(task2[i]['data'][col])
  y = list(task2[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520

  if is_manual:
    # Seeded like the split and the logistic-regression baseline, so
    # repeated runs report the same numbers.
    RF_model = RandomForestClassifier(random_state=0)
    RF_model.fit(X, y)
    y_pred = RF_model.predict(X_test)
    print(task2[i]['language_model'])
    # classification_report takes (y_true, y_pred). The arguments used to be
    # swapped, which exchanges precision and recall in the printed table.
    print(classification_report(y_test, y_pred, digits = 4))

    print('\n')

## **Random Forest HPOs**

Grid search - RF

In [None]:
RF_parameters ={
    'max_depth': [15,25],
    'min_samples_split': [5,10],
    'n_estimators': [200]}

In [None]:
 %%time
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
from sklearn.model_selection import GridSearchCV

is_manual = True

# Grid-search RF_parameters separately for each embedding column in `task2`.
for i in task2:

  col = task2[i]['col']
  X = list(task2[i]['data'][col])
  y = list(task2[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520

  if is_manual:
    # Seeded so repeated runs select the same parameters and scores.
    clf = RandomForestClassifier(random_state=0)
    model = GridSearchCV(clf, RF_parameters, cv=4, scoring='accuracy',n_jobs=-1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)

    # Label each report and show its best parameters here: after the loop,
    # `model` only refers to the last language model.
    print(task2[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))


In [None]:
model.best_params_

Random search - RF

In [None]:
RF_random ={
    'max_depth': [35,45],
    'min_samples_split': [5],
    'n_estimators': [200]}

In [None]:
 %%time
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
from sklearn.model_selection import RandomizedSearchCV

is_manual = True

# Randomised search over RF_random for each embedding column in `task2`.
for i in task2:

  col = task2[i]['col']
  X = list(task2[i]['data'][col])
  y = list(task2[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520

  if is_manual:
    # Seeded so repeated runs select the same parameters and scores.
    clf = RandomForestClassifier(random_state=0)
    model = RandomizedSearchCV(estimator = clf, param_distributions = RF_random, n_iter = 10, cv = 4, verbose= 1, random_state= 101, n_jobs = -1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)

    # Label each report and show its best parameters here: after the loop,
    # `model` only refers to the last language model.
    print(task2[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))
    

In [None]:
model.best_params_

# **Support Vector Machine**

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Work on the oversampled frame that carries the embedding columns.
ks_selected_df = ros_data

# 80/20 train/test split, seeded so it is repeatable.
ks_df_train, ks_df_test, ks_df_y_train, ks_df_y_test = train_test_split(
    ks_selected_df,
    list(ks_selected_df['bool_HIGH']),
    test_size=0.2,
    random_state=0,
)

# One entry per language model: training frame, embedding column, report label.
task3 = {
    key: {'data': ks_df_train, 'col': column, 'language_model': label}
    for key, column, label in (
        ("ks_df", 'content_bert_vector', 'BERT (SVM)'),
        ("kh_df", 'content_albert_vector', 'ALBERT (SVM)'),
        ("ka_df", 'content_roberta_vector', 'RoBERTa (SVM)'),
    )
}

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report

is_manual = True

# Fit and score one linear SVM per embedding column in `task3`.
for i in task3:

  col = task3[i]['col']
  X = list(task3[i]['data'][col])
  y = list(task3[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520

  if is_manual:

    svm_model = svm.LinearSVC()
    svm_model.fit(X, y)
    y_pred = svm_model.predict(X_test)
    print(task3[i]['language_model'])
    # classification_report takes (y_true, y_pred). The arguments used to be
    # swapped, which exchanges precision and recall in the printed table.
    print(classification_report(y_test, y_pred, digits = 4))

    print('\n')

## **Support Vector Machine HPOs**

Grid search - SVM

In [None]:
SVM_grid ={
    'C' : [0.01, 0.1, 1]}

In [None]:
 %%time
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
from sklearn.model_selection import GridSearchCV

is_manual = True

# Grid-search SVM_grid separately for each embedding column in `task3`.
for i in task3:

  col = task3[i]['col']
  X = list(task3[i]['data'][col])
  y = list(task3[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520

  if is_manual:
    clf =svm.LinearSVC()
    model = GridSearchCV(clf, SVM_grid, cv=4, scoring='accuracy',n_jobs=-1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)

    # Label each report and show its best parameters here: after the loop,
    # `model` only refers to the last language model.
    print(task3[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))

In [None]:
model.best_params_

Random search - SVM

In [None]:
SVM_random ={
    'C' : [0.01, 0.1, 1]}

In [None]:
 %%time
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
from sklearn.model_selection import GridSearchCV
# This cell uses RandomizedSearchCV but only imported GridSearchCV; it ran
# only when an earlier cell had already imported the right class.
from sklearn.model_selection import RandomizedSearchCV

is_manual = True

# Randomised search over SVM_random for each embedding column in `task3`.
for i in task3:

  col = task3[i]['col']
  X = list(task3[i]['data'][col])
  y = list(task3[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520

  if is_manual:
    clf =svm.LinearSVC()
    model = RandomizedSearchCV(estimator = clf, param_distributions = SVM_random, n_iter = 10, cv = 4, verbose= 1, random_state= 101, n_jobs = -1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)

    # Label each report and show its best parameters here: after the loop,
    # `model` only refers to the last language model.
    print(task3[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))

In [None]:
model.best_params_

# **Naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Work on the oversampled frame that carries the embedding columns.
ks_selected_df = ros_data

# 80/20 train/test split, seeded so it is repeatable.
ks_df_train, ks_df_test, ks_df_y_train, ks_df_y_test = train_test_split(
    ks_selected_df,
    list(ks_selected_df['bool_HIGH']),
    test_size=0.2,
    random_state=0,
)

# One entry per language model: training frame, embedding column, report label.
task4 = {
    key: {'data': ks_df_train, 'col': column, 'language_model': label}
    for key, column, label in (
        ("ks_df", 'content_bert_vector', 'BERT (NB)'),
        ("kh_df", 'content_albert_vector', 'ALBERT (NB)'),
        ("ka_df", 'content_roberta_vector', 'RoBERTa (NB)'),
    )
}

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report

is_manual = True

# Fit and score one Gaussian naive Bayes model per embedding column in `task4`.
for i in task4:

  col = task4[i]['col']
  X = list(task4[i]['data'][col])
  y = list(task4[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520

  if is_manual:

    NB_model = GaussianNB()
    NB_model.fit(X, y)
    y_pred = NB_model.predict(X_test)
    print(task4[i]['language_model'])
    # classification_report takes (y_true, y_pred). The arguments used to be
    # swapped, which exchanges precision and recall in the printed table.
    print(classification_report(y_test, y_pred, digits = 4))

    print('\n')

## **Naive Bayes HPOs**

Grid search - NB

In [None]:
NB_grid ={
    'var_smoothing': (1e-01,1e-02,1e-03,1e-04,1e-05,1e-06,1e-07,1e-08,1e-09,1e-10,1e-11)}

In [None]:
 %%time
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
from sklearn.model_selection import GridSearchCV

is_manual = True

# Grid-search NB_grid separately for each embedding column in `task4`.
for i in task4:

  col = task4[i]['col']
  X = list(task4[i]['data'][col])
  y = list(task4[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520

  if is_manual:
    clf = GaussianNB()
    model = GridSearchCV(clf, NB_grid, cv=4, scoring='accuracy',n_jobs=-1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)

    # Label each report and show its best parameters here: after the loop,
    # `model` only refers to the last language model.
    print(task4[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))

In [None]:
model.best_params_

Random search - NB

In [None]:
NB_random ={
    'var_smoothing': (1e-01,1e-02,1e-03,1e-04,1e-05,1e-06,1e-07,1e-08,1e-09,1e-10,1e-11)}

In [None]:
 %%time
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
from sklearn.model_selection import RandomizedSearchCV

# Randomized search (n_iter=10, 4-fold CV) of GaussianNB over NB_random for
# each embedding column in task4; the refit best model is scored on ks_df_test.
is_manual = True

# loop data df
for i in task4:

  col = task4[i]['col']
  X = list(task4[i]['data'][col])
  y = list(task4[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520  # not consumed by GaussianNB; kept as a notebook global

  if is_manual:
    clf = GaussianNB()
    model =  RandomizedSearchCV(estimator = clf, param_distributions = NB_random, n_iter = 10, cv = 4, verbose= 2, random_state= 101, n_jobs = -1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)
    # Label every report and print its winning parameters here: `model` is
    # rebound on each iteration, so only the last search survives the loop.
    print(task4[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
# When `model` was assigned inside a loop, this is the final iteration's search.
model.best_params_

# **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# NOTE(review): judging by its name, ros_data is presumably already randomly
# over-sampled; if so, splitting afterwards can put duplicated rows in both
# train and test. Confirm against the cell that builds ros_data.
ks_selected_df = ros_data

# train/test split, 80/20 ratio
ks_df_train, ks_df_test, ks_df_y_train, ks_df_y_test = train_test_split(
    ks_selected_df,
    list(ks_selected_df['bool_HIGH']),
    test_size=0.2,
    random_state=0,
)

# One entry per language model: the training frame, the column holding that
# model's embedding vectors, and the label printed next to the results.
task5 = {
    key: {'data': ks_df_train, 'col': column, 'language_model': label}
    for key, column, label in (
        ("ks_df", 'content_bert_vector', 'BERT (DT)'),
        ("kh_df", 'content_albert_vector', 'ALBERT (DT)'),
        ("ka_df", 'content_roberta_vector', 'RoBERTa (DT)'),
    )
}

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report

# Baseline decision tree: fit one model per embedding column listed in task5
# and print its classification report on the ks_df_test split.
is_manual = True

# loop data df
for i in task5:

  col = task5[i]['col']
  X = list(task5[i]['data'][col])
  y = list(task5[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520  # not consumed by the tree; kept as a notebook global

  if is_manual:

    DT_model = DecisionTreeClassifier(random_state=0)
    DT_model.fit(X, y)
    y_pred = DT_model.predict(X_test)
    print(task5[i]['language_model'])
    # classification_report takes (y_true, y_pred); passing them the other way
    # round swaps precision with recall and reports the wrong support.
    print(classification_report(y_test, y_pred, digits = 4))

    print('\n')

## **Decision Tree HPOs**

Grid search - DT

In [None]:
# Decision-tree search space: tree depth limits and minimum split sizes.
DT_grid = dict(
    max_depth=[30, 35, 40],
    min_samples_split=[5, 10],
)

In [None]:
 %%time
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
from sklearn.model_selection import GridSearchCV

# Grid-search a decision tree over DT_grid (4-fold CV, accuracy) separately for
# each embedding column in task5; the refit best model is scored on ks_df_test.
is_manual = True

# loop data df
for i in task5:

  col = task5[i]['col']
  X = list(task5[i]['data'][col])
  y = list(task5[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520  # not consumed by the tree; kept as a notebook global

  if is_manual:
    clf = DecisionTreeClassifier()
    model = GridSearchCV(clf, DT_grid, cv=4, scoring='accuracy',n_jobs=-1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)
    # Label every report and print its winning parameters here: `model` is
    # rebound on each iteration, so only the last search survives the loop.
    print(task5[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
# When `model` was assigned inside a loop, this is the final iteration's search.
model.best_params_

Random search - DT

In [None]:
# Decision-tree search space: tree depth limits and minimum split sizes.
DT_random = dict(
    max_depth=[20, 25, 30],
    min_samples_split=[5, 10],
)

In [None]:
 %%time
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
# This cell uses RandomizedSearchCV, so import it here instead of relying on
# another cell having been run first.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Randomized search (n_iter=10, 4-fold CV) of a decision tree over DT_random
# for each embedding column in task5; the best model is scored on ks_df_test.
is_manual = True

# loop data df
for i in task5:

  col = task5[i]['col']
  X = list(task5[i]['data'][col])
  y = list(task5[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520  # not consumed by the tree; kept as a notebook global

  if is_manual:
    clf = DecisionTreeClassifier()
    model =  RandomizedSearchCV(estimator = clf, param_distributions = DT_random, n_iter = 10, cv = 4, verbose= 1, random_state= 101, n_jobs = -1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)
    # Label every report and print its winning parameters here: `model` is
    # rebound on each iteration, so only the last search survives the loop.
    print(task5[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
# When `model` was assigned inside a loop, this is the final iteration's search.
model.best_params_

# **K-Nearest Neighbors**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report

# Baseline 3-nearest-neighbours: fit one model per embedding column listed in
# task6 and print its classification report on the ks_df_test split.
# NOTE(review): no `task6` definition is visible in this part of the notebook
# (the NB and DT sections build task4/task5 before use) - confirm it is defined
# in an earlier cell, otherwise this loop raises NameError.
is_manual = True

# loop data df
for i in task6:

  col = task6[i]['col']
  X = list(task6[i]['data'][col])
  y = list(task6[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520  # not consumed by KNN; kept as a notebook global

  if is_manual:

    KNN_model = KNeighborsClassifier(n_neighbors=3)
    KNN_model.fit(X, y)
    y_pred = KNN_model.predict(X_test)
    print(task6[i]['language_model'])
    # classification_report takes (y_true, y_pred); passing them the other way
    # round swaps precision with recall and reports the wrong support.
    print(classification_report(y_test, y_pred, digits = 4))

    print('\n')

## **K-Nearest Neighbors HPOs**

Grid search - KNN

In [None]:
# KNN search space: neighbour weighting scheme and neighbourhood size.
KNN_grid = dict(
    weights=['uniform', 'distance'],
    n_neighbors=[1, 2, 3],
)

In [None]:
 %%time
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
from sklearn.model_selection import GridSearchCV

# Grid-search KNN over KNN_grid (4-fold CV, accuracy) separately for each
# embedding column in task6; the refit best model is scored on ks_df_test.
is_manual = True

# loop data df
for i in task6:

  col = task6[i]['col']
  X = list(task6[i]['data'][col])
  y = list(task6[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520  # not consumed by KNN; kept as a notebook global

  if is_manual:
    clf = KNeighborsClassifier()
    model = GridSearchCV(clf, KNN_grid, cv=4, scoring='accuracy',n_jobs=-1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)
    # Label every report and print its winning parameters here: `model` is
    # rebound on each iteration, so only the last search survives the loop.
    print(task6[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
# When `model` was assigned inside a loop, this is the final iteration's search.
model.best_params_

Random search - KNN

In [None]:
# KNN search space: neighbour weighting scheme and neighbourhood size.
KNN_random = dict(
    weights=['uniform', 'distance'],
    n_neighbors=[1, 2, 3],
)

In [None]:
 %%time
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report
# This cell uses RandomizedSearchCV, so import it here instead of relying on
# another cell having been run first.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Randomized search (n_iter=10, 4-fold CV) of KNN over KNN_random for each
# embedding column in task6; the refit best model is scored on ks_df_test.
is_manual = True

# loop data df
for i in task6:

  col = task6[i]['col']
  X = list(task6[i]['data'][col])
  y = list(task6[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 520  # not consumed by KNN; kept as a notebook global

  if is_manual:
    clf = KNeighborsClassifier()
    model =  RandomizedSearchCV(estimator = clf, param_distributions = KNN_random, n_iter = 10, cv = 4, verbose= 1, random_state= 101, n_jobs = -1)
    model.fit(X, y)
    predictionforest = model.best_estimator_.predict(X_test)
    # Label every report and print its winning parameters here: `model` is
    # rebound on each iteration, so only the last search survives the loop.
    print(task6[i]['language_model'])
    print(model.best_params_)
    print(confusion_matrix(y_test,predictionforest))
    print(classification_report(y_test,predictionforest,digits = 4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
# When `model` was assigned inside a loop, this is the final iteration's search.
model.best_params_

## **Additional Experiments**

### Random Under Sampling

In [None]:
ks_selected_df = ks_da_df

# train/test split, 80/20 ratio (test_size=0.2)
ks_df_train, ks_df_test, ks_df_y_train, ks_df_y_test = train_test_split(ks_selected_df, list(ks_selected_df['bool_HIGH']), test_size=0.2, random_state=0)


def _undersample_column(vector_col):
    """Return a random-under-sampled training frame for one embedding column.

    Each row of ks_df_train[vector_col] is paired with its coffee_shop_name,
    the pairs are resampled against ks_df_y_train with
    RandomUnderSampler(random_state=777), and only the vector of each kept
    pair is retained.

    Args:
        vector_col: name of the ks_df_train column holding the vectors.

    Returns:
        DataFrame with columns [vector_col, 'bool_HIGH'].
    """
    paired_rows = list(zip(ks_df_train[vector_col], ks_df_train['coffee_shop_name']))
    sampler = RandomUnderSampler(random_state=777)
    kept_rows, kept_labels = sampler.fit_resample(paired_rows, ks_df_y_train)
    kept_vectors = [row[0] for row in kept_rows]
    return pd.DataFrame(list(zip(kept_vectors, kept_labels)),
                        columns=[vector_col, 'bool_HIGH'])


# A fresh sampler with the same seed is used for every column.
ruc_bert = _undersample_column('content_bert_vector')
ruc_wangchanberta = _undersample_column('content_wangchanberta_vector')
ruc_roberta = _undersample_column('content_roberta_vector')


# One entry per language model: the under-sampled frame, its vector column and
# the label printed next to the results.
task_ruc = {
    "ks_df": {
        'data': ruc_bert,
        'col': 'content_bert_vector',
        'language_model' : 'BERT (RandomUnderSampling)'
    },

    "kh_df": {
        'data': ruc_wangchanberta,
        'col': 'content_wangchanberta_vector',
        'language_model' : 'WangchanBERTa (RandomUnderSampling)'
    },

    "ka_df": {
        'data': ruc_roberta,
        'col': 'content_roberta_vector',
        'language_model' : 'RoBERTa (RandomUnderSampling)'
    }
}

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support , classification_report

# Logistic regression on the under-sampled training frames in task_ruc, each
# evaluated on the ks_df_test split.
is_manual = True

# loop data df
for i in task_ruc:

  col = task_ruc[i]['col']
  X = list(task_ruc[i]['data'][col])
  y = list(task_ruc[i]['data']['bool_HIGH'])

  X_test = list(ks_df_test[col])
  y_test = list(ks_df_test['bool_HIGH'])

  max_num_iter = 500  # iteration cap passed to LogisticRegression below

  if is_manual:

    logreg_model = LogisticRegression(max_iter=max_num_iter, random_state=0,multi_class='multinomial')
    logreg_model.fit(X, y)
    y_pred = logreg_model.predict(X_test)
    print(task_ruc[i]['language_model'])
    # classification_report takes (y_true, y_pred); passing them the other way
    # round swaps precision with recall and reports the wrong support.
    print(classification_report(y_test, y_pred, digits = 4))

    print('\n')

# **TF-IDF Experiments**

# Import dataset

In [None]:
#Import Training and Testing Data
train = pd.read_csv(path + '/ratings_and_sentiments UTF-8.csv')
# Pass the columns as a separate print argument: a `%` format applied to a
# string without any placeholder never shows them.
print("Training Set:", train.columns, train.shape, len(train))
# NOTE(review): the test frame is read from the same CSV as the training frame,
# so the two are identical - confirm whether a separate test file was intended.
test = pd.read_csv(path + '/ratings_and_sentiments UTF-8.csv')
print("Test Set:", test.columns, test.shape, len(test))

In [None]:
# Share of each bool_HIGH label (1 = High, 0 = Low) as a percentage of all rows.
label_counts = train.bool_HIGH.value_counts()
n_rows = len(train)
print("High: ", label_counts[1]/n_rows*100,"%")
print("Low: ", label_counts[0]/n_rows*100,"%")

# Preprocessing

In [None]:
# Shared helpers for review_text_cleaner: a tokenizer and a Porter stemmer.
tok = WordPunctTokenizer()
porter = PorterStemmer()

# Text removed before tokenizing: @mentions and http(s) URLs.
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = pat1 + r'|' + pat2
def review_text_cleaner(review_text):
    """Normalize one raw review into a space-separated string of stemmed words.

    The text is parsed with BeautifulSoup ('lxml') and reduced to its plain
    text, matches of ``combined_pat`` are removed, every character outside
    a-z/A-Z is replaced by a space, and the lowercased result is tokenized
    with ``tok`` and stemmed with ``porter``.

    Args:
        review_text: the raw review string.

    Returns:
        The stems joined by single spaces ("" when no letters remain).
    """
    souped = BeautifulSoup(review_text, 'lxml').get_text()
    stripped = re.sub(combined_pat, '', souped)
    # No bytes decoding step: re.sub returns str here, and str has no
    # .decode() on Python 3, so a decode attempt could only ever fail.
    letters_only = re.sub("[^a-zA-Z]", " ", stripped)
    # Tokenizing and re-joining also collapses the runs of spaces introduced
    # by the substitution above.
    tokens = tok.tokenize(letters_only.lower())
    return " ".join(porter.stem(token) for token in tokens)
# Clean every review by iterating the column directly; this avoids the
# range(len(...)) + chained label lookup, which only works with a 0..n-1 index.
clean_review_text = [review_text_cleaner(text) for text in train['review_text']]
test_review_text = [review_text_cleaner(text) for text in test['review_text']]

# Cleaned training frame: text plus the label and shop name copied from train.
train_clean = pd.DataFrame(clean_review_text,columns=['review_text'])
train_clean['bool_HIGH'] = train.bool_HIGH
train_clean['coffee_shop_name'] = train.coffee_shop_name
# Cleaned test frame: text plus shop name only (no label column is copied).
test_clean = pd.DataFrame(test_review_text,columns=['review_text'])
test_clean['coffee_shop_name'] = test.coffee_shop_name

# Feature Extraction

In [None]:
#split the dataset into training and validation datasets
# NOTE(review): no random_state is passed, so the split differs on every run.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train_clean['review_text'],train_clean['bool_HIGH'])
#label encode the target variable
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
# Reuse the mapping learned from the training labels; re-fitting on the
# validation labels could assign different codes if a class were missing there.
valid_y = encoder.transform(valid_y)

In [None]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100000)
# Fit the vocabulary and IDF weights on the training split only; fitting on all
# of train_clean would leak validation-set statistics into the features.
tfidf_vect.fit(train_x)
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# Model Evaluation

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Baseline classifiers on the original TF-IDF features, evaluated in this order
# through the notebook's train_model helper.
baseline_models = [
    ("LR_Org", linear_model.LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')),
    ("RF_Org", RandomForestClassifier()),
    ("SVM_Org", svm.LinearSVC()),
    ("NB_Org", MultinomialNB()),
    ("DT_Org", DecisionTreeClassifier()),
    ("KNN_Org", KNeighborsClassifier(n_neighbors=3)),
]

for tag, estimator in baseline_models:
    accuracyORIGINAL = train_model(estimator, xtrain_tfidf, train_y, xvalid_tfidf, valid_y)
    print (tag, accuracyORIGINAL)

## Random Over Sampling

In [None]:
#Random Over Sampling
# Resample only the training features/labels; validation data stays untouched.
ros = RandomOverSampler(random_state=777)
ros_xtrain_tfidf, ros_train_y = ros.fit_resample(xtrain_tfidf, train_y)

# Same classifier line-up as the baseline run, trained on the resampled data.
ros_models = [
    ("LR_ROS", linear_model.LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')),
    ("RF_ROS", RandomForestClassifier()),
    ("SVM_ROS", svm.LinearSVC()),
    ("NB_ROS", MultinomialNB()),
    ("DT_ROS", DecisionTreeClassifier(random_state=0)),
    ("KNN_ROS", KNeighborsClassifier(n_neighbors=3)),
]

for tag, estimator in ros_models:
    accuracyROS = train_model(estimator, ros_xtrain_tfidf, ros_train_y, xvalid_tfidf, valid_y)
    print (tag, accuracyROS)

## TF-IDF HPOs

### Logistic Regression

Grid search - LR

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values for LogisticRegression's C parameter.
LR_parameters = dict(C=[20, 30, 40])

In [None]:
 %%time
# Exhaustive 4-fold grid search over LR_parameters, scored by accuracy.
clf = LogisticRegression()
model = GridSearchCV(
    estimator=clf,
    param_grid=LR_parameters,
    scoring='accuracy',
    cv=4,
    error_score=0,
    n_jobs=-1,
)
model.fit(xtrain_tfidf, train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf)
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

Random search - LR

In [None]:
 %%time
# Randomized 4-fold search over LR_parameters (n_iter=10, seeded with 101).
clf = LogisticRegression()
model = RandomizedSearchCV(
    estimator=clf,
    param_distributions=LR_parameters,
    n_iter=10,
    cv=4,
    random_state=101,
    verbose=1,
    n_jobs=-1,
)
model.fit(xtrain_tfidf, train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf)
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

### Random Forest

Grid search - RF

In [None]:
# Random-forest search space: depth limit, minimum split size, number of trees.
RF_parameters = dict(
    max_depth=[50, 75, 100, 125],
    min_samples_split=[5, 10, 15],
    n_estimators=[5, 10, 15, 25],
)

In [None]:
 %%time
# Exhaustive 4-fold grid search over RF_parameters, scored by accuracy.
clf = RandomForestClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=RF_parameters,
    scoring='accuracy',
    cv=4,
    error_score=0,
    n_jobs=-1,
)
model.fit(xtrain_tfidf, train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf)
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

Random search - RF

In [None]:
# Random-forest search space: depth limit, minimum split size, number of trees.
RF_random = dict(
    max_depth=[25, 50, 75, 100, 125, 150],
    min_samples_split=[5, 10, 15],
    n_estimators=[5, 10, 15, 25],
)

In [None]:
 %%time
# Randomized 4-fold search over RF_random (n_iter=10, seeded with 101).
clf = RandomForestClassifier()
model = RandomizedSearchCV(
    estimator=clf,
    param_distributions=RF_random,
    n_iter=10,
    cv=4,
    random_state=101,
    verbose=1,
    n_jobs=-1,
)
model.fit(xtrain_tfidf, train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf)
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

### Naive Bayes

Grid search - NB

In [None]:
# Candidate var_smoothing values for the TF-IDF experiments.
NB_TF = dict(
    var_smoothing=(0.001, 0.01, 0.1, 1, 10, 100),
)

In [None]:
 %%time
# Exhaustive 4-fold grid search of GaussianNB over NB_TF, scored by accuracy.
# The TF-IDF matrices are converted with .toarray() before fit and predict.
clf = GaussianNB()
model = GridSearchCV(
    estimator=clf,
    param_grid=NB_TF,
    scoring='accuracy',
    cv=4,
    error_score=0,
    n_jobs=-1,
)
model.fit(xtrain_tfidf.toarray(), train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf.toarray())
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

Random search - NB

In [None]:
 %%time
# Randomized 4-fold search of GaussianNB over NB_TF (n_iter=10, seed 101).
# The TF-IDF matrices are converted with .toarray() before fit and predict.
clf = GaussianNB()
model = RandomizedSearchCV(
    estimator=clf,
    param_distributions=NB_TF,
    n_iter=10,
    cv=4,
    random_state=101,
    verbose=1,
    n_jobs=-1,
)
model.fit(xtrain_tfidf.toarray(), train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf.toarray())
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

### Support Vector Machine

Grid search - SVM

In [None]:
# Candidate values for LinearSVC's C parameter.
SVM_parameters = dict(C=[0.1, 1, 10])

In [None]:
 %%time
# Exhaustive 4-fold grid search of LinearSVC over SVM_parameters (accuracy).
clf = svm.LinearSVC()
model = GridSearchCV(
    estimator=clf,
    param_grid=SVM_parameters,
    scoring='accuracy',
    cv=4,
    error_score=0,
    n_jobs=-1,
)
model.fit(xtrain_tfidf, train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf)
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

Random search - SVM

In [None]:
 %%time
# Randomized 4-fold search of LinearSVC over SVM_parameters (n_iter=10, seed 101).
clf = svm.LinearSVC()
model = RandomizedSearchCV(
    estimator=clf,
    param_distributions=SVM_parameters,
    n_iter=10,
    cv=4,
    random_state=101,
    verbose=1,
    n_jobs=-1,
)
model.fit(xtrain_tfidf, train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf)
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

### Decision Tree

In [None]:
# Decision-tree search space for the TF-IDF features.
DT_grid = dict(
    max_depth=[5, 10, 15, 20],
    min_samples_split=[10, 15, 25],
)

In [None]:
 %%time
# Exhaustive 4-fold grid search of a decision tree over DT_grid (accuracy).
clf = DecisionTreeClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=DT_grid,
    scoring='accuracy',
    cv=4,
    error_score=0,
    n_jobs=-1,
)
model.fit(xtrain_tfidf, train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf)
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

Random Search - DT

In [None]:
# Decision-tree search space for the TF-IDF features.
DT_random = dict(
    max_depth=[5, 10, 15, 20],
    min_samples_split=[10, 15, 25],
)

In [None]:
 %%time
# Randomized 4-fold search of a decision tree over DT_random (n_iter=10, seed 101).
clf = DecisionTreeClassifier()
model = RandomizedSearchCV(
    estimator=clf,
    param_distributions=DT_random,
    n_iter=10,
    cv=4,
    random_state=101,
    verbose=1,
    n_jobs=-1,
)
model.fit(xtrain_tfidf, train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf)
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

### K-Nearest Neighbors

Grid search - KNN

In [None]:
# KNN search space: weighting scheme and even neighbourhood sizes 2 through 20.
KNN_parameters = dict(
    weights=['uniform', 'distance'],
    n_neighbors=list(range(2, 21, 2)),
)

In [None]:
 %%time
# Exhaustive 4-fold grid search of KNN over KNN_parameters (accuracy).
clf = KNeighborsClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=KNN_parameters,
    scoring='accuracy',
    cv=4,
    error_score=0,
    n_jobs=-1,
)
model.fit(xtrain_tfidf, train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf)
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

Random search - KNN

In [None]:
 %%time
# Randomized 4-fold search of KNN over KNN_parameters (n_iter=10, seed 101).
clf = KNeighborsClassifier()
model = RandomizedSearchCV(
    estimator=clf,
    param_distributions=KNN_parameters,
    n_iter=10,
    cv=4,
    random_state=101,
    verbose=1,
    n_jobs=-1,
)
model.fit(xtrain_tfidf, train_y)
# With the default refit=True the search object predicts via best_estimator_.
prediction = model.predict(xvalid_tfidf)
print(confusion_matrix(valid_y, prediction))
print(classification_report(valid_y, prediction, digits=4))

In [None]:
# Display the best hyper-parameters of the search currently bound to `model`.
model.best_params_

Additional Experiments for TF-IDF

In [None]:
#SMOTE one of imbalance data handling technique
sm = SMOTE(random_state=777) #, ratio = 1.0)
sm_xtrain_tfidf, sm_train_y = sm.fit_resample(xtrain_tfidf, train_y)

accuracySMOTE = train_model(linear_model.LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial'),sm_xtrain_tfidf, sm_train_y, xvalid_tfidf,valid_y)
print ("LR_SM", accuracySMOTE)
accuracySMOTE = train_model(svm.LinearSVC(),sm_xtrain_tfidf, sm_train_y, xvalid_tfidf,valid_y)
print ("SVM_SM", accuracySMOTE)

accuracySMOTE = train_model(RandomForestClassifier(),sm_xtrain_tfidf, sm_train_y, xvalid_tfidf,valid_y)
print ("RF_SM", accuracySMOTE)