## Group 7 NLP

In [1]:
#!pip install spacy
#!pip install plotly
#!pip install nltk
#!pip install wordcloud

In [2]:
# We import the libraries
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report,  roc_curve, auc
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer

from tqdm import tqdm # duration counter
import nltk # nlp tools with nltk
from nltk.corpus import stopwords # corpus' stopwords

import string # for alphabet letters, punctuations and unicode characters
from math import log2 # to apply logarithm scale

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import spacy
import re

In [3]:
# download stopwords for later usage
nltk.download('stopwords')

# download tagger for pos tags
nltk.download('averaged_perceptron_tagger')

# download wordnet
nltk.download('wordnet')

# installing the medium english model of spacy
!python3 -m spacy download en_core_web_md -q
!pip install pyspellchecker -q

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# load the spacy model
# (a first-time install of en_core_web_md may need a kernel restart before
# spacy.load can find it, as the installer output warns)
nlp = spacy.load('en_core_web_md')
# NLTK English stopwords, held in a set for fast membership tests in clean_text
stop_words = set(stopwords.words('english'))

# single WordNet lemmatizer instance reused by every clean_text call
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one raw query string for the downstream vectorizers.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, split on whitespace, drop tokens
    found in ``stop_words``, lemmatize the remaining tokens with
    ``lemmatizer``, and join them back with single spaces.

    Args:
        text: The raw query as a ``str``.

    Returns:
        The cleaned, space-joined string (may be empty).
    """
    lowered = text.lower()
    stripped = re.sub(r'[^\w\s]', '', lowered)  # remove punctuation

    kept_tokens = []
    for token in stripped.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)


## Preprocessing

In [5]:
# We load our dataset
# online banking queries annotated with their corresponding intents.
# Both CSV files are read from the current working directory; later cells
# rely on the `text` and `category` columns.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Shape of the test split, followed by five randomly sampled rows
print(f"test data shape {test.shape}")
test.sample(5)

In [None]:
# Shape of the training split, followed by five randomly sampled rows
print(f"train data shape {train.shape}")
train.sample(5)


So we have 10003 observations in the training set and 3080 in the test set, 2 variables (text and category).

In [None]:
# Count the missing (NaN) values in each column of the training set
train.isna().sum()

In [None]:
# Count the missing (NaN) values in each column of the test set
test.isna().sum()

In [None]:
#Check the types of the variables
# (column labels first, then the per-column summary printed by info())
print(train.columns)
train.info()

In [None]:
# Same overview for the test set: column labels, then the info() summary
print(test.columns)
test.info()

In [None]:
# Number of fully duplicated rows in the training set
# (duplicated() flags every repeat after the first occurrence)
int(train.duplicated().sum())

In [None]:
# Number of fully duplicated rows in the test set
int(test.duplicated().sum())

We don't have duplicated rows.

In [None]:
# Number of distinct intent labels present in the training set
print(f"The number of categories in our dataset: {train['category'].nunique()}")

In [None]:
# Per-category example counts as a two-column frame (category, count)
category_counts = (
    train['category']
    .value_counts()
    .reset_index()
    .set_axis(['category', 'count'], axis=1)
)

# Horizontal bar chart of the class sizes
fig = px.bar(
    data_frame=category_counts.sort_values(by='count', ascending=False),
    x='count',
    y='category',
    orientation='h',
    title='Repartition of the 77 categories in the training dataset',
)
fig.update_layout(width=800, height=800)
fig.show()

We can see that our training dataset is not balanced: some categories are more represented than others.

In [None]:
# Apply the cleaning
train['text_clean'] = train['text'].apply(clean_text)
test['text_clean'] = test['text'].apply(clean_text)

In [None]:
# Targets: the intent label of every query
y_train = train['category']
y_test = test['category']

# Vectorization with TF-IDF over unigrams and bigrams, vocabulary capped at
# 10,000 features
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

# The vocabulary and IDF weights are learned from the training text only;
# the test text is projected onto that fitted vocabulary.
X_train = vectorizer.fit_transform(train['text_clean'])
X_test = vectorizer.transform(test['text_clean'])

In [None]:
# Model training (LogisticRegression)
# class_weight='balanced' adjusts the class weights to compensate for the
# unbalanced category distribution.
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Human-readable report for the notebook output ...
print(classification_report(y_test, y_pred))
# ... and the same metrics as a dict, kept in report1 (presumably for the
# TF-IDF vs. embeddings comparison at the end of the notebook)
report1 = classification_report(y_test, y_pred, output_dict=True)

In [None]:
def predict_category(text):
    """Predict the intent category of one raw query with the TF-IDF model.

    The text goes through the same ``clean_text`` normalization as the
    training data, is vectorized with the fitted ``vectorizer``, and is
    classified by ``model``.

    Args:
        text: The raw query string.

    Returns:
        The predicted category label (first and only element of the
        prediction array).
    """
    features = vectorizer.transform([clean_text(text)])
    return model.predict(features)[0]

In [None]:
# Spot check of the TF-IDF classifier on a hand-written query
# (an unrecognized payment)
sentence = "I am seeing a payment that I don't recognize."
predicted_category = predict_category(sentence)

print("Predicted category: ",predicted_category)

In [None]:
# Spot check of the TF-IDF classifier: lost card PIN / PIN change request
sent2 = "I lost my card PIN, I want to change it."
predicted_category = predict_category(sent2)

print("Predicted category: ",predicted_category)

In [None]:
# Spot check of the TF-IDF classifier: in-app payment question
sent3 = "I want to pay an item in my mobile video game, which app can I use? (Google)"
predicted_category = predict_category(sent3)

print("Predicted category: ",predicted_category)

In [None]:
# Spot check of the TF-IDF classifier: lost phone / account safety question
sent4 = "I didn't find my phone since 2 weeks, is my bank account safe?"
predicted_category = predict_category(sent4)

print("Predicted category: ",predicted_category)

In [None]:
#!pip install gensim

### Using Embedding

In [None]:
#!pip install -U sentence-transformers
#!pip install -U tf-keras
#!pip uninstall keras keras-nightly tf-keras tf-keras-nightly tensorflow -y
#!pip install tf-keras-nightly --upgrade
!pip install transformers -q

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel
import torch

In [None]:
# Pretrained `bert-base-uncased` tokenizer and encoder; both are read as
# globals by get_bert_embedding.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
def get_bert_embedding(texts, batch_size=32):
    """Embed texts with BERT and return one [CLS] vector per text.

    The texts are encoded in mini-batches instead of a single forward pass:
    pushing the whole corpus through ``bert_model`` at once needs memory
    proportional to the full dataset and easily exhausts RAM.

    Args:
        texts: A list (or other iterable) of strings, or a single string.
        batch_size: Number of texts per forward pass. Defaults to 32.

    Returns:
        A ``torch.Tensor`` with one row per input text, holding the final
        hidden state of the first ([CLS]) token. An empty input yields a
        tensor with zero rows.
    """
    # A bare string would otherwise be sliced character by character below.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    if not texts:
        return torch.empty((0, bert_model.config.hidden_size))

    bert_model.eval()  # make sure dropout is off so embeddings are deterministic
    cls_chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            output = bert_model(**encoded_input)
            # The CLS output: hidden state of the first token of every sequence
            cls_chunks.append(output.last_hidden_state[:, 0, :])

    return torch.cat(cls_chunks, dim=0)

In [None]:

# BERT [CLS] embeddings of the cleaned queries, converted to NumPy arrays so
# they can be fed to scikit-learn.
# (The original cell called `X_test_2.numpy()`, a name that is never defined.)
X_train_e = get_bert_embedding(train['text_clean'].tolist()).numpy()
X_test_e = get_bert_embedding(test['text_clean'].tolist()).numpy()

# Targets are the same intent labels as in the TF-IDF experiment
y_train_e = train['category']
y_test_e = test['category']

In [None]:
# Same classifier setup as the TF-IDF experiment, trained on the BERT embeddings
model_e = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train_e, y_train_e)
y_pred_e = model_e.predict(X_test_e)

# Printed report for the notebook output, plus the dict form kept in report2
print(classification_report(y_test_e, y_pred_e))
report2 = classification_report(y_test_e, y_pred_e, output_dict=True)


In [None]:
def predict_category_e(text): #With embeddings
    """Predict the intent category of one raw query with the embedding model.

    The query is cleaned with ``clean_text`` and embedded with
    ``get_bert_embedding``, exactly as the training data was, then classified
    by ``model_e``.

    Args:
        text: The raw query string.

    Returns:
        The predicted category label.
    """
    text_cleaned = clean_text(text)
    # model_e was fitted on BERT [CLS] vectors, so inference must use the same
    # encoder; the previously referenced `embedder` is not defined in the
    # visible notebook.
    text_v = get_bert_embedding([text_cleaned]).numpy()
    prediction = model_e.predict(text_v)
    return prediction[0]


In [None]:
# Spot check of the embedding-based classifier on a hand-written query
# (an unrecognized payment)
sentence = "I am seeing a payment that I don't recognize."
predicted_category = predict_category_e(sentence)

print("Predicted category: ",predicted_category)

In [None]:
# Spot check of the embedding-based classifier: lost code / change request
sent2 = "I lost my code, I want to change it."
predicted_category = predict_category_e(sent2)

print("Predicted category: ",predicted_category)

In [None]:
# Spot check of the embedding-based classifier: in-app payment question
sent3 = "I want to pay an item in my mobile video game, which app can I use?"
predicted_category = predict_category_e(sent3)

print("Predicted category: ",predicted_category)

In [None]:
# Spot check of the embedding-based classifier: lost phone / account safety.
# Uses predict_category_e like the other cells of this section; the original
# called the TF-IDF predict_category here, so the embedding model was never
# exercised on this query.
sent4 = "I didn't find my phone since 2 weeks, is my bank account safe?"
predicted_category = predict_category_e(sent4)

print("Predicted category: ",predicted_category)

### Let's compare the results of these 2 classifications (The one using TF-IDF and the one using Embeddings)