# Library and Data

In [1]:
!pip install keras
!pip install tensorflow





In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.gaussian_process import GaussianProcessClassifier
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping
from collections import Counter, defaultdict

import nltk
import nltk as nlp
import string
import re


# Reading Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/Skyhack/my_df.csv')

In [4]:
df

Unnamed: 0,cleaned_transcript,primary_call_reason
0,"agent:\ncustomer: hi, yeah i'm calling because...",voluntary cancel
1,"agent: for calling united airlines, my name i...",booking
2,agent: for calling united airlines customer s...,irrops
3,agent: for calling united airlines customer s...,upgrade
4,"agent:\ncustomer: hi sarah, thanks for taking ...",seating
...,...,...
71805,agent: for calling united airlines customer s...,post flight
71806,agent: for calling united airlines customer s...,upgrade
71807,agent: for calling united airlines customer s...,upgrade
71808,agent: for calling united airlines customer s...,other


In [5]:

# Drop rows with null values in the dataframe.
df.dropna(inplace=True)


In [6]:

df = df[df.primary_call_reason != 'other']


In [9]:
df

Unnamed: 0,cleaned_transcript,primary_call_reason
0,"agent:\ncustomer: hi, yeah i'm calling because...",voluntary cancel
1,"agent: for calling united airlines, my name i...",booking
2,agent: for calling united airlines customer s...,irrops
3,agent: for calling united airlines customer s...,upgrade
4,"agent:\ncustomer: hi sarah, thanks for taking ...",seating
...,...,...
71804,agent: for calling united airlines customer s...,mileage plus
71805,agent: for calling united airlines customer s...,post flight
71806,agent: for calling united airlines customer s...,upgrade
71807,agent: for calling united airlines customer s...,upgrade


In [7]:
import re

def preprocess_transcript(transcript):
    # Remove agent and customer identifiers
    transcript = re.sub(r"agent:|customer:", "", transcript)
    # Remove extra whitespace and line breaks
    transcript = re.sub(r"\s+", " ", transcript).strip()
    return transcript


df['cleaned_transcript'] = df['cleaned_transcript'].apply(preprocess_transcript)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_transcript'] = df['cleaned_transcript'].apply(preprocess_transcript)


In [8]:
x_train,x_test,y_train,y_test = train_test_split(df['cleaned_transcript'], df.primary_call_reason, test_size=0.2, random_state=2020)


# Logistic Regression Classifier

In [15]:

pipe = Pipeline([('vect', TfidfVectorizer()),
                 ('model', LogisticRegression(max_iter=1000))])

model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 20.86%


# Support Vector Classifier

In [16]:

pipe = Pipeline([('vect', TfidfVectorizer()),
                 ('model', LinearSVC())])

model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 19.03%


# Multinomial Naive Bayes Classifier

In [17]:
pipe = Pipeline([('vect', TfidfVectorizer()),
                 ('model', MultinomialNB())])

model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 20.31%


# Bernoulli Naive Bayes Classifier

In [18]:
pipe = Pipeline([('vect', TfidfVectorizer()),
                 ('model', BernoulliNB())])

model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 19.56%


# Stochastic Gradient Descent

In [9]:
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', SGDClassifier())])

model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 11.54%


# Decision Tree

In [10]:
pipe = Pipeline([('vect', TfidfVectorizer()),
                 ('model', DecisionTreeClassifier(criterion= 'entropy',
                                           max_depth = 10,
                                           splitter='best',
                                           random_state=2020))])

model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 19.29%


# Random Forest Classifier

In [11]:
pipe = Pipeline([
                 ('tfidf', TfidfVectorizer()),
                 ('model', RandomForestClassifier())])

model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 19.77%


# KNN Classifier

In [13]:
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', KNeighborsClassifier(n_neighbors = 20,weights = 'distance',algorithm = 'brute'))])

model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 16.68%


# LSTM

In [16]:
X = df.cleaned_transcript
Y = df.primary_call_reason

le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)


In [23]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)
max_words = 500
max_len = 75
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    num_classes = len(le.classes_)
    layer = Dense(num_classes,name='out_layer')(layer)
    layer = Activation('softmax')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model
model = RNN()

In [24]:
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model1.png')
model.compile(loss='sparse_categorical_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])


You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [25]:
model.fit(sequences_matrix,Y_train,batch_size=256,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.src.callbacks.History at 0x7e875c6313c0>

In [26]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
accr = model.evaluate(test_sequences_matrix,Y_test)
print('Accuracy: {:0.2f}'.format(accr[1]))


Accuracy: 0.20


# XGBoost Classifier

In [15]:
!pip install nltk xgboost
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# 1. Preprocessing Function:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    return ' '.join(tokens)

# Apply Preprocessing:
small_df['cleaned_transcript'] = small_df['cleaned_transcript'].apply(preprocess_text)

# 2. Define the Pipeline:
# Remove LabelEncoder from the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('classifier', XGBClassifier(n_estimators=100, random_state=42)),
])

# 3. Train-Test Split:
x_train, x_test, y_train, y_test = train_test_split(
    small_df['cleaned_transcript'], small_df.primary_call_reason, test_size=0.2, random_state=2020
)

# 4. Train the Pipeline:
# Fit the label encoder separately on the training target variable
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# Fit the pipeline using the encoded target variable
pipeline.fit(x_train, y_train_encoded)

# 5. Make Predictions:
# Transform the test target variable using the fitted label encoder
y_test_encoded = le.transform(y_test)
predictions = pipeline.predict(x_test)

# 6. Evaluate the Model:
# Use the encoded target variables for evaluation
accuracy = accuracy_score(y_test_encoded, predictions)
print("Accuracy: {}%".format(round(accuracy * 100, 2)))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_df['cleaned_transcript'] = small_df['cleaned_transcript'].apply(preprocess_text)


Accuracy: 21.06%
