# 1. Experimenting with Logistic Regression

In [1]:
# prompt: give access to drive

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd /content/drive/MyDrive/Pre-processed dataset

/content/drive/MyDrive/Pre-processed dataset


In [4]:
!ls

binary_processed_test.csv  binary_processed_train.csv  processed_test.csv  processed_train.csv


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [7]:
binary_data = pd.read_csv("binary_processed_train.csv")
#binary_data.select_dtypes('O').columns
#binary_data.isnull().sum()
binary_data.dropna(inplace = True)

Index(['category', 'utterance1', 'utterance2', 'utterance1_user',
       'utterance1_intent', 'utterance1_text', 'utterance2_user',
       'utterance2_intent', 'utterance2_text', 'category_name'],
      dtype='object')

In [43]:
X = binary_data.drop(columns = 'label', axis = 1)
y = binary_data['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train['utterance1'] + " " + X_train['utterance2'] + X_train['category'])
X_val_vec = vectorizer.transform(X_val['utterance1'] + " " + X_val['utterance2'] + X_val['category'])

In [None]:
## Adjusting class weights is a key to improve performance
## For class 0---> 1 (Proportion of class0 data in train.csv is 70% )
## For class 1---> 3 (Proportion of class1 data in train.csv is 30% )

In [44]:
model = LogisticRegression(class_weight = {0:1, 1:3})
model.fit(X_train_vec, y_train)

## Input Parameters
1) utterance1

2) utterance2

3) category


In [45]:
y_pred = model.predict(X_val_vec)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.78      0.82       892
           1       0.58      0.73      0.64       376

    accuracy                           0.76      1268
   macro avg       0.72      0.75      0.73      1268
weighted avg       0.78      0.76      0.77      1268



## Input parameters
1) utterance1

2) utterance2_text

3) category

In [59]:
X = binary_data.drop(columns = 'label', axis = 1)
y = binary_data['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train['utterance1'] + " " + X_train['utterance2_text'] + " " + X_train['category'])
X_val_vec = vectorizer.transform(X_val['utterance1'] + " " + X_val['utterance2_text'] + " " + X_val['category'])

In [55]:
model = LogisticRegression(class_weight = {0:1, 1:3})
model.fit(X_train_vec, y_train)
y_pred = model.predict(X_val_vec)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.76      0.81       892
           1       0.56      0.72      0.63       376

    accuracy                           0.75      1268
   macro avg       0.71      0.74      0.72      1268
weighted avg       0.78      0.75      0.76      1268



## Observation:

- Using utterance1 vs utterance2's text part alone performs really better than utterance1 vs utterance2
- category term doesn't play a significant role (no change in scores)



# 2. Experimenting with SVM

In [66]:
from sklearn.svm import SVC
X_train_vec = vectorizer.fit_transform(X_train['utterance1'] + " " + X_train['utterance2_text'] + X_train['category'])
X_val_vec = vectorizer.transform(X_val['utterance1'] + " " + X_val['utterance2_text'] + X_val['category'])
# Create the SVM model with class weights
svm_model = SVC(class_weight={0: 1, 1: 3})
svm_model.fit(X_train_vec, y_train)
y_pred = svm_model.predict(X_val_vec)
print(classification_report(y_val, y_pred))

'''
For instance, a random model's result is:
● Label 0: Precision: 0.71, Recall: 0.49, F1: 0.58
● Label 1: Precision: 0.29, Recall: 0.51, F1: 0.37
● Overall: Precision: 0.50, Recall: 0.50 F1: 0.48
'''

              precision    recall  f1-score   support

           0       0.84      0.84      0.84       892
           1       0.62      0.62      0.62       376

    accuracy                           0.78      1268
   macro avg       0.73      0.73      0.73      1268
weighted avg       0.78      0.78      0.78      1268



"\nFor instance, a random model's result is:\n● Label 0: Precision: 0.71, Recall: 0.49, F1: 0.58\n● Label 1: Precision: 0.29, Recall: 0.51, F1: 0.37\n● Overall: Precision: 0.50, Recall: 0.50 F1: 0.48\n"

## Observation:

- Input utterance1, utterance2's text part alone with category performs good than other combinations

# 3. Experimenting with Random Forest Classifier

In [68]:
from sklearn.ensemble import RandomForestClassifier
X_train_vec = vectorizer.fit_transform(X_train['utterance1'] + " " + X_train['utterance2_text'] + X_train['category'])
X_val_vec = vectorizer.transform(X_val['utterance1'] + " " + X_val['utterance2_text'] + X_val['category'])
# Create the Random Forest model with class weights
rf_model = RandomForestClassifier(class_weight={0: 1, 1: 3})
rf_model.fit(X_train_vec, y_train)
y_pred = rf_model.predict(X_val_vec)
print(classification_report(y_val, y_pred))

'''
For instance, a random model's result is:
● Label 0: Precision: 0.71, Recall: 0.49, F1: 0.58
● Label 1: Precision: 0.29, Recall: 0.51, F1: 0.37
● Overall: Precision: 0.50, Recall: 0.50 F1: 0.48
'''

              precision    recall  f1-score   support

           0       0.80      0.93      0.86       892
           1       0.74      0.44      0.55       376

    accuracy                           0.79      1268
   macro avg       0.77      0.69      0.71      1268
weighted avg       0.78      0.79      0.77      1268



"\nFor instance, a random model's result is:\n● Label 0: Precision: 0.71, Recall: 0.49, F1: 0.58\n● Label 1: Precision: 0.29, Recall: 0.51, F1: 0.37\n● Overall: Precision: 0.50, Recall: 0.50 F1: 0.48\n"

## Observation:
- Even though the overall performance of RF is better than LR and SVM, it fails with class 1 recall threshold
- adjusting the class weights didn't improve the score
