In [1]:
# Load the extracted features
import pandas as pd

training_data = pd.read_csv("./dataset/features/data_original/train_features0.csv")
testing_data = pd.read_csv("./dataset/features/data_original/test_features0.csv")

training_data.shape, testing_data.shape

((506545, 161), (101309, 161))

In [3]:
# randomize the data

training_data = training_data.sample(frac=1).reset_index(drop=True)
testing_data = testing_data.sample(frac=1).reset_index(drop=True)

# train-test split
training_features, training_labels = training_data.drop("Label", axis=1), training_data["Label"]
testing_features, testing_labels = testing_data.drop("Label", axis=1), testing_data["Label"]

training_features.shape, testing_features.shape

((506545, 160), (101309, 160))

In [4]:
# separate BERT features from the rest
training_bert_features = training_features.filter(regex="^BERT")
training_features_no_bert = training_features.drop(columns=training_bert_features.columns)

testing_bert_features = testing_features.filter(regex="^BERT")
testing_features_no_bert = testing_features.drop(columns=testing_bert_features.columns)

training_bert_features.shape, training_features_no_bert.shape, testing_bert_features.shape, testing_features_no_bert.shape

((506545, 8), (506545, 152), (101309, 8), (101309, 152))

In [10]:
training_labels.value_counts(), testing_labels.value_counts()

(Label
 1    361178
 0    145367
 Name: count, dtype: int64,
 Label
 1    72236
 0    29073
 Name: count, dtype: int64)

In [17]:
# train a logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import cross_val_score
import numpy as np

logistic_regression = LogisticRegression(max_iter=1000, random_state=4248)
logistic_regression.fit(training_features, training_labels)

# cross validation
# mean_cv_score = cross_val_score(logistic_regression, training_features, training_labels, cv=5, scoring="f1_macro").mean()
# print(f"Mean Cross Validation Score: {mean_cv_score}")

# evaluate the model
predicted_labels = logistic_regression.predict(testing_features)

# print f1-score
print(f1_score(testing_labels, predicted_labels, average="macro"))

# print the classification report
print("With all features:")
print(classification_report(testing_labels, predicted_labels))

# without BERT features
logistic_regression_no_bert = LogisticRegression(max_iter=1000, random_state=4248)
logistic_regression_no_bert.fit(training_features_no_bert, training_labels)

# evaluate the model
predicted_labels_no_bert = logistic_regression_no_bert.predict(testing_features_no_bert)

# print the classification report
print("Without BERT features:")
print(classification_report(testing_labels, predicted_labels_no_bert))

# with only BERT features
logistic_regression_bert = LogisticRegression(max_iter=1000, random_state=4248)
logistic_regression_bert.fit(training_bert_features, training_labels)

# evaluate the model
predicted_labels_bert = logistic_regression_bert.predict(testing_bert_features)

print(np.bincount(predicted_labels_bert))

# print the classification report
print("With only BERT features:")
print(classification_report(testing_labels, predicted_labels_bert))


0.7776247813656241
With all features:
              precision    recall  f1-score   support

           0       0.75      0.60      0.67     29073
           1       0.85      0.92      0.88     72236

    accuracy                           0.83    101309
   macro avg       0.80      0.76      0.78    101309
weighted avg       0.82      0.83      0.82    101309

Without BERT features:
              precision    recall  f1-score   support

           0       0.75      0.60      0.67     29073
           1       0.85      0.92      0.88     72236

    accuracy                           0.83    101309
   macro avg       0.80      0.76      0.78    101309
weighted avg       0.82      0.83      0.82    101309

[     0 101309]
With only BERT features:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     29073
           1       0.71      1.00      0.83     72236

    accuracy                           0.71    101309
   macro avg       0.36   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# knn
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(training_features, training_labels)

# evaluate the model
predicted_labels_knn = knn.predict(testing_features)    

# print the classification report
print("With all features:")
print(classification_report(testing_labels, predicted_labels_knn))

# without BERT features
knn_no_bert = KNeighborsClassifier(n_neighbors=5)

knn_no_bert.fit(training_features_no_bert, training_labels)

# evaluate the model
predicted_labels_knn_no_bert = knn_no_bert.predict(testing_features_no_bert)

# print the classification report
print("Without BERT features:")
print(classification_report(testing_labels, predicted_labels_knn_no_bert))

# with only BERT features
knn_bert = KNeighborsClassifier(n_neighbors=5)
knn_bert.fit(training_bert_features, training_labels)

# evaluate the model
predicted_labels_knn_bert = knn_bert.predict(testing_bert_features)
print(np.bincount(predicted_labels_knn_bert))

# print the classification report
print("With only BERT features:")
print(classification_report(testing_labels, predicted_labels_knn_bert))

With all features:
              precision    recall  f1-score   support

           0       0.81      0.72      0.76     29073
           1       0.89      0.93      0.91     72236

    accuracy                           0.87    101309
   macro avg       0.85      0.83      0.84    101309
weighted avg       0.87      0.87      0.87    101309

Without BERT features:
              precision    recall  f1-score   support

           0       0.81      0.73      0.77     29073
           1       0.90      0.93      0.91     72236

    accuracy                           0.87    101309
   macro avg       0.86      0.83      0.84    101309
weighted avg       0.87      0.87      0.87    101309

[14902 86407]
With only BERT features:
              precision    recall  f1-score   support

           0       0.29      0.15      0.19     29073
           1       0.71      0.85      0.78     72236

    accuracy                           0.65    101309
   macro avg       0.50      0.50      0.49    

In [16]:
# naive bayes
from sklearn.naive_bayes import MultinomialNB

# min-max scale the features as multinomial naive bayes requires non-negative values
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaled_training_features = scaler.fit_transform(training_features)
scaled_testing_features = scaler.transform(testing_features)

naive_bayes = MultinomialNB()

naive_bayes.fit(scaled_training_features, training_labels)

# evaluate the model
predicted_labels_naive_bayes = naive_bayes.predict(scaled_testing_features)

# print the classification report
print("With all features:")
print(classification_report(testing_labels, predicted_labels_naive_bayes))

# without BERT features
scaled_training_features_no_bert = scaler.fit_transform(training_features_no_bert)
scaled_testing_features_no_bert = scaler.transform(testing_features_no_bert)

naive_bayes_no_bert = MultinomialNB()

naive_bayes_no_bert.fit(scaled_training_features_no_bert, training_labels)

# evaluate the model
predicted_labels_naive_bayes_no_bert = naive_bayes_no_bert.predict(scaled_testing_features_no_bert)

# print the classification report
print("Without BERT features:")
print(classification_report(testing_labels, predicted_labels_naive_bayes_no_bert))

# with only BERT features
scaled_training_bert_features = scaler.fit_transform(training_bert_features)

scaled_testing_bert_features = scaler.transform(testing_bert_features)

naive_bayes_bert = MultinomialNB()

naive_bayes_bert.fit(scaled_training_bert_features, training_labels)

# evaluate the model
predicted_labels_naive_bayes_bert = naive_bayes_bert.predict(scaled_testing_bert_features)
print(np.bincount(predicted_labels_naive_bayes_bert))

# print the classification report
print("With only BERT features:")
print(classification_report(testing_labels, predicted_labels_naive_bayes_bert))

With all features:
              precision    recall  f1-score   support

           0       0.71      0.06      0.11     29073
           1       0.72      0.99      0.84     72236

    accuracy                           0.72    101309
   macro avg       0.71      0.52      0.47    101309
weighted avg       0.72      0.72      0.63    101309

Without BERT features:
              precision    recall  f1-score   support

           0       0.71      0.06      0.11     29073
           1       0.72      0.99      0.84     72236

    accuracy                           0.72    101309
   macro avg       0.71      0.52      0.47    101309
weighted avg       0.72      0.72      0.63    101309

[     0 101309]
With only BERT features:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     29073
           1       0.71      1.00      0.83     72236

    accuracy                           0.71    101309
   macro avg       0.36      0.50      0.42  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# svc
from sklearn.svm import SVC

svc = SVC()

svc.fit(training_features, training_labels)

# evaluate the model
predicted_labels_svc = svc.predict(testing_features)

# print the classification report
print("With all features:")
print(classification_report(testing_labels, predicted_labels_svc))

# # without BERT features
# svc_no_bert = SVC()

# svc_no_bert.fit(training_features_no_bert, training_labels)

# # evaluate the model
# predicted_labels_svc_no_bert = svc_no_bert.predict(testing_features_no_bert)

# # print the classification report
# print("Without BERT features:")
# print(classification_report(testing_labels, predicted_labels_svc_no_bert))

With all features:
              precision    recall  f1-score   support

           0       0.87      0.78      0.82     29073
           1       0.91      0.95      0.93     72236

    accuracy                           0.90    101309
   macro avg       0.89      0.87      0.88    101309
weighted avg       0.90      0.90      0.90    101309



In [10]:
# save the SVC model
import joblib

joblib.dump(svc, "./dataset/models/svc_model.pkl")

['./dataset/models/svc_model.pkl']