In [1]:
import string

import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from catboost import Pool
from nltk.tokenize import word_tokenize
from scipy import spatial
from sklearn import model_selection, preprocessing, metrics, feature_extraction
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Number of components per GloVe vector; make_embeddings uses it as the column count of the
# feature matrix, so it has to match the vectors in the GloVe file that is loaded (a "300d" file).
EMBEDDING_DIM = 300

In [3]:
# Read the labelled corpus and discard exact duplicate rows (the original index is kept).
df = pd.read_csv('./data/big_dataset.csv').drop_duplicates()
# Target labels and raw texts; newline characters are removed from the texts without a
# replacement character.
y = df.label
X = df.text.str.replace('\n', '')

In [4]:
def load_glove_vectors(file_path, embedding_dim=None):
    """Load a GloVe text file into a ``{token: vector}`` dictionary.

    Each non-blank line is expected to hold a token followed by its vector
    components, separated by spaces.

    Args:
        file_path: Path to the UTF-8 encoded GloVe text file.
        embedding_dim: Optional number of vector components per line. When
            given, the last ``embedding_dim`` space-separated fields are taken
            as the vector and everything before them as the token, so tokens
            that themselves contain spaces are parsed correctly. When ``None``
            (default), the first whitespace-separated field is the token and
            all remaining fields are the vector.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` numpy array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float, or if
            ``embedding_dim`` is given and a line has too few fields.
    """
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.rstrip()
            if not line:
                # Blank or whitespace-only lines (e.g. a trailing newline at the end of
                # the file) carry no token; indexing into them used to raise IndexError.
                continue
            if embedding_dim is None:
                values = line.split()
                word, coefs = values[0], values[1:]
            else:
                # Split from the right so a token containing spaces stays in one piece.
                parts = line.rsplit(' ', embedding_dim)
                if len(parts) != embedding_dim + 1:
                    raise ValueError(
                        f"line {line_no} of {file_path!r} has fewer than "
                        f"{embedding_dim} vector components"
                    )
                word, coefs = parts[0], parts[1:]
            embeddings_index[word] = np.asarray(coefs, dtype='float32')
    return embeddings_index


# Location of the pre-trained GloVe text file (300-dimensional vectors, going by the file name).
glove_file_path = './glove.42B.300d.txt'
# Token -> vector lookup; read as a module-level global by make_embeddings,
# find_closest_embeddings and preprocess_input.
glove_embeddings = load_glove_vectors(glove_file_path)

In [5]:
def preprcess_text(x: str):
    """Normalise a raw text and split it into word tokens.

    ASCII punctuation (``string.punctuation``) is removed first, the remainder
    is lower-cased, and the result is tokenised with NLTK's ``word_tokenize``.

    Args:
        x: Raw input text.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    normalized = x.translate(punctuation_table).lower()
    return word_tokenize(normalized)

In [6]:
def essay_to_vector(sentence, glove_embeddings):
    """Represent a tokenised text as the mean of its tokens' GloVe vectors.

    Tokens that are missing from ``glove_embeddings`` are ignored.

    Args:
        sentence: Iterable of string tokens.
        glove_embeddings: Mapping from token to a 1-D numpy vector.

    Returns:
        A 1-D numpy array holding the component-wise mean of the vectors of
        all in-vocabulary tokens. If no token is in the vocabulary (including
        an empty ``sentence``), a ``float32`` zero vector with the same length
        as the stored embeddings is returned.
    """
    known_vectors = [glove_embeddings[word] for word in sentence if word in glove_embeddings]
    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # np.mean of an empty list yields a scalar NaN (plus a RuntimeWarning), which would
    # broadcast into a full NaN feature row or hand a scalar to the classifier. Return a
    # well-formed zero vector instead, sized from any stored embedding.
    sample = next(iter(glove_embeddings.values()), None)
    dim = len(sample) if sample is not None else 0
    return np.zeros(dim, dtype=np.float32)

In [7]:
def make_embeddings(X: pd.Series):
    """Turn a Series of raw texts into a dense matrix of mean GloVe vectors.

    Every text is tokenised with ``preprcess_text`` and averaged with
    ``essay_to_vector`` using the module-level ``glove_embeddings`` lookup.

    Args:
        X: Series of raw text strings.

    Returns:
        ``float32`` numpy array of shape ``(len(X), EMBEDDING_DIM)``; row ``i``
        corresponds to the ``i``-th element of ``X`` in iteration order.
    """
    tokenized = X.apply(preprcess_text)
    n_rows = len(tokenized)
    matrix = np.zeros((n_rows, EMBEDDING_DIM), dtype=np.float32)
    for row, tokens in enumerate(tokenized):
        matrix[row] = essay_to_vector(tokens, glove_embeddings)
    return matrix

In [8]:
def find_closest_embeddings(embedding):
    """Rank the whole GloVe vocabulary by Euclidean distance to ``embedding``.

    Uses the module-level ``glove_embeddings`` lookup and computes one distance
    per vocabulary entry.

    Args:
        embedding: Query vector with the same length as the stored vectors.

    Returns:
        List of every vocabulary token, nearest first.
    """
    def distance_to_query(word):
        return spatial.distance.euclidean(glove_embeddings[word], embedding)

    return sorted(glove_embeddings, key=distance_to_query)

In [9]:
# Feature matrix for the whole corpus: one mean-GloVe row per text in X
# (shape len(X) x EMBEDDING_DIM, float32).
X_embedded = make_embeddings(X)

In [10]:
def preprocess_input(input:str):
    """Convert one raw text into the feature vector the classifier consumes.

    Applies the same steps as the training data: newline characters are
    removed, the text is tokenised with ``preprcess_text``, and the tokens are
    averaged with ``essay_to_vector`` using the module-level
    ``glove_embeddings`` lookup.

    Args:
        input: Raw text to embed.

    Returns:
        Whatever ``essay_to_vector`` returns for the tokenised text.
    """
    tokens = preprcess_text(input.replace('\n', ''))
    return essay_to_vector(tokens, glove_embeddings)

In [11]:
# Hold out 20% of the embedded corpus for validation; shuffled with a fixed seed so the
# split is reproducible.
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X_embedded,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [12]:
# 'balanced' class weights computed from the training labels only, so the less frequent
# class is not under-weighted during fitting.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# Label -> weight mapping in the form passed to CatBoostClassifier(class_weights=...).
class_weights = dict(zip(classes, weights))

In [13]:
# Wrap the train/validation splits in CatBoost Pool objects.
# NOTE(review): `val_poll` looks like a typo for `val_pool`, and the `clf.fit` call in this
# notebook is given the raw arrays, so neither pool appears to be used — confirm before
# renaming or removing.
train_pool = Pool(X_train, np.array(y_train))
val_poll = Pool(X_val, np.array(y_val))

In [14]:
# Binary CatBoost classifier: up to 1000 boosting iterations at a small learning rate,
# optimising Logloss, reporting AUC, and weighting classes with the balanced weights.
clf = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.01,
    loss_function='Logloss',
    eval_metric='AUC',
    class_weights=class_weights,
)

In [15]:
# Fit on the training split while monitoring the validation split; training stops once
# the eval metric has not improved for 20 consecutive iterations.
clf.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=20,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.9220539	best: 0.9220539 (0)	total: 98ms	remaining: 1m 37s
1:	test: 0.9321607	best: 0.9321607 (1)	total: 140ms	remaining: 1m 10s
2:	test: 0.9425446	best: 0.9425446 (2)	total: 184ms	remaining: 1m 1s
3:	test: 0.9449916	best: 0.9449916 (3)	total: 240ms	remaining: 59.8s
4:	test: 0.9465261	best: 0.9465261 (4)	total: 313ms	remaining: 1m 2s
5:	test: 0.9476812	best: 0.9476812 (5)	total: 369ms	remaining: 1m 1s
6:	test: 0.9476928	best: 0.9476928 (6)	total: 417ms	remaining: 59.1s
7:	test: 0.9484875	best: 0.9484875 (7)	total: 460ms	remaining: 57s
8:	test: 0.9498277	best: 0.9498277 (8)	total: 512ms	remaining: 56.4s
9:	test: 0.9505387	best: 0.9505387 (9)	total: 564ms	remaining: 55.8s
10:	test: 0.9508818	best: 0.9508818 (10)	total: 616ms	remaining: 55.4s
11:	test: 0.9509675	best: 0.9509675 (11)	total: 658ms	remaining: 54.1s
12:	test: 0.9515954	best: 0.9515954 (12)	total: 702ms	remaining: 53.3s
13:	test: 0.9523763	best: 0.9523763 (13)	total: 748ms	remaining: 52.7s
14:	test: 0.9522248	best: 0

<catboost.core.CatBoostClassifier at 0x7f89fb7d8ca0>

In [16]:
# Hard class predictions on the validation split, summarised per class.
# NOTE(review): X_val was also the eval_set that drove early stopping in clf.fit, so these
# scores are likely optimistic; a separate untouched test split would give a cleaner estimate.
preds = clf.predict(X_val)
print(metrics.classification_report(y_val, preds))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      5915
           1       0.95      0.95      0.95      3533

    accuracy                           0.96      9448
   macro avg       0.96      0.96      0.96      9448
weighted avg       0.96      0.96      0.96      9448



In [23]:
# Class probabilities for a single embedded text (index 1 is read as the "AI generated"
# probability in the next cell).
# NOTE(review): row 8 comes from X_train, so this is a smoke test on data the model was
# fitted on, not a held-out example.
a = clf.predict_proba(X_train[8])

In [24]:
# Probability of class 1 as a whole-number percentage (truncated, not rounded).
f"{np.int32(a[1] * 100)}% AI generated text"

'99% AI generated text'

In [26]:
# Persist the fitted classifier with joblib. joblib files are pickle-based, so only load
# this file back from a trusted location.
joblib.dump(clf, './modelv3.pkl')

['./modelv3.pkl']

In [30]:
# Cache the parsed token -> vector dictionary so it can be reloaded without re-parsing the
# GloVe text file. joblib files are pickle-based; only load them from a trusted location.
joblib.dump(glove_embeddings, './glove_embeddings300.pkl')

['./glove_embeddings300.pkl']