In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import torch
import nltk
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')


import os.path

from tqdm.autonotebook import tqdm, trange


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ashwintan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_parquet("../raw data/combined_data_with_embeddings.parquet")

In [3]:
df.head()

Unnamed: 0,Text,Label,Original dataset,Row in original dataset,embedding
0,The idea of graduating high school in three ye...,Machine,essays,26613,"[0.01476596, -0.013095475, 0.002932728, -0.011..."
1,"Hey, I'm so excited to write this essay about ...",Machine,essays,26326,"[0.00292786, -0.013083563, 0.0047025573, -0.00..."
2,Introduction\n\nSelf-reliance is a concept tha...,Machine,essays,30579,"[-0.015756093, -0.023221416, -0.010959062, -0...."
3,"Sure, here's my attempt at writing an essay as...",Machine,essays,33547,"[0.013002162, 0.011017485, -0.03551094, 0.0290..."
4,The legalization of marijuana is a highly deba...,Machine,essays,33768,"[0.0016188276, 0.011155421, -0.004596148, 0.00..."


In [4]:
df.Label.value_counts()

Label
Machine    40000
Human      40000
Name: count, dtype: int64

In [5]:
from sentence_transformers import SentenceTransformer 

In [69]:
sentence_model = SentenceTransformer("thenlper/gte-large")

def get_sentence_embedding(text):
    if isinstance(text,str):
        if not text.strip(): 
            print("Attempted to get embedding for empty text.")
            return []
        
        return sentence_model.encode(text)
    
    elif isinstance(text,list):
        return [get_sentence_embedding(x) for x in text]
        

In [18]:
text = "This is a fake review."
text2 = "This is not a fake review."

embedding = get_sentence_embedding(text)
type(embedding)

numpy.ndarray

In [19]:
embedding

array([-0.00613883, -0.01058691,  0.00132789, ..., -0.0222865 ,
       -0.00615894, -0.02127941], dtype=float32)

In [20]:
a = [text, text2]
get_sentence_embedding(a)

[array([-0.00613883, -0.01058691,  0.00132789, ..., -0.0222865 ,
        -0.00615894, -0.02127941], dtype=float32),
 array([ 0.00717259,  0.00192547, -0.01863514, ..., -0.01114186,
         0.01110466, -0.00416853], dtype=float32)]

In [6]:
train, test = train_test_split(df, test_size=0.2)

In [7]:
train.head()

Unnamed: 0,Text,Label,Original dataset,Row in original dataset,embedding
25075,WGN America has acquired the exclusive U.S. ho...,Machine,grover,10235,"[0.008065161, 0.00068244926, -0.04635188, 0.00..."
67522,Joseph Bryan Nelson MBE FRSE (14 March 1932 – ...,Human,wiki,89287,"[-0.01591434, 0.010300461, -0.0018833067, 0.00..."
44480,This was a beautiful movie. I love the movie....,Machine,reviews,15163,"[0.02572593, -0.013680295, -0.012741473, -0.00..."
53199,This is a slim case designed for all models of...,Human,reviews,9756,"[-0.02133946, 0.0022242079, -0.0017145402, -0...."
69006,The Alliance Tire Company (Pvt.) Ltd. is a tir...,Human,wiki,47664,"[-0.032117695, 0.002097214, -0.020527093, 0.02..."


In [8]:
train.embedding[0]

array([ 0.01476596, -0.01309547,  0.00293273, ..., -0.00282018,
        0.02586309, -0.00610633], dtype=float32)

In [9]:
X_train = np.vstack(train.embedding.apply(lambda x: np.asarray(x).flatten()))
X_test = np.vstack(test.embedding.apply(lambda x: np.asarray(x).flatten()))

In [10]:
from sklearn.svm import SVC
clf = SVC(kernel='linear')

In [11]:
clf.fit(X_train, train.Label)

In [44]:
preds = clf.predict(X_test)

In [45]:
preds

array(['Human', 'Machine', 'Machine', 'Human', 'Human', 'Machine',
       'Human', 'Human', 'Human', 'Machine', 'Human', 'Human', 'Human',
       'Human', 'Human', 'Human', 'Machine', 'Human', 'Machine',
       'Machine', 'Human', 'Machine', 'Machine', 'Machine', 'Machine',
       'Human', 'Machine', 'Machine', 'Machine', 'Human', 'Human',
       'Machine', 'Machine', 'Machine', 'Machine', 'Human', 'Human',
       'Machine', 'Machine', 'Machine', 'Human', 'Human', 'Human',
       'Machine', 'Human', 'Human', 'Human', 'Machine', 'Human',
       'Machine', 'Human', 'Machine', 'Human', 'Human', 'Machine',
       'Machine', 'Machine', 'Human', 'Human', 'Machine', 'Machine',
       'Machine', 'Machine', 'Human', 'Human', 'Human', 'Human', 'Human',
       'Human', 'Machine', 'Machine', 'Human', 'Machine', 'Machine',
       'Machine', 'Machine', 'Machine', 'Machine', 'Human', 'Machine',
       'Machine', 'Human', 'Machine', 'Machine', 'Machine', 'Human',
       'Machine', 'Human', 'Machine', 

In [46]:
confusion_matrix(test.Label, preds)

array([[26, 22],
       [18, 34]])

In [48]:
from sklearn.metrics import accuracy_score

In [50]:
accuracy_score(test.Label, preds)

0.6