In [35]:
import pandas as pd
import random
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay

import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras import layers 

import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [36]:
# Load the full AI-vs-human text dataset and display it
csv_path = "AI_Human.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0
...,...,...
487230,Tie Face on Mars is really just a big misunder...,0.0
487231,The whole purpose of democracy is to create a ...,0.0
487232,I firmly believe that governments worldwide sh...,1.0
487233,I DFN't agree with this decision because a LFT...,0.0


In [37]:
# Discard every row that contains at least one missing value,
# then eyeball a handful of random rows as a sanity check
df = df.dropna(axis="index")
df.sample(n=5)

Unnamed: 0,text,generated
240887,Distance learning is becoming an increasingly ...,1.0
226316,"IG this modern era, electronic dictionaries ha...",1.0
388191,"""What have Richard Simon, Jimmy Carter, Bob Do...",0.0
151133,"Imagine you were on an Atlantic, and it was st...",0.0
114223,"As an eighth grade student, I strongly believe...",1.0


In [38]:
# Features are the raw texts; labels are the 'generated' flag cast to integers
X, y = df['text'], df['generated'].astype("int64")

# Class balance of the labels
y.value_counts()

generated
0    305797
1    181438
Name: count, dtype: int64

In [39]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer class labels (one column per class) so they match
# the 2-unit softmax output / categorical cross-entropy loss used later on
y_Onehot =  to_categorical(y)

In [40]:
# Hold out 25% of the samples as a test set (fixed seed for a repeatable split)

X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y_Onehot,
    test_size=0.25,
    random_state=42,
)
# Sizes of the four resulting arrays, in the order they were returned
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

365426 121809 365426 121809


In [41]:
# Inspect the raw test texts (still plain strings at this point)
print(X_test)

['Real or Fake Feelings\n\n"Imagine being able to detect exactly hoe other people are feeling, even then they are trying to hide their emotions" (D\'Alto). The article by Nick D\'Alto,\n\n"Making Mona Lisa Smile" is about a née software, the Facial Action Coding System which can recognize emotions based on muscle movements. Some people think this technology can be useful for many industries, however it can have negative effects on some. The use of this technology to read the emotional expressions of students in a classroom is not valuable.\n\nStudents may have personal issues, and it is not necessary for classroom\'s to know the emotions an individual is going through. As the article explains hoe muscle movements connect at hoe someone feels, it states, "Beckman has classified six basic emotions happiness, surprise, anger, disgust, fear, and sadness..." ’D\'Alto). This demonstrates that the technology can only detect a fee emotion. The technology may get confused, and it could determin

In [42]:
# Text vectorizer settings for this dataset

max_vocab_length = 50_000  # vocabulary size cap (number of distinct tokens kept)
max_length = 25  # tokens kept per text; longer texts are truncated, shorter ones padded

# Lower-cases and strips punctuation, then maps each text to a fixed-length
# sequence of integer token ids
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    standardize='lower_and_strip_punctuation',
    output_mode='int',
    output_sequence_length=max_length,
)

In [43]:
# Fit the text vectorizer on the training texts only, so the vocabulary is
# built without looking at the test split
text_vectorizer.adapt(X_train)

In [44]:
# Convert the raw texts of both splits into integer token-id tensors.
# The names X_train / X_test are reused for the vectorized data, so guard the
# conversion: re-running this cell must not feed already-vectorized integer
# tensors back into the text vectorizer.
if not tf.is_tensor(X_train):
    X_train = text_vectorizer(X_train)
if not tf.is_tensor(X_test):
    X_test = text_vectorizer(X_test)

In [45]:
# Inspect the vectorized test set: integer token ids, max_length ids per row
print(X_test)

tf.Tensor(
[[  536    26  2150 ...    23   248  3578]
 [  745   335   364 ...    89   808   682]
 [18468  8845  1447 ...     2   150     4]
 ...
 [    2    50    63 ...     5  1389   401]
 [  105    36    24 ...    15   759     4]
 [ 1690  1147    38 ...    13    94     8]], shape=(121809, 25), dtype=int64)


In [51]:
# Persist the test split (inputs and one-hot labels) as .npy files so it can be
# reloaded later without re-running the preprocessing cells.
import os
from pathlib import Path

# The output directory defaults to the original location, but can be redirected
# with the AI_HUMAN_OUTPUT_DIR environment variable so the notebook is not tied
# to one machine's filesystem layout.
OUTPUT_DIR = Path(os.environ.get(
    "AI_HUMAN_OUTPUT_DIR",
    "/Users/wangfan/Documents/wukm/Uncertainty Projects/AI_Human",
))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # np.save does not create missing directories

# np.asarray makes the conversion to a NumPy array explicit before saving
np.save(OUTPUT_DIR / 'X_test_AH.npy', np.asarray(X_test))
np.save(OUTPUT_DIR / 'y_test_AH.npy', np.asarray(y_test))

In [46]:
import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout,Flatten,LSTM,BatchNormalization

# Binary text classifier: token ids -> embeddings -> LSTM -> dense head -> softmax
model = Sequential()

# Learn a 128-dimensional vector for each of the max_vocab_length token ids
model.add(Embedding(max_vocab_length, 128))

# Single LSTM layer with input/recurrent dropout and an L1 penalty on its kernel
model.add(
    LSTM(
        units=64,
        dropout=0.2,
        recurrent_dropout=0.2,
        kernel_regularizer=keras.regularizers.l1(0.001),
    )
)

# Classification head: three Dropout -> BatchNormalization -> Dense stages,
# narrowing 64 -> 32 -> 2, with a softmax over the two classes at the end
for units, activation in ((64, 'relu'), (32, 'relu'), (2, 'softmax')):
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(units, activation=activation))

model.summary()
print("")




In [47]:
from keras.optimizers import SGD

# Stop training once the validation loss has not improved for 3 consecutive
# epochs. fit() below holds out a validation split, so monitoring 'val_loss'
# tracks generalization instead of the training loss.
# NOTE(review): with epochs=3 a patience of 3 is not expected to trigger;
# the callback matters once the epoch count is raised.
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Define the training setup.
# NOTE: this SGD optimizer is constructed but NOT used below -- the model is
# compiled with the 'adam' optimizer. Pass optimizer=sgd to compile() to use it.
sgd = SGD(learning_rate=0.001, momentum=0.95)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, holding out 20% of the training data for validation
train_history = model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    callbacks=[callback],
    epochs=3,
    batch_size=128,
    verbose=1,
)

Epoch 1/3
[1m2284/2284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 34ms/step - accuracy: 0.8516 - loss: 0.6773 - val_accuracy: 0.9506 - val_loss: 0.1571
Epoch 2/3
[1m2284/2284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 32ms/step - accuracy: 0.9574 - loss: 0.1497 - val_accuracy: 0.9648 - val_loss: 0.1209
Epoch 3/3
[1m2284/2284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 32ms/step - accuracy: 0.9670 - loss: 0.1187 - val_accuracy: 0.9641 - val_loss: 0.1225


In [48]:
# Score the trained model on the held-out test split; the result holds the
# loss followed by the accuracy metric the model was compiled with
score = model.evaluate(X_test, y_test, verbose=0)
for label, value in zip(("Test loss:", "Test accuracy:"), score):
    print(label, value)

Test loss: 0.12147257477045059
Test accuracy: 0.9643458127975464


In [49]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Per-class probabilities for every test sample
y_pred = model.predict(X_test)

# Reduce the probability rows and the one-hot labels to class indices
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Rows correspond to the true class, columns to the predicted class
conf_matrix = confusion_matrix(y_true=y_test_classes, y_pred=y_pred_classes)

# Show the heading and the matrix on separate lines
print("Confusion Matrix:", conf_matrix, sep="\n")

[1m3807/3807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step
Confusion Matrix:
[[74617  1724]
 [ 2619 42849]]


In [50]:
# Persist the trained model to a .keras file
model_path = '/Users/wangfan/Documents/wukm/Uncertainty Projects/AI_Human/AH.keras'
model.save(model_path)