In [1]:
import pandas as pd


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords


from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_excel("data.xlsx",sheet_name = 'Form 2')

In [3]:
df.head()

Unnamed: 0,Text,Author,Domain
0,Photo-realistic image rendering using standard...,Human-written,Computer Science and Artificial Intelligence
1,The Large Hadron Collider is exploring physics...,Human-written,Computer Science and Artificial Intelligence
2,With the rapid development of mobile Internet ...,Human-written,Computer Science and Artificial Intelligence
3,Most contour tracking methods can be grouped i...,Human-written,Computer Science and Artificial Intelligence
4,Year 2010 is regarded as the breakthrough year...,Human-written,Computer Science and Artificial Intelligence


In [4]:
df.drop('Domain', inplace = True, axis = 1)

In [5]:
df.duplicated().sum()

np.int64(4)

In [6]:
df.drop_duplicates(inplace = True)

In [7]:
df.isnull().sum()

Text      0
Author    0
dtype: int64

In [8]:
df['Author'].value_counts()

Author
ChatGPT-generated    3607
Mixed text           3606
Human-written        3604
Name: count, dtype: int64

In [9]:
df['Author'] = df['Author'].map({'Human-written' : 1, 'Mixed text' : 2, 'ChatGPT-generated' : 3})

In [10]:
df.head()

Unnamed: 0,Text,Author
0,Photo-realistic image rendering using standard...,1
1,The Large Hadron Collider is exploring physics...,1
2,With the rapid development of mobile Internet ...,1
3,Most contour tracking methods can be grouped i...,1
4,Year 2010 is regarded as the breakthrough year...,1


In [11]:
stop_words = set(stopwords.words("english"))

def process(text):
    if not isinstance(text, str):
        return ""
    return " ".join(
        lemmatizer.lemmatize(w)
        for w in word_tokenize(text.lower())
        if w not in stop_words
    )

df.iloc[:,0] = df.iloc[:,0].apply(process)

In [12]:
df.head()

Unnamed: 0,Text,Author
0,photo-realistic image rendering using standard...,1
1,large hadron collider exploring physic energy ...,1
2,"rapid development mobile internet technology ,...",1
3,"contour tracking method grouped two category ,...",1
4,year 2010 regarded breakthrough year 3d video ...,1


In [13]:
X = df['Text']
Y = df['Author']

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 42)

In [15]:
vectorizer = TfidfVectorizer()

In [16]:
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test) 

In [17]:
model = LogisticRegression()
params = {
    'penalty' : ['l1', 'l2', 'elasticnet', None],
    'dual' : [True, False],
    'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'multi_class' : ['auto', 'ovr', 'multinomial'],
}

In [18]:
random = RandomizedSearchCV(estimator = model, param_distributions = params, n_iter = 30, n_jobs = -1)

In [19]:
random.fit(X_train_vec, Y_train)

0,1,2
,estimator,LogisticRegression()
,param_distributions,"{'dual': [True, False], 'multi_class': ['auto', 'ovr', ...], 'penalty': ['l1', 'l2', ...], 'solver': ['lbfgs', 'liblinear', ...]}"
,n_iter,30
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [20]:
random.score(X_train_vec, Y_train)

0.6928007889546351

In [21]:
random.score(X_test_vec, Y_test)

0.6687615526802219

In [22]:
import pickle

with open("model.pkl", "wb") as f:
    pickle.dump(random, f)
with open("vectorizer.pkl", "wb") as fv:
    pickle.dump(vectorizer, fv)