In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load the data into a pandas dataframe
data_path = '/home/arsh/Jasleen/Spring 2023/NLP/MajorFinalProject/data/'
df=pd.read_csv(data_path+"input.csv")

# Split the data into training and testing sets
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

[nltk_data] Downloading package punkt to /home/arsh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/arsh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Preprocess the text data and extract features
def preprocess_text(text):
    # Perform any text preprocessing tasks here (e.g., tokenization, stop word removal, etc.)
    # Tag the words with their respective POS tags
    pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
    # Convert the POS tags into a string of text
    pos_text = ' '.join([tag[1] for tag in pos_tags])
    # Perform any additional text preprocessing tasks here (e.g., lowercase)
    preprocessed_text = pos_text.lower()
    return preprocessed_text

In [3]:
# Use TfidfVectorizer to extract text features
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data['text'].apply(preprocess_text))
X_test = vectorizer.transform(test_data['text'].apply(preprocess_text))


In [4]:
X_train

<9381x33 sparse matrix of type '<class 'numpy.float64'>'
	with 210316 stored elements in Compressed Sparse Row format>

In [6]:
# Add POS tagging features to the feature matrix
pos_tags_train = train_data['text'].apply(preprocess_text)
pos_tags_test = test_data['text'].apply(preprocess_text)
X_train_pos = vectorizer.fit_transform(pos_tags_train)
X_test_pos = vectorizer.transform(pos_tags_test)
X_train = np.concatenate((X_train.toarray(), X_train_pos.toarray()), axis=1)
X_test = np.concatenate((X_test.toarray(), X_test_pos.toarray()), axis=1)



In [7]:
from sklearn.metrics import accuracy_score,classification_report

# Train the random classifier
y_train = train_data['class']
y_test = test_data['class']
clf = RandomForestClassifier(n_estimators=500, random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Print the classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.6933901918976546
              precision    recall  f1-score   support

        ctrl       0.98      0.98      0.98       216
        fair       0.45      0.46      0.45       209
         gpt       0.95      0.99      0.97       205
        gpt2       0.51      0.54      0.52       201
        gpt3       0.43      0.35      0.39       213
      grover       0.44      0.59      0.50       205
       human       0.68      0.64      0.66       207
 instructgpt       0.59      0.60      0.60       231
        pplm       0.68      0.54      0.60       236
         xlm       0.97      1.00      0.98       212
       xlnet       0.97      0.97      0.97       210

    accuracy                           0.69      2345
   macro avg       0.70      0.70      0.69      2345
weighted avg       0.70      0.69      0.69      2345



In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train the Naive Bayes classifier
y_train = train_data['class']
y_test = test_data['class']
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Print the classification report
print(classification_report(y_test, y_pred))


Accuracy: 0.531769722814499
              precision    recall  f1-score   support

        ctrl       0.84      0.81      0.82       216
        fair       0.32      0.22      0.26       209
         gpt       0.76      1.00      0.86       205
        gpt2       0.27      0.43      0.33       201
        gpt3       0.32      0.25      0.28       213
      grover       0.33      0.32      0.32       205
       human       0.49      0.39      0.44       207
 instructgpt       0.50      0.55      0.52       231
        pplm       0.36      0.25      0.30       236
         xlm       0.79      0.90      0.84       212
       xlnet       0.75      0.76      0.75       210

    accuracy                           0.53      2345
   macro avg       0.52      0.53      0.52      2345
weighted avg       0.52      0.53      0.52      2345



In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train the Logistic Regression classifier
y_train = train_data['class']
y_test = test_data['class']
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Print the classification report
print(classification_report(y_test, y_pred))


Accuracy: 0.6029850746268657
              precision    recall  f1-score   support

        ctrl       0.90      0.95      0.93       216
        fair       0.35      0.44      0.39       209
         gpt       0.89      0.99      0.93       205
        gpt2       0.42      0.42      0.42       201
        gpt3       0.34      0.22      0.27       213
      grover       0.35      0.32      0.33       205
       human       0.44      0.49      0.46       207
 instructgpt       0.51      0.62      0.56       231
        pplm       0.49      0.35      0.41       236
         xlm       0.91      0.98      0.94       212
       xlnet       0.93      0.88      0.90       210

    accuracy                           0.60      2345
   macro avg       0.59      0.60      0.59      2345
weighted avg       0.59      0.60      0.59      2345



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
