In [1]:

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

In [2]:

# Load the essay dataset from 'train_essays.csv' (resolved relative to the
# notebook's working directory) into a DataFrame. Later cells read its
# 'id', 'prompt_id', 'text' and 'generated' columns.
essays = pd.read_csv('train_essays.csv')


In [3]:

# Stop-word sets keyed by language, filled on first use so that
# `stopwords.words()` is called once instead of once per token.
_STOPWORD_CACHE = {}


def _get_stopwords(language='english'):
    """Return the NLTK stop words for `language` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def preprocess_text(text):
    """Normalise one essay into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; replace non-word characters and digits with
    spaces; drop stand-alone single ASCII letters; split on whitespace; remove
    English stop words; Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw essay text. Any non-string value (e.g. ``None`` or the NaN that
        pandas uses for a missing cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing remains.
    """
    # Missing cells arrive as float NaN / None; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Remove special characters (anything that is not a letter, digit or underscore)
    text = re.sub(r'\W', ' ', text)

    # Remove numbers
    text = re.sub(r'\d', ' ', text)

    # Remove single characters. Word boundaries are used instead of surrounding
    # whitespace so that consecutive single letters ("a b c") and letters at the
    # very start or end of the text are removed as well.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # str.split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing blanks, so no separate squeeze/strip pass is needed.
    tokens = text.split()

    # Remove stopwords and stem the words
    stop_words = _get_stopwords()
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)


In [None]:

# Run the text-cleaning function over every essay and store the result
# alongside the raw text.
cleaned = essays['text'].apply(preprocess_text)
essays['cleaned_text'] = cleaned

# Report how many essays were processed.
print(f"Number of preprocessed texts: {len(essays['cleaned_text'])}")

# Feature frame (identifiers + cleaned text) and the target label.
feature_columns = ['id', 'prompt_id', 'cleaned_text']
X = essays[feature_columns]
y = essays['generated']


In [None]:
# Split your data into training and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X is a column slice of `essays`, so the frames returned by the split may still
# be flagged as derived from it. Take an explicit copy before adding columns so
# the assignments below neither trigger SettingWithCopyWarning nor depend on
# pandas' copy/view behaviour.
X_test = X_test.copy()


# Vectorize the text data: learn the vocabulary on the training essays only,
# then reuse it to encode the test essays.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train['cleaned_text'])
X_test_vec = vectorizer.transform(X_test['cleaned_text'])


# Train your model
model = LogisticRegression()
model.fit(X_train_vec, y_train)


# Make predictions (hard class labels) for the held-out essays
predictions = model.predict(X_test_vec)


# Add predictions to X_test. `predictions` is positional and X_test keeps the
# row order used to build X_test_vec; `y_test` is aligned by index.
X_test['predicted'] = predictions
X_test['actual'] = y_test


In [None]:

# Side-by-side view of the true label and the predicted label per essay id
comparison_columns = ['id', 'actual', 'predicted']
comparison = X_test[comparison_columns]
print(comparison)


In [None]:

# Get class probabilities for the held-out essays: one row per essay,
# one column per class (columns 0 and 1 are read below).
probabilities = model.predict_proba(X_test_vec)

# Add probabilities to X_test, one column per class
X_test['prob_0'] = probabilities[:, 0]
X_test['prob_1'] = probabilities[:, 1]

# Row-wise mean of the two probability columns.
# NOTE(review): if the model has exactly two classes, prob_0 + prob_1 == 1 for
# every row, so this mean is always 0.5 and carries no information — confirm
# what this column was meant to capture.
X_test['avg_prob'] = X_test[['prob_0', 'prob_1']].mean(axis=1)


# Prints the whole test frame, including the columns added above
print(X_test)

In [None]:

# AUC - ROC curve
def plot_roc(y_true, y_score, label, ax):
    """Draw a ROC curve on ``ax`` and put the AUC in the title.

    Parameters
    ----------
    y_true : array-like
        True labels, passed straight to ``roc_curve`` / ``roc_auc_score``.
    y_score : array-like
        Scores for the same samples, passed straight to the same functions.
    label : object
        Prefix for the axis title; it is formatted with ``str.format`` rules.
    ax : matplotlib axes
        Axes to draw on. It is modified in place; nothing is returned.
    """
    # The threshold array is not needed for the plot.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)

    # ROC curve first, then the dashed diagonal as a visual baseline.
    ax.plot(false_pos_rate, true_pos_rate)
    ax.plot([0, 1], [0, 1], color='black', linestyle='--')

    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')

    title = f"{label} AUC = {roc_auc_score(y_true, y_score):.2f}"
    ax.set_title(title)



In [None]:

# ROC curves for the training set (left) and the held-out test set (right)
fig, ax = plt.subplots(1, 2, figsize=(6, 3))

# A ROC curve needs a continuous score to sweep thresholds over; hard 0/1
# predictions collapse it to a single operating point. Use the probability of
# the second class (column 1 of predict_proba) as the score.
train_scores = model.predict_proba(X_train_vec)[:, 1]
test_scores = model.predict_proba(X_test_vec)[:, 1]

# The label is a short string used as the title prefix.
plot_roc(y_train, train_scores, 'Train', ax=ax[0])
plot_roc(y_test, test_scores, 'Test', ax=ax[1])

fig.tight_layout()
plt.show()


In [None]:

# Evaluate your model: accuracy of the hard predictions against the true
# test labels, printed as a percentage with two decimals.
print(f'Accuracy: {accuracy_score(y_test, predictions) * 100:.2f} %')
