In [3]:
import pandas as pd


In [6]:
df = pd.read_csv('/content/sample_data/sentences_pairs_original.csv')

In [7]:
# Create a new dataframe with combined columns
new_df = pd.DataFrame({
    'Text': pd.concat([df['sentence'], df['ai_sentence']]),
    'Label': [0] * len(df) + [1] * len(df)
})

print(new_df)

                                                   Text  Label
0        Error: Invalid URL 'link': No scheme supplied.      0
1                       Perhaps you meant https://link?      0
2     The map runs to sixteen laminated foolscap pag...      0
3     I have been given it on the condition that I d...      0
4     It is not like any map I have ever seen, and I...      0
...                                                 ...    ...
2543                  The aperture drastically narrows!      1
2544  The captain states, "Diving daDoria is akin to...      1
2545  "He advises the diver to fasten the mask strap...      1
2546   "Once, there existed a certain female entity..."      1
2547  Refashion the given phrase into an AI-like syn...      1

[5096 rows x 2 columns]


In [8]:
shuffled_df = new_df.sample(frac=1).reset_index(drop=True)
print(shuffled_df.head(50))

                                                 Text  Label
0                                 Their teeth clench.      0
1   I found myself uncertain about the anticipated...      1
2    Resources are to be shared, even with strangers.      0
3   Occasionally, on a few select occasions annual...      1
4   Following the conclusion of his occupied New Y...      1
5   Although Steves spends nearly half his life tr...      0
6   So when he finishes building a gyroscope out o...      0
7   The captain states, "Diving daDoria is akin to...      1
8   The latest photographic subject of the magazin...      1
9   On our final day in Paris, we regrettably lack...      1
10  Optimal, according to their unique specificati...      1
11  His preference leans towards casual dining est...      1
12  Post-dinner, upon readying for lodging at a ne...      1
13  It was an uncanny coincidence that the specifi...      1
14  His emotional state transitioned to a combinat...      1
15  You savor these last

In [9]:
import nltk
nltk.download('punkt_tab')

import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation

# Download necessary NLTK data
nltk.download('punkt')
# Download the specific English language model
nltk.download('averaged_perceptron_tagger_eng') # This line is added

def preprocess_text(text):
    # Remove punctuation, special characters, and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Perform POS tagging
    pos_tags = nltk.pos_tag(tokens)

    return pos_tags

# Apply the function to the 'Text' column of shuffled_df
shuffled_df['POS_Tags'] = shuffled_df['Text'].apply(lambda x: preprocess_text(x) if pd.notnull(x) else x)

print(shuffled_df.head(20))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


                                                 Text  Label  \
0                                 Their teeth clench.      0   
1   I found myself uncertain about the anticipated...      1   
2    Resources are to be shared, even with strangers.      0   
3   Occasionally, on a few select occasions annual...      1   
4   Following the conclusion of his occupied New Y...      1   
5   Although Steves spends nearly half his life tr...      0   
6   So when he finishes building a gyroscope out o...      0   
7   The captain states, "Diving daDoria is akin to...      1   
8   The latest photographic subject of the magazin...      1   
9   On our final day in Paris, we regrettably lack...      1   
10  Optimal, according to their unique specificati...      1   
11  His preference leans towards casual dining est...      1   
12  Post-dinner, upon readying for lodging at a ne...      1   
13  It was an uncanny coincidence that the specifi...      1   
14  His emotional state transitioned to 

In [10]:
shuffled_df.to_csv('/content/sample_data/shuffled_final_sentence.csv', index=False)

In [11]:
from collections import Counter

def normalize_pos_tags(pos_tags):
    # Count the frequency of each POS tag
    tag_counts = Counter(tag for word, tag in pos_tags)

    # Normalize the frequencies by the length of the sequence
    total_tags = sum(tag_counts.values())
    normalized_counts = {tag: count / total_tags for tag, count in tag_counts.items()}

    return normalized_counts

# Apply the function to the 'POS_Tags' column of shuffled_df
shuffled_df['Normalized_POS_Tags'] = shuffled_df['POS_Tags'].apply(lambda x: normalize_pos_tags(x) if isinstance(x, list) else x)

print(shuffled_df[['Text', 'Normalized_POS_Tags']].head())

                                                Text  \
0                                Their teeth clench.   
1  I found myself uncertain about the anticipated...   
2   Resources are to be shared, even with strangers.   
3  Occasionally, on a few select occasions annual...   
4  Following the conclusion of his occupied New Y...   

                                 Normalized_POS_Tags  
0  {'PRP$': 0.3333333333333333, 'NNS': 0.33333333...  
1  {'PRP': 0.25, 'VBD': 0.125, 'JJ': 0.25, 'IN': ...  
2  {'NNS': 0.25, 'VBP': 0.125, 'TO': 0.125, 'VB':...  
3  {'RB': 0.11428571428571428, 'IN': 0.1428571428...  
4  {'VBG': 0.07407407407407407, 'DT': 0.148148148...  


In [12]:
!pip install pickle
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# A. POS Tag Features
# Convert the normalized POS tag dictionary to a DataFrame
pos_features = pd.json_normalize(shuffled_df['Normalized_POS_Tags']).fillna(0)

# B. Text Features
# Initialize the TF-IDF Vectorizer with n-grams (unigrams and bigrams)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))


# Fit and transform the 'Text' column
tfidf_features = tfidf_vectorizer.fit_transform(shuffled_df['Text'].fillna(''))

# Convert the sparse matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# C. Combine Features
# Concatenate POS-based features with text-based features
combined_features = pd.concat([pos_features.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)
# Save the combined features DataFrame
combined_features_path = 'combined_features.csv'
combined_features.to_csv(combined_features_path, index=False)

print(f'Combined features saved as {combined_features_path}')

# Save the TF-IDF vectorizer
tfidf_vectorizer_path = 'tfidf_vectorizer.pkl'
with open(tfidf_vectorizer_path, 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

print(f'TF-IDF vectorizer saved as {tfidf_vectorizer_path}')
print(combined_features.head())



[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
[0mCombined features saved as combined_features.csv
TF-IDF vectorizer saved as tfidf_vectorizer.pkl
       PRP$       NNS        NN       PRP       VBD        JJ        IN  \
0  0.333333  0.333333  0.333333  0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.000000  0.125000  0.250000  0.125000  0.250000  0.125000   
2  0.000000  0.250000  0.000000  0.000000  0.000000  0.000000  0.125000   
3  0.028571  0.057143  0.142857  0.057143  0.028571  0.114286  0.142857   
4  0.037037  0.000000  0.185185  0.037037  0.037037  0.037037  0.148148   

         DT    VBP        TO  ...  zone where  zones  zones and  zones from  \
0  0.000000  0.000  0.000000  ...         0.0    0.0        0.0         0.0   
1  0.125000  0.000  0.000000  ...         0.0    0.0        0.0         0.0   
2  0.000000  0.125  0.125000  ...     

In [13]:
from sklearn.model_selection import train_test_split

# Split the data into training (80%) and temp (20%) sets
train_df, temp_df = train_test_split(shuffled_df, test_size=0.2, stratify=shuffled_df['Label'])

# Split the temp set into validation (10%) and test (10%) sets
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['Label'])

# Print the sizes of the splits
print(f'Training set size: {len(train_df)}')
print(f'Validation set size: {len(val_df)}')
print(f'Test set size: {len(test_df)}')

Training set size: 4076
Validation set size: 510
Test set size: 510


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score
import pickle

X_train = combined_features.loc[train_df.index]
y_train = train_df['Label']

X_val = combined_features.loc[val_df.index]
y_val = val_df['Label']

# Initialize the Logistic Regression model with L2 regularization
log_reg = LogisticRegression(penalty='l2', solver='liblinear', C=1.0)

# Train the model
log_reg.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = log_reg.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred)

auc_roc = roc_auc_score(y_val, log_reg.predict_proba(X_val)[:, 1])

print(f'Validation AUC-ROC: {auc_roc}')
print(f'Validation Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# Save the trained Logistic Regression model
model_filename = 'lg_ai_det.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(log_reg, file)

print(f'Model saved as {model_filename}')

Validation AUC-ROC: 0.929042675893887
Validation Accuracy: 0.8450980392156863
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       255
           1       0.84      0.85      0.85       255

    accuracy                           0.85       510
   macro avg       0.85      0.85      0.85       510
weighted avg       0.85      0.85      0.85       510

Model saved as lg_ai_det.pkl


In [15]:
df_structure = combined_features
# Extract column names and save them as a DataFrame
column_names_df = pd.DataFrame(columns=df_structure.columns)

# Save the column names DataFrame to a CSV file
column_names_df.to_csv('combined_features_columns.csv', index=False)

In [16]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import pickle

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load the trained Random Forest model
model_path = 'lg_ai_det.pkl'  # Path to your saved model
with open(model_path, 'rb') as file:
    rf_model = pickle.load(file)

# Load the TF-IDF vectorizer
tfidf_vectorizer_path = 'tfidf_vectorizer.pkl'  # Path to your saved TF-IDF vectorizer
with open(tfidf_vectorizer_path, 'rb') as file:
    tfidf_vectorizer = pickle.load(file)

# Define the combined features DataFrame structure
combined_features_columns = pd.read_csv('combined_features_columns.csv').columns


def preprocess_text(text):
    # Remove punctuation, special characters, and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Perform POS tagging
    pos_tags = nltk.pos_tag(tokens)

    return pos_tags

def normalize_pos_tags(pos_tags):
    # Count the frequency of each POS tag
    tag_counts = Counter(tag for word, tag in pos_tags)

    # Normalize the frequencies by the length of the sequence
    total_tags = sum(tag_counts.values())
    normalized_counts = {tag: count / total_tags for tag, count in tag_counts.items()}

    return normalized_counts

def predict_human_or_not(text):
    # Preprocess the input text
    pos_tags = preprocess_text(text)
    normalized_pos_tags = normalize_pos_tags(pos_tags)

    # Convert the normalized POS tag dictionary to a DataFrame
    pos_features = pd.json_normalize(normalized_pos_tags).fillna(0)

    # Ensure the columns match those used during training
    pos_features = pos_features.reindex(columns=combined_features_columns[:pos_features.shape[1]], fill_value=0)

    # Transform the input text using the TF-IDF vectorizer
    tfidf_features = tfidf_vectorizer.transform([text])
    tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    # Ensure the columns match those used during training
    tfidf_df = tfidf_df.reindex(columns=combined_features_columns[pos_features.shape[1]:], fill_value=0)

    # Combine POS-based features with text-based features
    combined_input_features = pd.concat([pos_features.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

    # Predict using model
    prediction = rf_model.predict(combined_input_features)
    prediction_proba = rf_model.predict_proba(combined_input_features)

    return "Not AI" if prediction[0] == 0 else "AI", prediction_proba[0]

# Example usage
input_text = """
The combination of POS tag features and TF-IDF vectorization offers a robust approach to text classification. POS tags provide insights into the grammatical structure of the text, which can differ between human and AI writing styles. For instance, AI-generated text might have more consistent and predictable grammar, while human text might be more varied.

TF-IDF vectorization, on the other hand, captures the importance of specific words within the text, which can help identify unique vocabulary patterns. For example, AI-generated text might use a more limited vocabulary or repeat certain phrases more frequently.

By integrating these features, the model can leverage both grammatical and lexical information to make more accurate classifications. This multi-faceted approach enhances the model's ability to detect subtle differences between human and AI text, improving overall classification performance.
"""
result, proba = predict_human_or_not(input_text)
print(f"Prediction: {result}")
print(f"Probability of Not AI: {round(proba[0]*100)} %")
print(f"Probability of AI: {(proba[1])*100} %")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Prediction: AI
Probability of Not AI: 32 %
Probability of AI: 68.4816031473547 %
