# Feature Set Exploration and Creation

In [None]:
!pip install lmppl

from google.colab import drive
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
import math
import nltk
from nltk.tokenize import word_tokenize
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from textblob import TextBlob
from tqdm import tqdm
import lmppl

# Runtime setup: expose Google Drive inside the Colab VM and fetch the NLTK
# 'punkt' resource before any tokenizer is called.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
nltk.download('punkt')

# Raw labelled corpus; the path is relative to the notebook's working directory.
RAW_DATA_CSV = "drive/MyDrive/AI_Human.csv"
ai_df = pd.read_csv(RAW_DATA_CSV)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In the `generated` column of the dataset loaded above, 1 means the text is AI-generated and 0 means it is human-written.

In [None]:
# Assemble the working frame: one row per essay with its text and a 0/1 label
# (label 1 = AI-generated, label 0 = human-written).

# Maximum number of human-written rows kept; every AI-generated row is kept.
N_HUMAN_ROWS = 181500
# Fixed seed so the shuffle below (and therefore the row order of everything
# derived from this frame) is reproducible after a kernel restart.
SHUFFLE_SEED = 42

feature_set = pd.DataFrame({
    "orig_str": ai_df['text'],
    "label": ai_df['generated'].astype(int),
})

human_rows = feature_set[feature_set['label'] == 0][:N_HUMAN_ROWS]
ai_rows = feature_set[feature_set['label'] == 1]
feature_set = (
    pd.concat([human_rows, ai_rows])
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Whitespace-delimited word count. split() (no argument) is used, as in the
# other text helpers of this notebook: split(" ") counted an empty "word" for
# every extra space and treated newline-separated words as a single word.
feature_set["word_count"] = [len(x.split()) for x in feature_set["orig_str"]]
feature_set

Unnamed: 0,orig_str,label,word_count
0,"Hey there! So, I'm super stoked to be writing ...",1,322
1,Phones & Driving\n\nThere is a growing discuss...,0,401
2,It Cs a common belief that setting a goal hig...,1,401
3,In the current society of rapid developments a...,1,416
4,Dear Principle.\n\nI think having the phone on...,0,208
...,...,...,...
362933,The Electoral College has been used for years....,0,485
362934,Requiring students a summer project to extend ...,0,360
362935,"In the ""challenge of exploring Venus"" the auth...",0,430
362936,"In the modern day, technology has become an in...",1,349


In [None]:
# Drop rows whose orig_str is shorter than 2 characters.
# .copy() turns the filtered slice into an independent DataFrame, so adding
# feature columns in later cells no longer raises SettingWithCopyWarning.
feature_set = feature_set[feature_set['orig_str'].str.len() >= 2].copy()

In [None]:
def avg_sentence_length(text):
    """Return the mean number of whitespace-separated words per sentence.

    Sentences are obtained from ``sent_tokenize``; when it yields no
    sentences the result is 0.
    """
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return 0
    words_per_sentence = [len(sentence.split()) for sentence in sentences]
    return sum(words_per_sentence) / len(sentences)

# Add the per-essay mean sentence length as a feature column.
sentence_length_feature = feature_set['orig_str'].apply(avg_sentence_length)
feature_set['avg_sentence_length'] = sentence_length_feature


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_set['avg_sentence_length'] = feature_set['orig_str'].apply(avg_sentence_length)


In [None]:
from textblob import TextBlob

# TextBlob sentiment polarity of each essay, stored as a feature column.
polarity_by_essay = [
    TextBlob(essay).sentiment.polarity
    for essay in feature_set['orig_str']
]
feature_set["sentiment_polarity"] = polarity_by_essay


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_set["sentiment_polarity"] = [TextBlob(text).sentiment.polarity for text in feature_set['orig_str']]


In [None]:
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')

# Built once here instead of on every call.
_PUNCTUATION_CHARS = frozenset(string.punctuation)
# Lazily filled cache so the NLTK stop-word list is loaded once rather than
# once per essay.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def count_stopwords_and_punctuation(text):
    """Count stop words and punctuation characters in ``text``.

    Args:
        text: The string to analyse.

    Returns:
        A ``(stop_words_count, punctuation_count)`` tuple.
        ``stop_words_count`` is the number of ``nltk.word_tokenize`` tokens
        whose lower-cased form is an English stop word;
        ``punctuation_count`` is the number of characters of ``text`` that
        appear in ``string.punctuation``.
    """
    stop_words = _english_stop_words()

    word_tokens = nltk.word_tokenize(text)

    stop_words_count = sum(1 for word in word_tokens if word.lower() in stop_words)

    # Punctuation is counted per character of the raw text, not per token.
    punctuation_count = sum(1 for char in text if char in _PUNCTUATION_CHARS)

    return stop_words_count, punctuation_count


# Compute (stop-word count, punctuation count) for every essay, then split the
# resulting pairs into two feature columns.
count_pairs = feature_set['orig_str'].apply(count_stopwords_and_punctuation)
stopword_counts, punctuation_counts = zip(*count_pairs)
feature_set['stopwords_count'] = stopword_counts
feature_set['punctuation_count'] = punctuation_counts

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_set['stopwords_count'], feature_set['punctuation_count'] = zip(*feature_set['orig_str'].apply(count_stopwords_and_punctuation))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_set['stopwords_count'], feature_set['punctuation_count'] = zip(*feature_set['orig_str'].apply(count_stopwords_and_punctuation))


In [None]:
!pip install textstat
from textstat import flesch_reading_ease
from textstat import flesch_kincaid_grade

# Readability features from textstat: Flesch Reading Ease and
# Flesch-Kincaid Grade, one column per metric.
readability_scorers = {
    'flesch_reading_ease': flesch_reading_ease,
    'flesch_kincaid_grade': flesch_kincaid_grade,
}
for column_name, readability_fn in readability_scorers.items():
    feature_set[column_name] = feature_set['orig_str'].apply(readability_fn)

feature_set



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_set['flesch_reading_ease'] = feature_set['orig_str'].apply(flesch_reading_ease)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_set['flesch_kincaid_grade'] = feature_set['orig_str'].apply(flesch_kincaid_grade)


Unnamed: 0,orig_str,label,word_count,avg_sentence_length,sentiment_polarity,stopwords_count,punctuation_count,flesch_reading_ease,flesch_kincaid_grade
0,"Hey there! So, I'm super stoked to be writing ...",1,322,13.583333,0.209305,157,71,73.78,6.5
1,Phones & Driving\n\nThere is a growing discuss...,0,401,31.153846,0.012037,195,46,56.93,13.0
2,It Cs a common belief that setting a goal hig...,1,401,16.833333,0.223518,165,43,71.34,7.5
3,In the current society of rapid developments a...,1,416,18.954545,0.202367,195,41,60.65,9.5
4,Dear Principle.\n\nI think having the phone on...,0,208,42.400000,-0.026786,119,13,43.06,20.4
...,...,...,...,...,...,...,...,...,...
362933,The Electoral College has been used for years....,0,485,21.260870,0.224405,267,59,75.24,8.1
362934,Requiring students a summer project to extend ...,0,360,22.687500,0.177562,173,42,56.89,11.0
362935,"In the ""challenge of exploring Venus"" the auth...",0,430,35.833333,0.098493,225,24,60.52,13.7
362936,"In the modern day, technology has become an in...",1,349,24.928571,0.058025,119,38,31.72,14.4


In [None]:
# GPT-2 perplexity of every essay, scored with lmppl.
# Essays per forward pass; lower this if the accelerator runs out of memory.
PERPLEXITY_BATCH_SIZE = 32

scorer = lmppl.LM('gpt2')
# Score the whole column in a single call. Calling get_perplexity once per
# essay printed one progress bar per row (flooding the cell output) and
# prevented lmppl from batching the texts.
feature_set["perplexity"] = scorer.get_perplexity(
    feature_set['orig_str'].tolist(), batch=PERPLEXITY_BATCH_SIZE
)
feature_set

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
100%|██████████| 1/1 [00:00<00:00, 64.18it/s]
100%|██████████| 1/1 [00:00<00:00, 52.13it/s]
100%|██████████| 1/1 [00:00<00:00, 54.30it/s]
100%|██████████| 1/1 [00:00<00:00, 35.45it/s]
100%|██████████| 1/1 [00:00<00:00, 64.60it/s]
100%|██████████| 1/1 [00:00<00:00, 56.30it/s]
100%|██████████| 1/1 [00:00<00:00, 61.87it/s]
100%|██████████| 1/1 [00:00<00:00, 62.72it/s]
100%|██████████| 1/1 [00:00<00:00, 51.42it/s]
100%|██████████| 1/1 [00:00<00:00, 63.72it/s]
100%|██████████| 1/1 [00:00<00:00, 52.26it/s]
100%|██████████| 1/1 [00:00<00:00, 43.05it/s]
100%|██████████| 1/1 [00:00<00:00, 65.97it/s]
100%|██████████| 1/1 [00:00<00:00, 56.33it/s]
100%|██████████| 1/1 [00:00<00:00, 56.11it/s]
100%|██████████| 1/1 [00:00<00:00, 50.24it/s]
100%|██████████| 1/1 [00:00<00:00, 40.37it/s]
100%|██████████| 1/1 [00:00<00:00, 56.70it/s]
100%|██████████| 1/1 [00:00<00:00, 40.06it/s]
100%|██████████| 1/1 [00:00<00:00, 56.87it/s]
100%|██████████

Unnamed: 0,orig_str,label,word_count,avg_sentence_length,sentiment_polarity,stopwords_count,punctuation_count,flesch_reading_ease,flesch_kincaid_grade,perplexity
0,"Hey there! So, I'm super stoked to be writing ...",1,322,13.583333,0.209305,157,71,73.78,6.5,17.585706
1,Phones & Driving\n\nThere is a growing discuss...,0,401,31.153846,0.012037,195,46,56.93,13.0,35.936166
2,It Cs a common belief that setting a goal hig...,1,401,16.833333,0.223518,165,43,71.34,7.5,10.100890
3,In the current society of rapid developments a...,1,416,18.954545,0.202367,195,41,60.65,9.5,15.344154
4,Dear Principle.\n\nI think having the phone on...,0,208,42.400000,-0.026786,119,13,43.06,20.4,33.218544
...,...,...,...,...,...,...,...,...,...,...
362933,The Electoral College has been used for years....,0,485,21.260870,0.224405,267,59,75.24,8.1,26.300501
362934,Requiring students a summer project to extend ...,0,360,22.687500,0.177562,173,42,56.89,11.0,37.513801
362935,"In the ""challenge of exploring Venus"" the auth...",0,430,35.833333,0.098493,225,24,60.52,13.7,54.269715
362936,"In the modern day, technology has become an in...",1,349,24.928571,0.058025,119,38,31.72,14.4,31.893595


In [None]:
def std_unique_words(text):
  """Return the population standard deviation of per-word occurrence counts.

  Words are the whitespace-separated tokens of ``text`` (case- and
  punctuation-sensitive). Each distinct word contributes one value: the
  number of times it occurs.

  Args:
    text: The string to analyse.

  Returns:
    The standard deviation (``np.std``) of the occurrence counts, or 0.0 when
    ``text`` contains no words, instead of NaN with a RuntimeWarning.
  """
  # Imported locally because numpy is not imported in the visible notebook
  # cells, which made this function fail with NameError on a fresh kernel.
  import numpy as np
  from collections import Counter

  # One pass over the tokens instead of words.count() per unique word.
  occurrences = Counter(text.split())
  if not occurrences:
    return 0.0
  return np.std(list(occurrences.values()))

def count_personal_pronouns(text):
  """Count whitespace-separated tokens that are personal pronouns.

  Matching is case-insensitive and on the whole token, so a pronoun with
  punctuation attached (for example "me.") is not counted.
  """
  pronoun_forms = {
      'i', 'me', 'my', 'mine',
      'you', 'your', 'yours',
      'he', 'him', 'his',
      'she', 'her', 'hers',
      'it', 'its',
      'we', 'us', 'our', 'ours',
      'they', 'them', 'their', 'theirs',
  }
  return sum(1 for token in text.split() if token.lower() in pronoun_forms)

def get_subjectivity(text):
  """Return the subjectivity score TextBlob reports for ``text``."""
  sentiment = TextBlob(text).sentiment
  return sentiment.subjectivity

def count_unique_pos_tags(text):
  """Return how many distinct part-of-speech tags NLTK assigns to ``text``.

  The text is tokenized with ``nltk.word_tokenize`` and tagged with
  ``nltk.pos_tag``; the result is the number of different tags seen.
  """
  tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
  return len({tag for _token, tag in tagged_tokens})

# nltk.pos_tag (called by count_unique_pos_tags) needs the perceptron tagger
# model, which no visible cell of this notebook downloads; without it a fresh
# runtime fails with LookupError.
# NOTE(review): recent NLTK releases name this resource
# 'averaged_perceptron_tagger_eng' -- confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

# Remaining lexical / stylistic features, one column each.
feature_set['unique_pos_tags'] = feature_set['orig_str'].apply(count_unique_pos_tags)
feature_set['std_unique_words'] = feature_set['orig_str'].apply(std_unique_words)
feature_set['personal_pronoun_count'] = feature_set['orig_str'].apply(count_personal_pronouns)
feature_set['sentiment_subjectivity'] = feature_set['orig_str'].apply(get_subjectivity)
# Number of double-quote characters in each essay.
feature_set['quotation_marks_count'] = feature_set['orig_str'].str.count('"')

# Persist the finished feature table to Drive (the index is written as the first column).
feature_set.to_csv("drive/MyDrive/feature_set_new.csv")
feature_set