<a href="https://colab.research.google.com/github/keisukecl/llm-detect-ai-generated-text/blob/main/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import sys
import os
import shutil
from requests import get

pd.set_option('display.max_columns', None)

In [None]:
class Config:
    """Notebook-wide settings read by the environment setup cell."""

    # --- General ---
    # Experiment name; on Colab it names the per-experiment output folder.
    # When None, the setup cell derives a name from the running session.
    name = "EDA"
    # Not read by any cell visible in this notebook.
    debug = False

    # --- Colab Env ---
    # Not read by any cell visible in this notebook.
    upload_from_colab = True
    # Project root on the mounted Google Drive.
    drive_path = "/content/drive/MyDrive/llm-detect-ai-generated-text"

    # --- Kaggle Env ---
    # Optional directory whose "model" and "preds" sub-folders are copied
    # into the working directory by the setup cell; None skips the copy.
    kaggle_dataset_path = None

In [None]:
# True when the google.colab package is already loaded in this interpreter;
# the cells below use it to choose between the Colab and Kaggle code paths.
COLAB = "google.colab" in sys.modules

In [None]:
# Resolve the directories used by the rest of the notebook for the current
# runtime (Google Colab with Drive mounted, or a Kaggle kernel).
if COLAB:
    print("This environment is Google Colab")

    # mount (skipped when /content/drive already exists, e.g. on a re-run)
    from google.colab import drive
    if not os.path.isdir("/content/drive"):
        drive.mount('/content/drive')

    # set dirs
    DRIVE = Config.drive_path
    if Config.name is not None:
        EXP = Config.name
    else:
        # No explicit experiment name: use the name of the first session
        # reported by the local sessions endpoint, minus its last six
        # characters (presumably the ".ipynb" suffix -- TODO confirm).
        # The timeout stops the cell from hanging if the endpoint is down.
        sessions = get("http://172.28.0.2:9000/api/sessions", timeout=10).json()
        EXP = sessions[0]["name"][:-6]
    INPUT = os.path.join(DRIVE, "Input")
    OUTPUT = os.path.join(DRIVE, "Output")
    SUBMISSION = os.path.join(DRIVE, "Submission")
    OUTPUT_EXP = os.path.join(OUTPUT, EXP)
    EXP_MODEL = os.path.join(OUTPUT_EXP, "model")
    EXP_FIG = os.path.join(OUTPUT_EXP, "fig")
    EXP_PREDS = os.path.join(OUTPUT_EXP, "preds")

    # make dirs (OUTPUT and OUTPUT_EXP are created implicitly as parents)
    for d in [INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)

else:
    print("This environment is Kaggle Kernel")

    # set dirs
    # Kaggle exposes attached data under the lowercase "input" directory and
    # the filesystem is case-sensitive, so "../Input" would not be found.
    INPUT = "../input/llm-detect-ai-generated-text"
    EXP, OUTPUT, SUBMISSION = "./", "./", "./"
    # Defined here as well so both branches expose the same set of names.
    OUTPUT_EXP = EXP
    EXP_MODEL = os.path.join(EXP, "model")
    EXP_FIG = os.path.join(EXP, "fig")
    EXP_PREDS = os.path.join(EXP, "preds")

    # copy dirs
    if Config.kaggle_dataset_path is not None:
        KD_MODEL = os.path.join(Config.kaggle_dataset_path, "model")
        KD_EXP_PREDS = os.path.join(Config.kaggle_dataset_path, "preds")
        # dirs_exist_ok lets the cell be re-run after the first copy instead
        # of failing with FileExistsError.
        shutil.copytree(KD_MODEL, EXP_MODEL, dirs_exist_ok=True)
        shutil.copytree(KD_EXP_PREDS, EXP_PREDS, dirs_exist_ok=True)

    # make dirs
    for d in [EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)

This environment is Google Colab
Mounted at /content/drive


In [35]:
# load data
# The competition files have the same names on Colab and Kaggle; only INPUT
# differs. The later cells rely on the 'text', 'generated' and 'prompt_id'
# columns of train_essays.csv, so the same files are read in both environments.
train_essays = pd.read_csv(os.path.join(INPUT, "train_essays.csv"))
train_prompts = pd.read_csv(os.path.join(INPUT, "train_prompts.csv"))
test = pd.read_csv(os.path.join(INPUT, "test_essays.csv"))
#sample_submission = pd.read_csv(os.path.join(INPUT, "sample_submission.csv"))


In [36]:
# (rows, columns) of the training essays table.
train_essays.shape

(1378, 4)

In [37]:
# Preview the first rows to see which columns are available.
train_essays.head()

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


1378行のテキストがある

In [38]:
# Number of rows for each prompt_id value.
train_essays["prompt_id"].value_counts()

0    708
1    670
Name: prompt_id, dtype: int64

プロンプト0が与えられて作られたテキストは708件  
プロンプト1が与えられて作られたテキストは670件

In [39]:
# Number of rows for each value of the 'generated' label.
train_essays["generated"].value_counts()

0    1375
1       3
Name: generated, dtype: int64

生徒が書いたエッセイは1375件  
LLMが生成したエッセイは3件のみ(クラスが極端に不均衡)


In [40]:
# Non-null value counts of the remaining columns for every
# (prompt_id, generated) combination, with the keys kept as ordinary columns.
train_essays.groupby(['prompt_id', 'generated'], as_index=False).count()

Unnamed: 0,prompt_id,generated,id,text
0,0,0,707,707
1,0,1,1,1
2,1,0,668,668
3,1,1,2,2


In [26]:
# Compare the number of words in student-written and LLM-generated essays.
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Args:
        text: The essay text.

    Returns:
        The word count as an int. Non-string input (for example the NaN that
        pandas uses for a missing cell) counts as 0 words instead of raising
        ``AttributeError``.
    """
    if not isinstance(text, str):
        return 0
    return len(text.split())

# Store the per-essay word count in a new 'word_count' column, then show the
# median word count for each value of 'generated'.
train_essays['word_count'] = train_essays['text'].apply(word_count)
(
    train_essays
    .groupby('generated')[['word_count']]
    .median()
    .reset_index()
)

Unnamed: 0,generated,word_count
0,0,525.0
1,1,258.0


In [27]:
def nr_unique_word(text):
    """Return the number of distinct whitespace-separated words in ``text``.

    Words are compared case-insensitively via ``str.lower``. Punctuation is
    not stripped, so "cars" and "cars." count as two different words.

    Args:
        text: The essay text.

    Returns:
        The distinct-word count as an int. Non-string input (for example the
        NaN that pandas uses for a missing cell) yields 0.
    """
    if not isinstance(text, str):
        return 0
    return len({word.lower() for word in text.split()})

# Store the per-essay distinct-word count in a new 'uniq_word_count' column,
# then show its median for each value of 'generated'.
train_essays['uniq_word_count'] = train_essays['text'].apply(nr_unique_word)
(
    train_essays
    .groupby('generated')[['uniq_word_count']]
    .median()
    .reset_index()
)

Unnamed: 0,generated,uniq_word_count
0,0,251.0
1,1,117.0


In [42]:
from nltk.tokenize import sent_tokenize
import nltk

# sent_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# look it up under 'punkt_tab' while older ones use 'punkt', so fetch both to
# keep this cell working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Count the words in a text.
def count_words(text):
    """Return the number of whitespace-separated words in ``text``.

    Args:
        text: The essay text.

    Returns:
        The word count as an int. Non-string input (for example the NaN that
        pandas uses for a missing cell) counts as 0 words instead of raising
        ``AttributeError``.
    """
    if not isinstance(text, str):
        return 0
    return len(text.split())

# Count the sentences in a text.
def count_sentences(text):
    """Return the number of sentences in ``text`` as split by ``sent_tokenize``.

    Args:
        text: The essay text.

    Returns:
        The sentence count as an int. Non-string input (for example the NaN
        that pandas uses for a missing cell) yields 0 without calling the
        tokenizer.
    """
    if not isinstance(text, str):
        return 0
    return len(sent_tokenize(text))

# Add a 'word_count' column holding the per-essay word count.
train_essays['word_count'] = train_essays['text'].apply(count_words)

# Add an 'nr_sentences' column holding the per-essay sentence count.
train_essays['nr_sentences'] = train_essays['text'].apply(count_sentences)

# Median of 'word_count' for each value of 'generated'.
word_count_result = (
    train_essays
    .groupby('generated')[['word_count']]
    .median()
    .reset_index()
)

# Mean of 'nr_sentences' for each value of 'generated'.
sentence_count_result = (
    train_essays
    .groupby('generated')[['nr_sentences']]
    .mean()
    .reset_index()
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [43]:
# Median word count per 'generated' value.
word_count_result

Unnamed: 0,generated,word_count
0,0,525.0
1,1,258.0


In [44]:
# Mean sentence count per 'generated' value.
sentence_count_result

Unnamed: 0,generated,nr_sentences
0,0,27.659636
1,1,12.333333


In [None]:
# (rows, columns) of the prompts table.
train_prompts.shape

(2, 4)

In [None]:
# Display the whole prompts table.
train_prompts

Unnamed: 0,prompt_id,prompt_name,instructions,source_text
0,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,1,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...
