In [4]:
%pip install pandas matplotlib seaborn nltk

Collecting matplotlib
  Using cached matplotlib-3.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.57.0-cp39-cp39-macosx_10_9_universal2.whl.metadata (102 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Using cached pillow-11.1.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.1 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Collecting 

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import os

In [20]:
# Load the dataset from a CSV file; the path is relative to the working directory
df = pd.read_csv("raw/AI_Human.csv")  # change this path to point at a different dataset file

### Initial Data Inspection
##### 1. Shape and Size
##### 2. Data Types
##### 3. Missing Values
##### 4. First Few Rows
##### 5. Descriptive Statistics

In [None]:
# One print call, one report per line group (sep="\n" puts each report on its own line):
print(
    df.shape,           # shape and size as (rows, columns)
    df.dtypes,          # data type of each column
    df.isnull().sum(),  # missing values per column
    df.head(),          # first few rows
    df.describe(),      # descriptive statistics of the numeric columns
    sep="\n",
)

(487235, 2)
text          object
generated    float64
dtype: object
text         0
generated    0
dtype: int64
                                                text  generated
0  Cars. Cars have been around since they became ...        0.0
1  Transportation is a large necessity in most co...        0.0
2  "America's love affair with it's vehicles seem...        0.0
3  How often do you ride in a car? Do you drive a...        0.0
4  Cars are a wonderful thing. They are perhaps o...        0.0
           generated
count  487235.000000
mean        0.372383
std         0.483440
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000


### Text-specific EDA 

#### Basic Text Statistics:
##### 1. Document Length: Calculate the length of each text (number of characters, number of words, number of sentences).
##### 2. Average Word Length: Calculate the average length of words in each text.
##### 3. Vocabulary Size: Determine the number of unique words in each text and in the entire dataset.
##### 4. Sentence Length: Calculate the number of words per sentence.

In [7]:
# Per-document length features, stored as new columns on df.
df["char_count"] = df["text"].str.len()  # number of characters
df["word_count"] = df["text"].map(lambda doc: len(doc.split()))  # whitespace-separated tokens
# Sentence count is left disabled, as in the original cell:
#df['sentence_count'] = df['text'].apply(lambda x: len(nltk.sent_tokenize(x)))

def avg_word_len(text):
    """Return the mean length of the whitespace-separated tokens in ``text``.

    Tokens are produced by ``str.split()`` with no arguments, so any
    punctuation attached to a word counts toward that word's length.
    Returns 0 when ``text`` contains no tokens (empty or whitespace only).
    """
    tokens = text.split()
    if len(tokens) == 0:
        return 0
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens)

# Mean token length per document, then summary statistics for the three text features
df["avg_word_length"] = df["text"].map(avg_word_len)

print(df[["char_count", "word_count", "avg_word_length"]].describe())

# Vocabulary size: distinct whitespace-separated tokens across the whole corpus
# (tokens are case-sensitive and keep any attached punctuation)
all_words = " ".join(df["text"]).split()
unique_words = set(all_words)
print(f"Vocabulary size: {len(unique_words)}")

          char_count     word_count  avg_word_length
count  487235.000000  487235.000000    487235.000000
mean     2269.586592     393.096214         4.755985
std       988.814028     168.593328         0.521039
min         1.000000       0.000000         0.000000
25%      1583.000000     278.000000         4.415595
50%      2102.000000     363.000000         4.685144
75%      2724.000000     471.000000         5.020000
max     18322.000000    1668.000000       126.000000
Vocabulary size: 544828


### Dataset Splitting Function

##### Set the desired ratios

In [10]:
# Relative weights of the train / validation / test splits.
# Only the proportions matter: 1000:100:100 is the same as 10:1:1.
train_ratio, val_ratio, test_ratio = 1000, 100, 100
total_ratio = sum((train_ratio, val_ratio, test_ratio))

In [15]:
# Number of rows in each subset (adjust as needed)
subset_size = 10_000

# Count of complete subsets; floor division drops any shorter leftover from this count
num_subsets = len(df) // subset_size

##### Calculate the sizes of each split

In [11]:
# Row counts for a train/validation/test split of the whole dataset.
# Train and validation sizes are truncated to integers; the test split
# takes whatever is left so the three sizes always sum to the row count.
total_rows = len(df)
train_size = int(train_ratio / total_ratio * total_rows)
val_size = int(val_ratio / total_ratio * total_rows)
test_size = total_rows - (train_size + val_size)

In [17]:
# Shuffle all rows once before creating subsets:
#   frac=1          -> sample every row, i.e. a full permutation
#   random_state=42 -> fixed seed so the order is reproducible
# then renumber the index from 0 and discard the old one.
data = (
    df.sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

In [13]:
##### Create the splits
# Slice the shuffled frame (`data`), not the original `df`: positional slices of
# the file-ordered rows would not be random samples, so the three splits could
# end up with different label mixes.
train_data = data.iloc[:train_size]                       # first train_size rows
val_data = data.iloc[train_size:train_size + val_size]    # next val_size rows
test_data = data.iloc[train_size + val_size:]             # everything that is left

In [21]:
# Create the main directory
main_dir = "subsets"  # root output folder, relative to the working directory
os.makedirs(main_dir, exist_ok=True)  # exist_ok=True: do not raise if the folder already exists

In [23]:
def split_and_save_subset(subset, subset_dir, label, train_frac, val_frac):
    """Split one block of rows into train/validation/test parts and save them.

    The rows are sliced in their current order (no shuffling happens here):
    the first ``int(train_frac * n)`` rows become the train part, the next
    ``int(val_frac * n)`` rows the validation part, and all remaining rows
    the test part, where ``n`` is ``len(subset)``.

    Parameters
    ----------
    subset : pandas.DataFrame
        Rows to split.
    subset_dir : str
        Directory that receives ``train.csv``, ``validation.csv`` and
        ``test.csv``; it is created if it does not exist.
    label : str
        Heading line printed before the three size lines.
    train_frac, val_frac : float
        Fractions of the rows assigned to the train and validation parts.

    Returns
    -------
    tuple
        ``(train_data, val_data, test_data)`` as DataFrames.
    """
    os.makedirs(subset_dir, exist_ok=True)

    # Truncate train/validation sizes; the test part absorbs the rounding remainder
    n_rows = len(subset)
    train_size = int(train_frac * n_rows)
    val_size = int(val_frac * n_rows)

    train_data = subset.iloc[:train_size]
    val_data = subset.iloc[train_size:train_size + val_size]
    test_data = subset.iloc[train_size + val_size:]

    # Print the sizes of the resulting datasets for this subset
    print(label)
    print(f"  Train data size: {len(train_data)}")
    print(f"  Validation data size: {len(val_data)}")
    print(f"  Test data size: {len(test_data)}")

    # Save the sub-datasets to CSV files in the subset directory
    train_data.to_csv(os.path.join(subset_dir, 'train.csv'), index=False)
    val_data.to_csv(os.path.join(subset_dir, 'validation.csv'), index=False)
    test_data.to_csv(os.path.join(subset_dir, 'test.csv'), index=False)

    return train_data, val_data, test_data


# The split fractions do not change between subsets, so compute them once
train_frac = train_ratio / total_ratio
val_frac = val_ratio / total_ratio

# Create the full-size subsets: subsets/subset_1, subsets/subset_2, ...
for i in range(num_subsets):
    subset_dir = os.path.join(main_dir, f"subset_{i+1}")

    # Consecutive, non-overlapping windows of subset_size rows
    start_index = i * subset_size
    end_index = (i + 1) * subset_size
    subset = data.iloc[start_index:end_index]

    train_data, val_data, test_data = split_and_save_subset(
        subset, subset_dir, f"Subset {i+1}:", train_frac, val_frac
    )

# Handle any remaining data (fewer than subset_size rows)
if len(data) % subset_size != 0:
    remaining_dir = os.path.join(main_dir, "subset_remaining")

    start_index = num_subsets * subset_size
    remaining_data = data.iloc[start_index:]

    train_data, val_data, test_data = split_and_save_subset(
        remaining_data, remaining_dir, "Remaining Data Subset:", train_frac, val_frac
    )

Subset 1:
  Train data size: 8333
  Validation data size: 833
  Test data size: 834
Subset 2:
  Train data size: 8333
  Validation data size: 833
  Test data size: 834
Subset 3:
  Train data size: 8333
  Validation data size: 833
  Test data size: 834
Subset 4:
  Train data size: 8333
  Validation data size: 833
  Test data size: 834
Subset 5:
  Train data size: 8333
  Validation data size: 833
  Test data size: 834
Subset 6:
  Train data size: 8333
  Validation data size: 833
  Test data size: 834
Subset 7:
  Train data size: 8333
  Validation data size: 833
  Test data size: 834
Subset 8:
  Train data size: 8333
  Validation data size: 833
  Test data size: 834
Subset 9:
  Train data size: 8333
  Validation data size: 833
  Test data size: 834
Subset 10:
  Train data size: 8333
  Validation data size: 833
  Test data size: 834
Subset 11:
  Train data size: 8333
  Validation data size: 833
  Test data size: 834
Subset 12:
  Train data size: 8333
  Validation data size: 833
  Test data