In [None]:
# Mount Google Drive into the Colab runtime at /content/gdrive (Colab-only import).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
#REMOVING DUPLICATES IN THE CSV FILES
import pandas as pd
def remove_duplicates(source, dest):
  """Copy the CSV at `source` to `dest` without repeated `text` values.

  Rows are compared on the `text` column only; pandas' default keeps the
  first occurrence of each text. The result is written without the
  DataFrame index. Returns None.
  """
  deduplicated = pd.read_csv(source).drop_duplicates(subset=['text'])
  deduplicated.to_csv(dest, index=False)

In [None]:
#SPLITTING UP THE DATA INTO 60% TRAINING, 20% VALIDATION AND 20% TESTING
import os
import numpy as np
import pandas as pd
import csv
import random

#in original csv file
#ROW 1 THROUGH 25996 is 0 -> human generated
#ROW 25997 THROUGH 65508 is 1 -> AI generated

def split_data(source, train_dest, val_dest, test_dest, train_size = 0.6, val_size = 0.2, seed = None):
    """Split a labelled CSV into class-balanced train / val / test CSV files.

    The header row of `source` is skipped. Column 0 is read as the text and
    column 1 as the label; only rows labelled '0' (human) or '1' (AI) are kept.
    Both classes are truncated to the size of the smaller one, so every split
    holds the same number of human and AI rows. Within each class the first
    `train_size` fraction goes to train, the next `val_size` fraction to val
    and the remainder to test. Each split is shuffled and written with the
    columns `text` and `category`.

    Args:
        source: path of the input CSV.
        train_dest: output path for the training split.
        val_dest: output path for the validation split.
        test_dest: output path for the test split.
        train_size: fraction of the balanced data used for training.
        val_size: fraction used for validation; what is left is the test set.
        seed: optional seed that makes the shuffling reproducible. When None,
            the module-level `random` generator is used, so a prior
            `random.seed(...)` call still takes effect.

    Raises:
        ValueError: if a fraction is negative or the two fractions exceed 1.
    """
    if train_size < 0 or val_size < 0 or train_size + val_size > 1 + 1e-9:
        raise ValueError("train_size and val_size must be non-negative and sum to at most 1")

    human_rows = []
    ai_rows = []

    # utf-8 matches what pandas writes/reads by default, instead of depending on the locale
    with open(source, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header row

        for row in reader:
            # csv.reader yields [] for blank lines; skip anything without text + label
            if len(row) < 2:
                continue
            text, category = row[0], row[1]
            if category == '0':
                human_rows.append([text, category])
            elif category == '1':
                ai_rows.append([text, category])

    # size of the smaller class: each class contributes exactly this many rows
    balanced = min(len(human_rows), len(ai_rows))
    train_end = round(balanced * train_size)
    val_end = round(balanced * (train_size + val_size))

    train_data = human_rows[:train_end] + ai_rows[:train_end]
    val_data = human_rows[train_end:val_end] + ai_rows[train_end:val_end]
    # cap BOTH classes at `balanced`; an open-ended slice would pull every
    # surplus row of the larger class into the test set and unbalance it
    test_data = human_rows[val_end:balanced] + ai_rows[val_end:balanced]

    # shuffle so the two classes are interleaved rather than stored back to back
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(train_data)
    rng.shuffle(val_data)
    rng.shuffle(test_data)

    # one CSV per split, all with the same two columns
    for rows, dest in ((train_data, train_dest), (val_data, val_dest), (test_data, test_dest)):
        pd.DataFrame(rows, columns=['text', 'category']).to_csv(dest, index=False)

In [None]:
#take the csv file containing the data and output a list of the form
# [ [ [word1, word2, ...], category ], [ [word1, word2, ...], category ], ... ]

def csv_to_list(source):
  """Load a split CSV and tokenise each text on whitespace.

  Reads the `text` and `category` columns of the CSV at `source` and returns
  one `[words, category]` pair per row, where `words` is `text.split()`.

  A row whose text is not a string (an empty field is read back by pandas as
  NaN) yields an empty word list. It must not reuse the previous row's words,
  which would pair this row's label with another row's text, and it must not
  fail when the very first row is empty.
  """
  df = pd.read_csv(source)

  output_list = []

  # iterate the two columns directly; cheaper than building a Series per row
  for text, category in zip(df['text'].tolist(), df['category'].tolist()):
    words = text.split() if isinstance(text, str) else []
    output_list.append([words, category])

  return output_list


In [None]:
#find the average length of each text

def average_length(source):
  """Return the mean of `len(entry[0])` over the entries of `source`.

  Callers in this notebook pass the `[words, category]` lists built by
  `csv_to_list`, so the result is the mean word count per text.
  An empty `source` raises ZeroDivisionError.
  """
  word_total = 0

  for entry in source:
    word_total += len(entry[0])

  return word_total / len(source)


In [None]:

#RUN THE PREPROCESSING PIPELINE WITH THE FUNCTIONS DEFINED ABOVE

#step 1: write a de-duplicated copy of the raw dataset
original_source = '/content/gdrive/MyDrive/ASP360Project/Data/train_v3_drcat_01.csv'
no_duplicates_source = '/content/gdrive/MyDrive/ASP360Project/Data/data_noduplicates.csv'

remove_duplicates(source=original_source, dest=no_duplicates_source)


In [None]:
#step 2: split the de-duplicated data into train / val / test csv files

train_dest = '/content/gdrive/MyDrive/ASP360Project/Data/train.csv'
val_dest = '/content/gdrive/MyDrive/ASP360Project/Data/val.csv'
test_dest = '/content/gdrive/MyDrive/ASP360Project/Data/test.csv'

split_data(
    source=no_duplicates_source,
    train_dest=train_dest,
    val_dest=val_dest,
    test_dest=test_dest,
)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
#changing the text format from a paragraph to a list of words

train_data = csv_to_list(train_dest)

test_data = csv_to_list(test_dest)

#read the validation split through val_dest, like the other two splits, so the
#path is defined in exactly one place and cannot drift from where it was written
val_data = csv_to_list(val_dest)


In [None]:
#find the average word count of the essays

#pool the three splits so every essay counts once; averaging the three per-split
#averages would over-weight essays in the smaller splits whenever sizes differ
average_word_count = average_length(train_data + val_data + test_data)

print(average_word_count)

360.61209888570903


In [None]:
#number of examples in each split, one per line: train, val, test
print(*map(len, (train_data, val_data, test_data)), sep='\n')

32846
32846
32846
