# Dataset Divider

*   Jacob Yousif

## Importing the libraries

In [1]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings raised by later cells
# (e.g. deprecation or data-conversion warnings) — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
%%capture
!pip install imbalanced-learn

In [3]:
%%capture
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

## Loading the data

In [4]:
# Load the CSV and order its rows by the AuthorCode column (ascending, the default).
file_path = 'Datasets/ProcessedSegmentedLiterature.csv'
df = pd.read_csv(file_path).sort_values('AuthorCode')

## The processing

In [5]:
# Descriptive (input) feature columns, listed in the order used for the feature matrix.
x_columns = [
    "AverageWordLength",
    "AverageSentenceLength",
    "AverageSyllablePerWord",
    "PunctuationCount",
    "FunctionalWordsCount",
    "TypeTokenRatio",
    "HonoreMeasureR",
    "Hapax",
    "SichelesMeasureS",
    "Dihapax",
    "YulesCharacteristicK",
    "SimpsonsIndex",
    "BrunetsMeasureW",
    "ShannonEntropy",
    "FleschReadingEase",
    "FleschKincaidGradeLevel",
    "DaleChallReadability",
    "GunningFog",
]

# Target column, kept as a one-element list of column names.
y_column = ['AuthorCode']

In [6]:
# Feature matrix: the selected descriptive columns as a NumPy array.
X = df.loc[:, x_columns].to_numpy()

In [7]:
# Target vector. Selecting with a list of column names yields a 2-D (n_samples, 1) array,
# which downstream estimators only accept after an implicit column-vector conversion
# (normally announced by a DataConversionWarning); flatten it explicitly to 1-D instead.
y = df[y_column].values.ravel()

In [8]:
# Balance the classes by oversampling with SMOTE.
# A fixed seed (the same value used by the train/validation/test split) makes the synthetic
# samples — and therefore the saved dataset — reproducible across runs.
# NOTE(review): resampling is performed before the data is split, so synthetic points
# interpolated from rows that later land in the validation/test partitions can leak into
# training; consider resampling the training partition only.
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

In [9]:
from collections import Counter

# Tally how many samples each class label has after resampling and report one line per class.
class_distribution = Counter(y)
print("Class Distribution:")
for label in class_distribution:
    print(f"Class {label}: {class_distribution[label]}")

Class Distribution:
Class 0: 11327
Class 1: 11327
Class 2: 11327
Class 3: 11327
Class 4: 11327
Class 5: 11327
Class 6: 11327
Class 7: 11327
Class 8: 11327
Class 9: 11327
Class 10: 11327
Class 11: 11327
Class 12: 11327
Class 13: 11327
Class 14: 11327
Class 15: 11327
Class 16: 11327
Class 17: 11327


In [10]:
# Number of rows in the resampled feature matrix.
print('The size of the dataset is:', X.shape[0])

The size of the dataset is: 203886


In [11]:
# Stratified 70/15/15 split: hold out 30% first, then halve the held-out part into
# validation and test. Both calls use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [12]:
# Learn the standardisation statistics from the training partition only, then apply the
# same fitted transform to all three partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = map(scaler.transform, (X_train, X_val, X_test))

In [13]:
# Row count of the training feature matrix.
print('The training set (Descriptive Features) size:', X_train.shape[0])

The training set (Descriptive Features) size: 142720


In [14]:
# Number of training labels.
print('The training set (Target Feature) size:', y_train.shape[0])

The training set (Target Feature) size: 142720


In [15]:
# Row count of the validation feature matrix.
print('The validation set (Descriptive Features) size:', X_val.shape[0])

The validation set (Descriptive Features) size: 30583


In [16]:
# Number of validation labels.
print('The validation set (Target Feature) size:', y_val.shape[0])

The validation set (Target Feature) size: 30583


In [17]:
# Row count of the test feature matrix.
print('The test set (Descriptive Features) size:', X_test.shape[0])

The test set (Descriptive Features) size: 30583


In [18]:
# Number of test labels.
print('The test set (Target Feature) size:', y_test.shape[0])

The test set (Target Feature) size: 30583


In [19]:
# Bundle the six partitions into a single .npz archive, each stored under its own name.
learning_set = {
    'X_train': X_train,
    'y_train': y_train,
    'X_val': X_val,
    'y_val': y_val,
    'X_test': X_test,
    'y_test': y_test,
}
np.savez('Datasets/LearningSet.npz', **learning_set)