# Small datasets partition

In [52]:
import pandas as pd

In [53]:
# Name of the dataset to partition, and the directory holding `<DATASET>.csv`
# (the per-split CSV files are written under `<PATH>/<DATASET>/`).
DATASET = 'USCrime'
PATH = '/home/ines/Documents/tese/tiny_gp/data'

In [54]:
# Read the raw, tab-separated dataset file.
df = pd.read_csv(f'{PATH}/{DATASET}.csv', sep='\t')

# Rename columns positionally: every column except the last becomes
# x0, x1, ...; the last column is labelled 'Target'.
cols = [f'x{k}' for k in range(len(df.columns) - 1)]
cols.append('Target')
df.columns = cols

# Bare expression so the notebook renders the frame.
df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,Target
0,79.099998,151.0,1.0,91.0,58.0,56.0,510.0,950.0,33.0,301.0,108.0,41.0,394.0,261.0
1,163.5,143.0,0.0,113.0,103.0,95.0,583.0,1012.0,13.0,102.0,96.0,36.0,557.0,194.0
2,57.799999,142.0,1.0,89.0,45.0,44.0,533.0,969.0,18.0,219.0,94.0,33.0,318.0,250.0
3,196.899994,136.0,0.0,121.0,149.0,141.0,577.0,994.0,157.0,80.0,102.0,39.0,673.0,167.0
4,123.400002,141.0,0.0,121.0,109.0,101.0,591.0,985.0,18.0,30.0,91.0,20.0,578.0,174.0
5,68.199997,121.0,0.0,110.0,118.0,115.0,547.0,964.0,25.0,44.0,84.0,29.0,689.0,126.0
6,96.300003,127.0,1.0,111.0,82.0,79.0,519.0,982.0,4.0,139.0,97.0,38.0,620.0,168.0
7,155.5,131.0,1.0,109.0,115.0,109.0,542.0,969.0,50.0,179.0,79.0,35.0,472.0,206.0
8,85.599998,157.0,1.0,90.0,65.0,62.0,553.0,955.0,39.0,286.0,81.0,28.0,421.0,239.0
9,70.5,140.0,0.0,118.0,71.0,68.0,632.0,1029.0,7.0,15.0,100.0,24.0,526.0,174.0


In [55]:
# Build 30 random 70/30 train/test partitions of `df` and write each pair to
# `<PATH>/<DATASET>/train_<i>.csv` and `<PATH>/<DATASET>/test_<i>.csv`.
nobs = len(df)

train_obs = int(nobs * 0.7)
test_obs = nobs - train_obs

for i in range(1, 31):
    # Sample while the original row labels are still attached. Resetting the
    # index *before* dropping would turn the labels into 0..train_obs-1, so
    # `df.drop(...)` would always remove the first `train_obs` rows of `df`
    # instead of the sampled ones, leaking training rows into the test set.
    # Seeding with the partition number makes every split reproducible.
    sampled = df.sample(train_obs, random_state=i)

    train_df = sampled.reset_index(drop=True)
    test_df = df.drop(sampled.index).reset_index(drop=True)

    # The test set must be exactly the complement of the training sample.
    assert len(test_df) == test_obs, (
        f'partition {i}: expected {test_obs} test rows, got {len(test_df)}'
    )

    # index=True keeps the unnamed index column that later cells read back
    # with index_col='Unnamed: 0'.
    train_df.to_csv(f'{PATH}/{DATASET}/train_{i}.csv', index=True)
    test_df.to_csv(f'{PATH}/{DATASET}/test_{i}.csv', index=True)

# SMOTE

In [71]:
# Dataset whose training splits are oversampled, and the directory that
# contains its `<DATASET>/train_<n>.csv` files.
DATASET = 'USCrime'
PATH = '/home/ines/Documents/tese/tiny_gp/data'

In [72]:
from imblearn.over_sampling import SMOTE

In [73]:
# Load the first training split; the CSV's 'Unnamed: 0' column is used as the row index.
df = pd.read_csv(f'{PATH}/{DATASET}/train_1.csv', index_col='Unnamed: 0')

In [74]:
# Bare expression: render the loaded training split in the notebook.
df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,Target
0,68.199997,121.0,0.0,110.0,118.0,115.0,547.0,964.0,25.0,44.0,84.0,29.0,689.0,126.0
1,96.800003,131.0,0.0,116.0,78.0,73.0,574.0,1038.0,7.0,36.0,142.0,42.0,540.0,176.0
2,121.599998,132.0,0.0,96.0,87.0,83.0,564.0,953.0,43.0,92.0,83.0,32.0,513.0,227.0
3,56.599998,133.0,0.0,104.0,51.0,47.0,599.0,1024.0,7.0,40.0,99.0,27.0,425.0,225.0
4,74.199997,126.0,0.0,108.0,74.0,67.0,602.0,984.0,34.0,12.0,102.0,33.0,557.0,195.0
5,45.5,139.0,1.0,88.0,46.0,41.0,480.0,968.0,19.0,49.0,135.0,53.0,457.0,249.0
6,122.5,125.0,0.0,108.0,113.0,105.0,567.0,985.0,78.0,94.0,130.0,58.0,626.0,166.0
7,75.0,130.0,0.0,116.0,128.0,128.0,536.0,934.0,51.0,24.0,78.0,34.0,627.0,135.0
8,84.900002,134.0,0.0,108.0,75.0,71.0,595.0,972.0,47.0,59.0,83.0,31.0,580.0,172.0
9,79.800003,152.0,1.0,87.0,57.0,53.0,530.0,986.0,30.0,72.0,92.0,43.0,405.0,264.0


In [75]:
import numpy as np

# Number of duplicate copies of each training split used as the artificial
# "majority" class. SMOTE then synthesises rows for the original ("minority")
# class until both classes have the same size.
OVERSAMPLING_FACTOR = 2

# For each of the 30 training splits, generate synthetic observations with
# SMOTE and write originals + synthetic rows to `oversampled_<idx>.csv`.
for idx in range(1, 31):

    df1 = pd.read_csv(f'{PATH}/{DATASET}/train_{idx}.csv', index_col='Unnamed: 0')

    # Original rows followed by OVERSAMPLING_FACTOR copies. The number of
    # copies is derived from the constant so it always matches the label
    # vector below (it used to be hard-coded to two copies).
    # 'Target' is intentionally kept among the resampled columns so that each
    # synthetic row receives an interpolated regression target; previously it
    # was dropped and then overwritten with the auxiliary class label.
    df = pd.concat([df1] + [df1.copy() for _ in range(OVERSAMPLING_FACTOR)], ignore_index=True)

    # Auxiliary class label used only to drive SMOTE:
    # 1 = original rows (minority), 0 = duplicated rows (majority).
    # Kept 1-D, which is the shape fit_resample documents for y.
    y = np.array([1] * len(df1) + [0] * (OVERSAMPLING_FACTOR * len(df1)))

    print('Number of observations', pd.Series(y, name = 'Target').value_counts())

    # NOTE(review): SMOTE interpolates every column, so columns that look
    # binary in the displayed split (e.g. x2) may get fractional values.
    sm = SMOTE(random_state=42)

    df_res, y_res = sm.fit_resample(df, y)

    print('Number of observations', pd.Series(np.asarray(y_res).reshape(-1), name = 'Target').value_counts())

    # Keep the class-1 rows only (originals plus SMOTE-generated rows); the
    # auxiliary label itself is not written to disk.
    keep = np.asarray(y_res).reshape(-1) == 1
    df_final = df_res.loc[keep].reset_index(drop=True)

    df_final.to_csv(f'{PATH}/{DATASET}/oversampled_{idx}.csv')

Number of observations Target
0    64
1    32
Name: count, dtype: int64
Number of observations Target
1    64
0    64
Name: count, dtype: int64
Number of observations Target
0    64
1    32
Name: count, dtype: int64
Number of observations Target
1    64
0    64
Name: count, dtype: int64
Number of observations Target
0    64
1    32
Name: count, dtype: int64
Number of observations Target
1    64
0    64
Name: count, dtype: int64
Number of observations Target
0    64
1    32
Name: count, dtype: int64
Number of observations Target
1    64
0    64
Name: count, dtype: int64
Number of observations Target
0    64
1    32
Name: count, dtype: int64
Number of observations Target
1    64
0    64
Name: count, dtype: int64
Number of observations Target
0    64
1    32
Name: count, dtype: int64
Number of observations Target
1    64
0    64
Name: count, dtype: int64
Number of observations Target
0    64
1    32
Name: count, dtype: int64
Number of observations Target
1    64
0    64
Name: count, dtype

# Subsample dataset

In [97]:
# Dataset whose training splits are subsampled, and the directory that
# contains its `<DATASET>/train_<n>.csv` files.
DATASET = 'BHousing'
PATH = '/home/ines/Documents/tese/tiny_gp/data'

In [98]:
# Load the first training split ('Unnamed: 0' is used as the row index)
# and render it in the notebook.
df = pd.read_csv(f'{PATH}/{DATASET}/train_1.csv', index_col='Unnamed: 0')
df

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,Target
0,1.41385,0.0,19.58,1,0.8710,6.129,96.0,1.7494,5,403,14.7,321.02,15.12,17.0
1,5.66998,0.0,18.10,1,0.6310,6.683,96.8,1.3567,24,666,20.2,375.33,3.73,50.0
2,0.06129,20.0,3.33,1,0.4429,7.645,49.7,5.2119,5,216,14.9,377.07,3.01,46.0
3,0.25356,0.0,9.90,0,0.5440,5.705,77.7,3.9450,4,304,18.4,396.42,11.50,16.2
4,2.01019,0.0,19.58,0,0.6050,7.929,96.2,2.0459,5,403,14.7,369.30,3.70,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,0.13158,0.0,10.01,0,0.5470,6.176,72.5,2.7301,6,432,17.8,393.30,12.04,21.2
350,3.47428,0.0,18.10,1,0.7180,8.780,82.9,1.9047,24,666,20.2,354.55,5.29,21.9
351,12.04820,0.0,18.10,0,0.6140,5.648,87.6,1.9512,24,666,20.2,291.55,14.10,20.8
352,8.05579,0.0,18.10,0,0.5840,5.427,95.4,2.4298,24,666,20.2,352.58,18.14,13.8


In [99]:
import numpy as np

# Each subsampled file keeps len(split) / SUBSAMPLING_FACTOR rows.
SUBSAMPLING_FACTOR = 2

# For each of the 30 training splits, draw a random subsample (without
# replacement) and write it to `subsampled_<idx>.csv`.
for idx in range(1, 31):

    df = pd.read_csv(f'{PATH}/{DATASET}/train_{idx}.csv', index_col='Unnamed: 0')

    # Size the subsample from the split that was just loaded. This used to be
    # computed before the read, so it relied on whatever `df` was left over
    # from the previous iteration or from an earlier cell.
    num_obs = int(len(df)/SUBSAMPLING_FACTOR)

    print('Number of observations', len(df))

    # Seeding with the split number makes each subsample reproducible.
    subsampled_df = df.sample(num_obs, random_state=idx).reset_index(drop=True)

    print('Number of observations', len(subsampled_df))

    subsampled_df.to_csv(f'{PATH}/{DATASET}/subsampled_{idx}.csv')

Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
Number of observations 177
Number of observations 354
N