In [1]:
import csv
import os

import numpy as np
import pandas as pd

In [2]:
def split_train_test(data, test_ratio, seed=2022011107):
    """Randomly partition ``data`` into a train part and a test part.

    Row positions are shuffled with a private, seeded NumPy generator, so the
    same ``len(data)`` and ``seed`` always give the same split, and the
    process-wide ``np.random`` state is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame (or any object supporting ``len`` and ``.iloc``)
        Table whose rows are to be split.
    test_ratio : float
        Fraction of rows, within ``[0, 1]``, assigned to the test part. The
        test size is ``int(len(data) * test_ratio)``, i.e. rounded down.
    seed : int, optional
        Seed for the shuffle. The default keeps the split identical to the
        one this function produced when the seed was hard-coded.

    Returns
    -------
    tuple
        ``(train, test)``: the ``data.iloc`` selections for the train rows
        and the test rows, each in shuffled order.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    # A negative ratio would otherwise slice from the end of the permutation
    # and silently return a nonsensical split.
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # A local RandomState produces the same permutation as
    # np.random.seed(seed) followed by np.random.permutation(...), but does
    # not reseed the global generator shared by all other np.random users.
    rng = np.random.RandomState(seed)
    shuffled_indices = rng.permutation(len(data))

    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [3]:
# Read the raw training CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer='./raw_data/raw_train.csv')

In [4]:
# Hold out 20% of the rows as the test split, then report the row count of each part.
train_set, test_set = split_train_test(data, test_ratio=0.2)
print(train_set.shape[0], test_set.shape[0])

17118 4279


In [6]:
# Persist both splits as CSV files, leaving out the DataFrame index column.
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (a no-op when it is already there).
os.makedirs("./final_dataset", exist_ok=True)
train_set.to_csv("./final_dataset/train.csv", index=False)
test_set.to_csv("./final_dataset/test.csv", index=False)

In [10]:
# Preview the first five rows of each split; the pair is displayed as a tuple.
train_set.head(n=5), test_set.head(n=5)

(             image_id  label
 10723  2909436722.jpg      4
 20125   750645140.jpg      2
 11451  3045665893.jpg      3
 16837   403051866.jpg      1
 445    1079216156.jpg      3,
              image_id  label
 1836   1323935440.jpg      3
 11038  2967863031.jpg      3
 7214   2286796247.jpg      3
 17080  4073523035.jpg      0
 4300   1768676175.jpg      3)

In [12]:
# Collect the image file names of each split as plain Python lists.
train_list = train_set["image_id"].tolist()
test_list = test_set["image_id"].tolist()
# Echo both lists as the cell output.
(train_list, test_list)

(['2909436722.jpg',
  '750645140.jpg',
  '3045665893.jpg',
  '403051866.jpg',
  '1079216156.jpg',
  '654782602.jpg',
  '711729648.jpg',
  '1669524287.jpg',
  '850402862.jpg',
  '3751355008.jpg',
  '348338717.jpg',
  '2460780852.jpg',
  '3609925731.jpg',
  '357852038.jpg',
  '2324206704.jpg',
  '1601382474.jpg',
  '3221163634.jpg',
  '2227901688.jpg',
  '2255058725.jpg',
  '1760111607.jpg',
  '1796303978.jpg',
  '1329083657.jpg',
  '2531944838.jpg',
  '499562675.jpg',
  '383010282.jpg',
  '431419127.jpg',
  '1123269893.jpg',
  '267223075.jpg',
  '1694570941.jpg',
  '1154479394.jpg',
  '4072724949.jpg',
  '1093539112.jpg',
  '730367123.jpg',
  '3116992469.jpg',
  '498277992.jpg',
  '2865082132.jpg',
  '84787134.jpg',
  '1948698978.jpg',
  '416297686.jpg',
  '3833575676.jpg',
  '1473359901.jpg',
  '1655292387.jpg',
  '174674584.jpg',
  '1889796065.jpg',
  '901063337.jpg',
  '1713497934.jpg',
  '3809163419.jpg',
  '4012035359.jpg',
  '4551119.jpg',
  '1759042176.jpg',
  '1870345628.jpg',
 

In [16]:
# Folders used when relocating images: the source folder with all raw images,
# followed by the destination folders for the train and test splits.
data_path, train_path, test_path = (
    './raw_data/train_images',
    './final_dataset/train_images',
    './final_dataset/test_images',
)

In [14]:
import os
import shutil

In [17]:
# Move every train-split image from the raw image folder into the train folder.
# The destination folder is created up front because shutil.move cannot place
# a file inside a directory that does not exist.
os.makedirs(train_path, exist_ok=True)
for image_name in train_list:
    src = os.path.join(data_path, image_name)
    dst = os.path.join(train_path, image_name)
    # Re-running this cell must not crash on files a previous run already
    # moved. A file missing from both places still raises in shutil.move.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

In [18]:
# Move every test-split image from the raw image folder into the test folder.
# The destination folder is created up front because shutil.move cannot place
# a file inside a directory that does not exist.
os.makedirs(test_path, exist_ok=True)
for image_name in test_list:
    src = os.path.join(data_path, image_name)
    dst = os.path.join(test_path, image_name)
    # Re-running this cell must not crash on files a previous run already
    # moved. A file missing from both places still raises in shutil.move.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

In [19]:
# Count the entries in each split folder that are files; anything for which
# os.path.isfile is false (e.g. a sub-directory) is not counted.
train_count = sum(
    1
    for entry in os.listdir(train_path)
    if os.path.isfile(os.path.join(train_path, entry))
)

test_count = sum(
    1
    for entry in os.listdir(test_path)
    if os.path.isfile(os.path.join(test_path, entry))
)

In [20]:
# Show the train table's shape next to the number of files counted in the train folder.
print(f"{train_set.shape} {train_count}")

(17118, 2) 17118


In [21]:
# Show the test table's shape next to the number of files counted in the test folder.
print(f"{test_set.shape} {test_count}")

(4279, 2) 4279
