### Data Split
This notebook splits the original raw data into three sets: training, validation and testing.

The original images (`.jpg`) and the metadata CSV file (`metadata.csv`) must be located in './data/raw_data'

Folders 'data/train', 'data/validation', 'data/test' need to be created prior to the execution of this notebook

In [78]:
import os
from shutil import copyfile

import pandas as pd
from sklearn.model_selection import train_test_split

In [79]:
# Base data directory. The trailing separator is kept on purpose: other cells
# build file names by plain string concatenation onto `path`.
path = os.path.join(".", "data", "")
# Metadata CSV stored next to the raw data
file = os.path.join(path, "raw_data", "metadata.csv")
df = pd.read_csv(file, sep=",")
df.head()

Unnamed: 0,observationid,mediaid,vote,content,classid,family,genus,species,author,date,location,latitude,longitude,yearinclef,observationid2014,imageid2014,learntag
0,15010,100019,3.0,Flower,8527,Asteraceae,Achillea,Achillea millefolium L.,mathieu menand,2009-4-24,Aucamville,,,PlantCLEF2014,4688.0,52287.0,Train
1,10732,100056,4.0,Flower,4516,Ranunculaceae,Ficaria,Ficaria verna Huds.,alain06,2014-9-7,Biot,43.62495,7.1044,PlantCLEF2015,,,Train
2,24005,100087,4.0,Flower,588,Asteraceae,Centaurea,Centaurea jacea L.,thierry pernot,1800-1-1,,,,PlantCLEF2014,2559.0,32864.0,Train
3,19892,100182,4.0,Flower,9930,Orchidaceae,Epipactis,Epipactis helleborine (L.) Crantz,thierry pernot,1800-1-1,,,,PlantCLEF2014,11733.0,40782.0,Train
4,36220,10019,4.0,Flower,6487,Orchidaceae,Ophrys,Ophrys apifera Huds.,jean-claude echardour,2010-6-26,Saffré,47.49689,-1.59442,PlantCLEF2014,13210.0,8080.0,Train


In [81]:
# train_test_split only produces two parts, so it is applied twice:
#   1st call: 60% / 40%
#   2nd call: the 40% remainder is cut in half
# Final proportions: 60% training, 20% validation, 20% testing.
# Both calls are stratified on `classid` and use the same fixed seed.

# First pass: 60% train, 40% held back
X_train, X_temp, y_train, y_temp = train_test_split(
    df,
    df["classid"],
    test_size=0.4,
    random_state=1,
    stratify=df["classid"],
)

# Second pass: the held-back 40% becomes 20% test + 20% validation
X_test, X_validation, y_test, y_validation = train_test_split(
    X_temp,
    X_temp["classid"],
    test_size=0.5,
    random_state=1,
    stratify=X_temp["classid"],
)

In [82]:
# Class distribution of the full dataset: share of images per class, in percent
df["classid"].value_counts(normalize=True).mul(100)

30269    3.425446
661      3.137594
6487     3.108808
493      2.792170
151      2.734600
6547     2.705815
30471    2.590674
5148     2.533103
6437     2.446747
8600     2.446747
967      2.417962
6521     2.389177
6418     2.331606
4516     2.216465
29957    2.158895
6425     2.158895
6535     2.130109
4034     2.130109
30162    2.101324
1286     2.014968
2394     1.986183
9930     1.957398
6512     1.957398
786      1.928613
14867    1.899827
588      1.899827
565      1.871042
576      1.871042
4369     1.813472
8527     1.784686
8534     1.755901
4026     1.727116
691      1.669545
30491    1.640760
6415     1.611975
4353     1.611975
8708     1.611975
4436     1.583189
32201    1.583189
4411     1.583189
6434     1.554404
6564     1.525619
6509     1.496834
4460     1.496834
4477     1.496834
326      1.468048
1047     1.468048
6538     1.410478
2430     1.381693
6448     1.381693
Name: classid, dtype: float64

In [83]:
# The per-class distribution is computed for every split to check that the
# proportions stay roughly the same across all sets. An exact match is close to
# impossible, because the percentages do not map onto whole numbers of images.

# Class distribution of the train set, in percent
X_train["classid"].value_counts(normalize=True).mul(100)

30269    3.406910
661      3.119002
6487     3.119002
493      2.783109
151      2.735125
6547     2.687140
30471    2.591171
5148     2.543186
6437     2.447217
8600     2.447217
967      2.399232
6521     2.399232
6418     2.303263
4516     2.207294
29957    2.159309
6425     2.159309
6535     2.111324
30162    2.111324
4034     2.111324
1286     2.015355
9930     1.967370
2394     1.967370
6512     1.967370
14867    1.919386
786      1.919386
588      1.919386
576      1.871401
565      1.871401
4369     1.823417
8527     1.775432
8534     1.775432
4026     1.727447
691      1.679463
8708     1.631478
30491    1.631478
6415     1.631478
4353     1.631478
32201    1.583493
4436     1.583493
4411     1.583493
6564     1.535509
6434     1.535509
4477     1.487524
6509     1.487524
4460     1.487524
326      1.487524
1047     1.487524
2430     1.391555
6538     1.391555
6448     1.391555
Name: classid, dtype: float64

In [84]:
# Class distribution of the validation set, in percent
X_validation["classid"].value_counts(normalize=True).mul(100)

30269    3.453237
6487     3.165468
661      3.165468
493      2.877698
151      2.733813
6547     2.733813
5148     2.589928
30471    2.589928
967      2.446043
6418     2.446043
6437     2.446043
8600     2.446043
6521     2.302158
6535     2.158273
29957    2.158273
6425     2.158273
30162    2.158273
4034     2.158273
4516     2.158273
2394     2.014388
6512     2.014388
1286     2.014388
786      1.870504
14867    1.870504
588      1.870504
576      1.870504
565      1.870504
9930     1.870504
8527     1.726619
4369     1.726619
30491    1.726619
4026     1.726619
8534     1.726619
4436     1.582734
4460     1.582734
32201    1.582734
6564     1.582734
4353     1.582734
6415     1.582734
6509     1.582734
691      1.582734
6434     1.582734
8708     1.582734
4411     1.582734
326      1.438849
4477     1.438849
6538     1.438849
1047     1.438849
2430     1.294964
6448     1.294964
Name: classid, dtype: float64

In [85]:
# Class distribution of the test set, in percent
X_test["classid"].value_counts(normalize=True).mul(100)

30269    3.453237
661      3.165468
6487     3.021583
151      2.733813
6547     2.733813
493      2.733813
30471    2.589928
967      2.446043
5148     2.446043
6437     2.446043
8600     2.446043
6521     2.446043
4516     2.302158
6418     2.302158
4034     2.158273
29957    2.158273
6535     2.158273
6425     2.158273
9930     2.014388
2394     2.014388
30162    2.014388
786      2.014388
1286     2.014388
8527     1.870504
6512     1.870504
565      1.870504
14867    1.870504
588      1.870504
576      1.870504
4369     1.870504
4026     1.726619
8534     1.726619
691      1.726619
4436     1.582734
8708     1.582734
4411     1.582734
30491    1.582734
32201    1.582734
6434     1.582734
4477     1.582734
4353     1.582734
6415     1.582734
6564     1.438849
6538     1.438849
4460     1.438849
2430     1.438849
6509     1.438849
6448     1.438849
326      1.438849
1047     1.438849
Name: classid, dtype: float64

In [86]:
# Write one CSV per split into its own folder: <path>/<split>/<split>.csv
# (the original third call was missing a `+` and did not parse).
for split_name, split_df in (
    ("train", X_train),
    ("validation", X_validation),
    ("test", X_test),
):
    split_dir = os.path.join(path, split_name)
    # Create the folder if it is missing so the export does not fail on a fresh checkout
    os.makedirs(split_dir, exist_ok=True)
    split_df.to_csv(os.path.join(split_dir, split_name + ".csv"), index=False)

In [87]:
# The photos of each split are copied into that split's folder. This is not
# strictly required, but keeping one folder per set is more convenient.

def copy_photos(df, setname, base_path=None):
    """Copy the photos listed in ``df`` from the raw-data folder into a split folder.

    Each value of the ``mediaid`` column is used as a file base name: the file
    ``<base>/raw_data/<mediaid>.jpg`` is copied to ``<base>/<setname>/<mediaid>.jpg``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``mediaid`` column.
    setname : str
        Name of the destination folder under the base directory (e.g. ``"train"``).
    base_path : str, optional
        Base data directory. Defaults to the notebook-level ``path`` variable.

    Raises
    ------
    FileNotFoundError
        If a source photo does not exist (raised by ``shutil.copyfile``).
    """
    root = path if base_path is None else base_path
    src_dir = os.path.join(root, "raw_data")
    dst_dir = os.path.join(root, setname)
    # Make sure the destination exists; copyfile does not create folders
    os.makedirs(dst_dir, exist_ok=True)
    for photo in df['mediaid']:
        filename = str(photo) + ".jpg"
        copyfile(os.path.join(src_dir, filename), os.path.join(dst_dir, filename))

In [88]:
# Copy the photos of every split into its folder (test, validation, then train)
for split_frame, split_name in (
    (X_test, "test"),
    (X_validation, "validation"),
    (X_train, "train"),
):
    copy_photos(split_frame, split_name)