In [1]:
import pandas as pd
import numpy as np
import torch
from collections import Counter
import random
import scipy.stats
from sklearn.preprocessing import MinMaxScaler
import joblib

In [2]:
data = pd.read_csv('/data/kpusteln/Fetal-RL/data_preparation/outputs/biometry_all_clean.csv')
frames = pd.read_csv('/data/kpusteln/Fetal-RL/data_preparation/outputs/frames_n.csv')

In [3]:
frames

Unnamed: 0,video,frames_n
0,529_3,551
1,418_3,543
2,418_2,543
3,58_3,516
4,66_3,516
...,...,...
1511,641_2,22
1512,647_2,20
1513,43_3,16
1514,651_3,15


In [4]:
merged = pd.merge(data, frames, on='video')

In [5]:
merged_cut = merged[merged['frames_n']>32]

In [6]:
merged_cut

Unnamed: 0,index,Class,video,measure,ps,frames_n
0,1_1_1,1,1_1,20.19,0.096716,68
1,1_1_2,1,1_1,20.19,0.096716,68
2,1_1_3,1,1_1,20.19,0.096716,68
3,1_1_4,1,1_1,20.19,0.096716,68
4,1_1_5,1,1_1,20.19,0.096716,68
...,...,...,...,...,...,...
226430,706_3_161,6,706_3,4.56,0.217095,165
226431,706_3_162,6,706_3,4.56,0.217095,165
226432,706_3_163,6,706_3,4.56,0.217095,165
226433,706_3_164,6,706_3,4.56,0.217095,165


In [7]:
def probability_mass(data):
    
    counts = Counter(data) # counting the classes
    total = sum(counts.values()) # total number of classes
    probability_mass = {k:v/total for k,v in counts.items()} # probability mass of the classes
    probability_mass = list(probability_mass.values()) # converting the dictionary to a list
    return probability_mass
    

def train_test_split(data, train_size = 0.7, precision = 0.005):
    """splitting data into train and test sets keeping the same distribution of classes using wasertein's method
    args: data - data frame containing the data
    train_size - size of the train set default
    precision - determines how close the train set size is to the train_size default 0.005 (the smaller the better, but it may take longer to generate sets)"""
    
    print('Splitting data into train and test sets...')
    
    #data = pd.read_csv(data) # loading the data
    wass_dist = 1
    videos = list(data['video'].unique()) # list of videos
    train_size = int(train_size * len(videos)) # calculating the number of videos in the train set
    while wass_dist > precision: # while the wasserstein distance is greater than 0.005
        train = random.sample(videos, train_size) # sampling the train set
        test = [x for x in videos if x not in train] # sampling the test set
        train_set = data.loc[data['video'].isin(train)] # creating the train set
        test_set = data.loc[data['video'].isin(test)] # creating the test set
        probability_mass_train = probability_mass(train_set['video']) # calculating the probability mass of the train set
        probability_mass_test = probability_mass(test_set['video']) # calculating the probability mass of the test set
        wass_dist = scipy.stats.wasserstein_distance(probability_mass_train, probability_mass_test) # wasserstein distance between distributions
    #train_set.to_csv('/data/kpusteln/Fetal-RL/data_preparation/data_biometry/biometry_train.csv', index = False) # saving the train set
    #test_set.to_csv('/data/kpusteln/Fetal-RL/data_preparation/data_biometry/biometry_val.csv', index = False) # saving the test set
    
    print('Done!')
    return train_set, test_set

In [8]:
train_set, test_set = train_test_split(merged_cut, train_size = 0.7, precision = 0.005)

Splitting data into train and test sets...
Done!


In [9]:
val_set, test_set = train_test_split(test_set, train_size = 0.5, precision = 0.005)

Splitting data into train and test sets...
Done!


In [10]:
test_set

Unnamed: 0,index,Class,video,measure,ps,frames_n
0,1_1_1,1,1_1,20.19,0.096716,68
1,1_1_2,1,1_1,20.19,0.096716,68
2,1_1_3,1,1_1,20.19,0.096716,68
3,1_1_4,1,1_1,20.19,0.096716,68
4,1_1_5,1,1_1,20.19,0.096716,68
...,...,...,...,...,...,...
226091,703_3_54,5,703_3,6.16,0.180680,58
226092,703_3_55,5,703_3,6.16,0.180680,58
226093,703_3_56,5,703_3,6.16,0.180680,58
226094,703_3_57,5,703_3,6.16,0.180680,58


In [23]:
train_set['measure_scaled'] = train_set['measure']/train_set['ps']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
measure_scaled = train_set['measure_scaled']

In [12]:
scaler = MinMaxScaler()
scaler.fit(measure_scaled.values.reshape(-1,1))

MinMaxScaler()

In [25]:
train_set['measure_scaled'] = scaler.transform(train_set['measure_scaled'].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
test_set['measure_scaled'] = test_set['measure']/test_set['ps']

In [14]:
val_set['measure_scaled'] = val_set['measure']/val_set['ps']

In [16]:
val_set['measure_scaled'] = scaler.transform(val_set['measure_scaled'].values.reshape(-1,1))

In [17]:
test_set['measure_scaled'] = scaler.transform(test_set['measure_scaled'].values.reshape(-1,1))

In [20]:
joblib.dump(scaler, 'scaler_filename')

['scaler_filename']

In [29]:
train_set.to_csv('/data/kpusteln/Fetal-RL/data_preparation/data_biometry/biometry_train.csv', index = False) # saving the train set
test_set.to_csv('/data/kpusteln/Fetal-RL/data_preparation/data_biometry/biometry_test.csv', index = False) # saving the test set
val_set.to_csv('/data/kpusteln/Fetal-RL/data_preparation/data_biometry/biometry_val.csv', index = False) # saving the test set


In [30]:
train_set

Unnamed: 0,index,Class,video,measure,ps,frames_n,measure_scaled
431,5_1_1,1,5_1,17.10,0.102237,83,0.740481
432,5_1_2,1,5_1,17.10,0.102237,83,0.740481
433,5_1_3,1,5_1,17.10,0.102237,83,0.740481
434,5_1_4,1,5_1,17.10,0.102237,83,0.740481
435,5_1_5,1,5_1,17.10,0.102237,83,0.740481
...,...,...,...,...,...,...,...
226430,706_3_161,6,706_3,4.56,0.217095,165,0.049686
226431,706_3_162,6,706_3,4.56,0.217095,165,0.049686
226432,706_3_163,6,706_3,4.56,0.217095,165,0.049686
226433,706_3_164,6,706_3,4.56,0.217095,165,0.049686


In [2]:
train_set = pd.read_csv('/data/kpusteln/Fetal-RL/data_preparation/data_biometry/biometry_train.csv')
test_set = pd.read_csv('/data/kpusteln/Fetal-RL/data_preparation/data_biometry/biometry_test.csv')
val_set = pd.read_csv('/data/kpusteln/Fetal-RL/data_preparation/data_biometry/biometry_val.csv')

In [10]:
train_videos = pd.DataFrame(train_videos)
test_videos = pd.DataFrame(test_videos)
val_videos = pd.DataFrame(val_videos)

In [14]:
train_videos.to_csv('/data/kpusteln/Fetal-RL/data_preparation/data_biometry/train_videos.csv', index = False)
test_videos.to_csv('/data/kpusteln/Fetal-RL/data_preparation/data_biometry/test_videos.csv', index = False)
val_videos.to_csv('/data/kpusteln/Fetal-RL/data_preparation/data_biometry/val_videos.csv', index = False)

In [17]:
train_set

Unnamed: 0,index,Class,video,measure,ps,frames_n,measure_scaled
0,5_1_1,1,5_1,17.10,0.102237,83,0.740481
1,5_1_2,1,5_1,17.10,0.102237,83,0.740481
2,5_1_3,1,5_1,17.10,0.102237,83,0.740481
3,5_1_4,1,5_1,17.10,0.102237,83,0.740481
4,5_1_5,1,5_1,17.10,0.102237,83,0.740481
...,...,...,...,...,...,...,...
158196,706_3_161,6,706_3,4.56,0.217095,165,0.049686
158197,706_3_162,6,706_3,4.56,0.217095,165,0.049686
158198,706_3_163,6,706_3,4.56,0.217095,165,0.049686
158199,706_3_164,6,706_3,4.56,0.217095,165,0.049686


In [19]:
vid = train_set['video'][0]
vid

'5_1'

In [21]:
train_videos.query('video == @vid')['frames_n'].iloc[0]

UndefinedVariableError: name 'video' is not defined