# Here I will show that there are overlapping sentences in the train, validation, and test sets

In [None]:
cd ..

In [None]:
import pandas as pd
from mrec.data.dataset import load_data

In [None]:
# Raw dataset splits, relative to the working directory set by the `cd ..` cell.
csv_fnames = {'train': 'dataset/raw/train.csv', 'validation': 'dataset/raw/validation.csv', 'test': 'dataset/raw/test.csv'}

# Baseline-model prediction files, one per split. Kept relative (like
# `csv_fnames`) instead of an absolute path into one user's home directory,
# so the notebook can run from any checkout.
# NOTE(review): assumes the working directory is the repository root — confirm.
base_dir = 'models/baseline_model'
pred_csv_fnames = {split: f'{base_dir}/{split}-predictions.csv'
                   for split in ('train', 'validation', 'test')}

# Load the prediction CSVs; the result exposes `.train`, `.validation`, `.test`.
dataset = load_data(pred_csv_fnames)

In [None]:
# Restrict every split to the columns used in this analysis.
cols = ['_unit_id', 'relation', 'sentence', 'direction', 'term1', 'term2', 'relation_pred']
train, validation, test = (
    split_df[cols]
    for split_df in (dataset.train, dataset.validation, dataset.test)
)



In [None]:
# Preview the first five training rows (five is the default for `head`).
train.head()

### Here I will show the inconsistency in how `relation` is labeled across sentences. I will group by `_unit_id`, `relation`, `sentence`, `term1`, and `term2` and take a majority vote on `direction` to remove duplicates. Then I will show that the same sentence can have different relations.

In [None]:
# True  -> collapse rows that share `group_cols` into one row whose `direction`
#          is the majority vote of the group.
# False -> keep only rows whose relation is in `relation_type`, then keep the
#          first row seen for each `_unit_id`.
MAJORITY_VOTE_FLAG = False

# Columns that identify one group in majority-vote mode.
group_cols = ['_unit_id', 'relation', 'sentence', 'term1', 'term2', 'relation_pred']
# Relation labels kept when majority voting is switched off.
relation_type = ['causes', 'treats']


def _majority_direction(directions):
    """Return one majority value for a group's `direction` labels.

    `Series.mode` returns every tied value, so a tie would not yield a single
    direction per group; the first mode returned by pandas is used instead.
    Returns None when the group has no non-null value.
    """
    modes = directions.mode()
    return modes.iat[0] if not modes.empty else None


def _remove_duplicates(df, majority_vote):
    """Return a de-duplicated copy of one split.

    Parameters
    ----------
    df : pandas.DataFrame
        One split with the columns in `group_cols` plus `direction`.
    majority_vote : bool
        Selects the de-duplication mode described next to `MAJORITY_VOTE_FLAG`.
    """
    if majority_vote:
        return df.groupby(group_cols)['direction'].agg(_majority_direction).reset_index()
    return df[df['relation'].isin(relation_type)].drop_duplicates(subset='_unit_id')


train_no_dup = _remove_duplicates(train, MAJORITY_VOTE_FLAG)
val_no_dup = _remove_duplicates(validation, MAJORITY_VOTE_FLAG)
test_no_dup = _remove_duplicates(test, MAJORITY_VOTE_FLAG)

In [None]:
# Number of distinct sentences among the de-duplicated training rows.
train_no_dup['sentence'].nunique()

In [None]:
# One row per distinct sentence, with the number of rows it occupies.
rows_per_sentence = train_no_dup.groupby(['sentence']).size()
grouped_df = rows_per_sentence.reset_index(name='show-up counts')
print(grouped_df.shape)

# Keep only the sentences that occupy more than one row.
occurs_more_than_once = grouped_df['show-up counts'] > 1
duplicated_sentences_count = grouped_df[occurs_more_than_once].reset_index(drop=True)

print(f"Number of duplicated sentences within training set: {duplicated_sentences_count.shape[0]}")
print(f"Distribution of duplicated sentences:\n{duplicated_sentences_count['show-up counts'].value_counts()}")
duplicated_sentences_count.head()

__We see that we still have duplicated sentences. Let's look closely at a sentence that has 3 duplicates after doing the majority vote__

In [None]:
# Example sentence to inspect among the de-duplicated training rows.
sentence = '164  Babesiosis  Treatment of BABESIOSIS   +    caused by  BABESIA MICROTI.'
train_no_dup.loc[train_no_dup['sentence'] == sentence]

__This sentence has duplicates because it appears with different `_unit_id` and `relation` values. Even if we do the majority vote without grouping by `_unit_id`, the sentence is still duplicated and has different relations. Hence this train dataset is inconsistent in labeling the relation for each unique sentence__

In [None]:
# Size of the de-duplicated training set and how many of its rows repeat a
# sentence that another row already carries (rows minus distinct sentences).
dset_size = len(train_no_dup)
distinct_sentences = train_no_dup['sentence'].nunique()
duplicates = dset_size - distinct_sentences

print('Number of rows after do majority vote:', dset_size)
print('Number of duplicate sentences:', duplicates)
duplicate_pct = duplicates / dset_size * 100
print('Normalize: {:.2f}%'.format(duplicate_pct))

In [None]:
# Rows whose predicted relation differs from the labeled relation.
# NOTE(review): this counts every misclassified row of `train_no_dup`, not only
# rows whose sentence is duplicated, although the printed label says
# "duplicated" — confirm the intent.
false_train_predictions = train_no_dup[train_no_dup['relation'] != train_no_dup['relation_pred']]
misclassified_duplicated_sentences_total = false_train_predictions.shape[0]
# `dset_size` is the training-set row count computed in the previous cell.
print('Misclassified duplicated sentences: {}({:0.3f}%)\n'.format(misclassified_duplicated_sentences_total, misclassified_duplicated_sentences_total/dset_size*100))

print('Verifying that sentence is repeated within training sentence')
sample_sentence = 'Thus, the present data support the hypothesis that the therapeutic effects of CLOZAPINE in this primate model and perhaps in SCHIZOPHRENIA may be related at least in part to the restoration of DA tone in the prefrontal cortex.'
# Filter on `sample_sentence`, the sentence defined just above, rather than on
# the `sentence` variable left over from an earlier cell.
train_no_dup[train_no_dup['sentence'] == sample_sentence]

In [None]:
print(train_no_dup.shape)
# Keep only rows whose sentence occurs exactly once. `keep=False` flags every
# occurrence of a repeated sentence. The boolean mask selects rows by position,
# so no extra rows are removed even if index labels are not unique (dropping by
# label would remove every row that shares a label).
is_repeated_sentence = train_no_dup['sentence'].duplicated(keep=False)
clean_train = train_no_dup[~is_repeated_sentence]
print(clean_train.shape)

In [None]:
# Shape of the training rows left after removing every repeated sentence.
clean_train.shape

In [None]:
# Look the sample sentence up in the cleaned frame; no rows come back if the
# sentence was one of the repeated ones that were removed.
clean_train.loc[clean_train['sentence'] == sample_sentence]

__Here is what that sentence looks like in the raw train set__

In [None]:
# Show the raw training rows for the sample sentence checked in the previous
# cells.
# NOTE(review): this now filters on `sample_sentence` instead of the `sentence`
# variable from a much earlier cell, following the surrounding narrative —
# confirm that this is the sentence meant here.
train[train['sentence'] == sample_sentence]

__Here is how severe this issue is in the validation set__

In [None]:
# Duplicate-sentence summary for the validation split: rows whose sentence
# already appeared in an earlier row, as a count and as a share of all rows.
dset_size = len(val_no_dup)
duplicates = val_no_dup['sentence'].duplicated().sum()

print('Number of rows after do majority vote:', dset_size)
print('Number of duplicate sentences:', duplicates)
duplicate_pct = duplicates / dset_size * 100
print('Normalize: {:.2f}%'.format(duplicate_pct))

# Rows where the predicted relation disagrees with the labeled relation.
prediction_is_wrong = val_no_dup['relation'] != val_no_dup['relation_pred']
false_val_predictions = val_no_dup[prediction_is_wrong]
misclassified_duplicated_sentences_total = len(false_val_predictions)
misclassified_pct = misclassified_duplicated_sentences_total/dset_size*100
print('Misclassified duplicated sentences: {}({:0.3f}%)\n'.format(misclassified_duplicated_sentences_total, misclassified_pct))

__Here is how severe this issue is in the test set__

In [None]:
# Duplicate-sentence summary for the test split: rows whose sentence already
# appeared in an earlier row, as a count and as a share of all rows.
dset_size = len(test_no_dup)
duplicates = test_no_dup['sentence'].duplicated().sum()

print('Number of rows after do majority vote:', dset_size)
print('Number of duplicate sentences:', duplicates)
duplicate_pct = duplicates / dset_size * 100
print('Normalize: {:.2f}%'.format(duplicate_pct))

# Rows where the predicted relation disagrees with the labeled relation.
prediction_is_wrong = test_no_dup['relation'] != test_no_dup['relation_pred']
false_test_predictions = test_no_dup[prediction_is_wrong]
misclassified_duplicated_sentences_total = len(false_test_predictions)
misclassified_pct = misclassified_duplicated_sentences_total/dset_size*100
print('Misclassified duplicated sentences: {}({:0.3f}%)\n'.format(misclassified_duplicated_sentences_total, misclassified_pct))


#### In order to prove that there are overlapping sentences in the train, validation and test sets, I will do a majority vote on `direction` in each set to remove duplicates. Then I will concatenate the train and validation sets and check for duplicate sentences. I will also concatenate the train and test sets and check for duplicate sentences.

In [None]:
# Stack the de-duplicated train and validation rows, then count the rows whose
# sentence already appeared earlier in the stacked frame.
# NOTE(review): `duplicated()` also flags sentences repeated within a single
# split, so this count is an upper bound on the train/validation overlap.
train_and_val_dfs = [train_no_dup, val_no_dup]
train_concat_val = pd.concat(train_and_val_dfs, axis=0)
sentence_seen_before = train_concat_val['sentence'].duplicated(keep='first')
sentence_seen_before.sum()

In [None]:
# Toy frame with three identical single-column rows, used to sanity-check how
# `duplicated()` counts repeats.
data = [[name] for name in ('tom', 'tom', 'tom')]
df = pd.DataFrame(data=data, columns=['Name'])
df

In [None]:
# `duplicated()` marks every repeat after the first occurrence, so the three
# identical rows of the toy frame give a count of 2, not 3.
df.duplicated().sum()