In [37]:
import os
import pandas as pd

from dataset_helper_functions import *
from definitions import *


In [38]:
header = ['i', 'id', 'src', 'content', 'label']

In [39]:
combine_debates(prepend_title='after_leak')

In [40]:
train_path = os.path.join(POLIT_DATA_DIR_PATH, 'train', 'full_train_combined.tsv')
orig_train = pd.read_csv(train_path, sep='\t', index_col=False, names=header)

train_sub_val_path = os.path.join(POLIT_DATA_DIR_PATH, 'train', 'train_sub_valid_combined.tsv')
train_sub_val = pd.read_csv(train_sub_val_path, sep='\t', index_col=False, names=header)

val_path = os.path.join(POLIT_DATA_DIR_PATH, 'val', 'val_combined.tsv')
val = pd.read_csv(val_path, sep='\t', index_col=False, names=header)

test_path = os.path.join(POLIT_DATA_DIR_PATH, 'test', 'test_combined.tsv')
test = pd.read_csv(test_path, sep='\t', index_col=False, names=header)

train_path_weak = os.path.join(POLIT_DATA_DIR_PATH, 'train_weak')


data_splits = {
    'orig_train': {'df': orig_train},
    'train_sub_val': {'df': train_sub_val},
    'test': {'df': test},
    'val': {'df': val}
}

In [41]:
# orig_train[orig_train['id'].isin(val['id'])]
train_sub_val[train_sub_val['id'].isin(val['id'])]

Unnamed: 0,i,id,src,content,label


In [42]:
val_p = val[val['label'] == 1]

print(len(val_p) / len(val))

train_sub_val_p = train_sub_val[train_sub_val['label'] == 1]
print(len(train_sub_val_p) / len(train_sub_val))

0.029056312676780664
0.02803257608229747


Full train dataset analysis

In [43]:


for dsplit in data_splits:
    print(dsplit)
    df = data_splits[dsplit]['df']

    df['label'] = df['label'].astype(int)
    df_len = len(df)
    data_splits[dsplit]['len'] = df_len
    print('len: ', df_len)

    df_p = df[df['label'] == 1]
    df_p_len = len(df_p)
    p_dict = {
        'len': df_p_len,
        'ratio': df_p_len / df_len
    }
    data_splits[dsplit]['p'] = p_dict
    print('p len: ', p_dict['len'])
    print('p ratio: ', p_dict['ratio'])

    df_n = df[df['label'] == 0]
    df_n_len = len(df_n)
    n_dict = {
        'len': df_n_len,
        'ratio': df_n_len / df_len
    }
    data_splits[dsplit]['n'] = n_dict
    print('n len: ', n_dict['len'])
    print('n ratio: ', n_dict['ratio'])

print(data_splits)

orig_train
len:  15554
p len:  440
p ratio:  0.028288543140028287
n len:  15114
n ratio:  0.9717114568599717
train_sub_val
len:  11665
p len:  327
p ratio:  0.02803257608229747
n len:  11338
n ratio:  0.9719674239177025
test
len:  6478
p len:  136
p ratio:  0.020994133991972832
n len:  6342
n ratio:  0.9790058660080272
val
len:  3889
p len:  113
p ratio:  0.029056312676780664
n len:  3776
n ratio:  0.9709436873232193
{'orig_train': {'df':          i           id      src  \
0        1    201602261  BLITZER   
1        2    201602262  BLITZER   
2        3    201602263  BLITZER   
3        4    201602264   CARSON   
4        5    201602265   CARSON   
...    ...          ...      ...   
15549  406  20170928406    TRUMP   
15550  407  20170928407    TRUMP   
15551  408  20170928408    TRUMP   
15552  410  20170928410    TRUMP   
15553  411  20170928411    TRUMP   

                                                 content  label  
0      It's time for the candidates to introduce them...  

Train after validation split analysis

In [44]:
# combined_labels = pd.merge()
import seaborn as sns

temp = {k: {kk: vv for kk, vv in v.items() if kk != 'df'} for k, v in data_splits.items()}

top_keys = [val for val in list(temp.keys()) for _ in range(2)]

inner_keys = ['worthy', 'unworthy']*len(temp.keys())
values = [val['len'] for inner in temp.values() for val in inner.values() if isinstance(val, dict)]
data = list(zip(top_keys, inner_keys, values))

# print()
cols = ['dataset', 'label', 'count']
# print()
df = pd.DataFrame(data=data, columns=cols)
print(df)
import plotly.express as px
# df = pd.DataFrame(list(zip(values[0:-1:2], values[1::2]))).T
# df.columns = top_keys[:len(top_keys)//2]
# df.index = inner_keys[:2]
# df
fig = px.histogram(
    df,
    x='dataset', y='count',
    color='label', barmode='group',
    height=400
)
fig.show()

         dataset     label  count
0     orig_train    worthy    440
1     orig_train  unworthy  15114
2  train_sub_val    worthy    327
3  train_sub_val  unworthy  11338
4           test    worthy    136
5           test  unworthy   6342
6            val    worthy    113
7            val  unworthy   3776


In [45]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Define color sets of paintings
night_colors = ['rgb(56, 75, 126)', 'rgb(18, 36, 37)', 'rgb(34, 53, 101)',
                'rgb(36, 55, 57)', 'rgb(6, 4, 4)']
irises_colors = ['rgb(33, 75, 99)', 'rgb(79, 129, 102)', 'rgb(151, 179, 100)',
                 'rgb(175, 49, 35)', 'rgb(36, 73, 147)']


# Create subplots, using 'domain' type for pie charts
# datasets = ['original_train', 'train_w/o_val', 'test', 'validation']
datasets = top_keys[::2]
labels = ['worthy', 'unworthy']
specs = [[{'type':'domain'}]*2]*2
positions = list(zip([1, 1, 2, 2], [1, 2]*2))
print(positions)
fig = make_subplots(
    rows=2, cols=2,
    specs=specs,
    subplot_titles=[
        'Training Data',
        'Training Data Minus the Validation Split',
        'Test Data',
        'Validation Split'
    ],
    horizontal_spacing=0.1,
    vertical_spacing=0.15
)

# # Define pie charts
for i, dataset in enumerate(datasets):
    fig.add_trace(
        go.Pie(
            name=dataset,
            labels=labels,
            values=[temp[dataset][l]['len'] for l in ['p', 'n']],
            pull=[0.2, 0],
            scalegroup='one',
            textinfo='label+percent+value',
        ),
        *positions[i]
    )

# # Tune layout and hover info
fig.update_traces(hoverinfo='label+percent')
fig.update(layout_title_text=None,#'Class imbalance in political debates data.',
           layout_showlegend=False, layout_height=700, layout_width=900)

for annotation in fig['layout']['annotations']:
    annotation['y'] -= 0.48

# print(fig['layout']['annotations'])
fig = go.Figure(fig)
fig.show()

[(1, 1), (1, 2), (2, 1), (2, 2)]


In [46]:
# big_df = pd.DataFrame()
# for fn in os.listdir(train_path_weak):
#     fp = os.path.join(train_path_weak, fn)

    
#     big_df = big_df.append(df)
#     # long_src = df[df['src'].str.len() > p75]
#     # break
# big_df = big_df.reset_index(drop=True)
# # long_src
# big_df
# combine_debates(splits=['train_weak'])
weak_path = os.path.join(train_path_weak, 'train_weak_combined.tsv')
weak_df = pd.read_csv(weak_path, sep='\t', names=['i', 'id','src', 'content', 'label', 'score'], index_col=False)



In [47]:
# weak_df['label']
weak_p = weak_df[weak_df['label'] == 1]
weak_p_len = len(weak_p)
weak_p_len / len(weak_df)
weak_n = weak_df[weak_df['label'] == 0]
weak_n_len = len(weak_n)

print('p: ', weak_p_len / len(weak_df))
print('n: ', 1 - weak_p_len / len(weak_df))
print(len(weak_df))
print(weak_p_len)
datasets = [
    'Political Debates Dataset - Training Split',
    'Political Debates Dataset - Training Split',
    'Weakly Labelled Dataset',
    'Weakly Labelled Dataset'
]
labels = ['worthy', 'unworthy']*2
print(data_splits['orig_train'])
values = [
    data_splits['orig_train']['p']['len'],
    data_splits['orig_train']['n']['len'],
    weak_p_len,
    weak_n_len
]
data = list(zip(datasets, inner_keys, values))
# print()
cols = ['dataset', 'label', 'count']
# print()
df = pd.DataFrame(data=data, columns=cols)
print(df)
# pd.DataFrame(data=)

p:  0.20463029900804702
n:  0.7953697009919529
168758
34533
{'df':          i           id      src  \
0        1    201602261  BLITZER   
1        2    201602262  BLITZER   
2        3    201602263  BLITZER   
3        4    201602264   CARSON   
4        5    201602265   CARSON   
...    ...          ...      ...   
15549  406  20170928406    TRUMP   
15550  407  20170928407    TRUMP   
15551  408  20170928408    TRUMP   
15552  410  20170928410    TRUMP   
15553  411  20170928411    TRUMP   

                                                 content  label  
0      It's time for the candidates to introduce them...      0  
1                           You'll each have 30 seconds.      0  
2                              Dr. Carson, you're first.      0  
3      If someone had tried to describe today's Ameri...      0  
4      Americans know that our nation is heading off ...      0  
...                                                  ...    ...  
15549     If you demand it, the politi

In [59]:
labels = ['worthy', 'unworthy']
specs = [[{'type':'domain'}]*2]

datasets = [
    'Political Debates Dataset - Training Split',
    'Weakly Labelled Dataset',
]

positions = [(1, 1), (1, 2)]

fig = make_subplots(rows=1, cols=2, specs=specs, subplot_titles=datasets, horizontal_spacing=0.1)
pulls = [[0.2, 0], [0.05, 0]]
# # Define pie charts
for i, dataset in enumerate(datasets):
    fig.add_trace(
        go.Pie(
            name=dataset,
            labels=labels,
            values=df[df['dataset'] == dataset]['count'].values,
            pull=pulls[i],
            scalegroup='one',
            textinfo='label+percent+value',
        ),
        *positions[i]
    )

# # Tune layout and hover info
fig.update_traces(hoverinfo='label+percent')
fig.update(layout_title_text=None, #'Class imbalance and size comparison of Political Debates and Weakly Labelled datasets.'
           layout_showlegend=False, layout_height=600, layout_width=900)

for annotation in fig['layout']['annotations']:
    annotation['y'] -= 1.125

# print(fig['layout']['annotations'])
fig = go.Figure(fig)
fig.show()