In [None]:
!pip install -U datasets

In [None]:
import re

from datasets import load_dataset
import plotly.express as px
import seaborn as sns
import pandas as pd

In [None]:
# Load the 'ju-resplande/qa-pt' dataset with `datasets.load_dataset`; the cells
# below read columns from its 'train' split.
data = load_dataset('ju-resplande/qa-pt')

- id

In [None]:
# Frequency table of (id prefix, domain) pairs in the train split. The id
# prefix is the id with everything from the first '-' onwards removed.
faq = (
    pd.DataFrame({
        'id': pd.Series(data['train']['id']).str.replace(r'-.*', '', regex=True),
        'domain': pd.Series(data['train']['domain']),
    })
    .value_counts()
    # Name the counts column explicitly instead of renaming the default one.
    .reset_index(name='count')
    # 10-wide right-closed bins (0, 10], ..., (210, 220]; counts above 220
    # fall outside every bin and become NaN.
    .assign(bins=lambda table: pd.cut(table['count'], range(0, 230, 10)))
)

In [None]:
# Display the (id prefix, domain) frequency table with its count and bin columns.
faq

In [None]:
# Non-null cell counts per column for the (id prefix, domain) pairs that occur exactly once.
faq.loc[faq['count'].eq(1)].count()

In [None]:
# Number of (id prefix, domain) pairs that fall into each 10-wide count bin.
faq['bins'].value_counts()

In [None]:
# Keep only the (id prefix, domain) pairs that occur more than 100 times.
faq.loc[faq['count'].gt(100)]

- domain

In [None]:
# Per-domain row counts for the train split, most frequent first.
domains = (
    pd.Series(data['train']['domain'])
    .value_counts()
    # The unnamed index becomes the 'index' column; the counts land in 'count'.
    .reset_index(name='count')
    .assign(
        # Power-of-ten bins from 1 to 10**6; include_lowest keeps count == 1.
        bins=lambda table: pd.cut(
            table['count'],
            [10 ** exponent for exponent in range(7)],
            include_lowest=True,
        ),
        # Strip everything up to and including the last '.', leaving the final label.
        domain=lambda table: table['index'].str.replace(r'.+\.', '', regex=True),
    )
)

In [None]:
# Display the per-domain counts with their power-of-ten bin and the text after the last '.'.
domains

In [None]:
# Number of domains that fall into each power-of-ten count bin.
domains['bins'].value_counts()

In [None]:
# Number of distinct domain values per suffix (the text after the last '.').
domains['domain'].value_counts()

In [None]:
# Keep only the domains that contribute more than 100000 rows.
domains.loc[domains['count'].gt(100000)]

- NaNs (missing `question_text` / `answer_title`, checked against the `'-'` placeholder)

In [None]:
# One boolean column per optional field: True where the value differs from the
# '-' placeholder, False where the field holds exactly '-'.
na = pd.DataFrame({
    field: pd.Series(data['train'][field]) != '-'
    for field in ['question_text', 'answer_title']
})

In [None]:
# Count rows for each combination of the two flags (True = value is not '-').
na.value_counts()

- balancing (distribution of the `is_accepted` label)

In [None]:
# The `is_accepted` column of the train split, kept as the label Series.
label = pd.Series(data['train']['is_accepted'])

# Show how many rows carry each label value; without this the cell produces no
# output and the class balance is never actually inspected.
label.value_counts()

- sizes (number of whitespace-separated words per text field)

In [None]:
# For each text field keep the raw text plus an `n_<field>` column holding its
# number of whitespace-separated words.
sizes = pd.DataFrame()

for column in ['question_title', 'question_text', 'answer_title', 'answer_text']:
    sizes[column] = pd.Series(data['train'][column])
    # str.split() without a separator splits on runs of whitespace and ignores
    # leading/trailing whitespace, so an empty or whitespace-only string yields
    # 0 words. The previous re.split(r'\s+', s.strip()) returned [''] for such
    # strings and counted them as 1 word. Missing values still count as 0.
    sizes[f'n_{column}'] = sizes[column].apply(lambda s: 0 if pd.isna(s) else len(s.split()))

In [None]:
# Summary statistics of the numeric (word-count) columns, rendered with three
# decimals. Formatting goes through a per-column Series.map because
# DataFrame.applymap is deprecated since pandas 2.1; the result is the same
# frame of formatted strings.
sizes.describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.99]).apply(lambda col: col.map("{:0.3f}".format))

In [None]:
# Histogram of the word counts of question titles and answer texts. The y axis
# is logarithmic so sparsely populated bins remain visible.
px.histogram(
    sizes,
    x=['n_question_title', 'n_answer_text'],
    log_y=True,
    # A title lets the figure stand alone when the notebook is skimmed.
    title='Word counts of question titles and answer texts (log-scaled y axis)',
)

In [None]:
# Same histogram on a linear y axis, with the x axis limited to 0-240 words.
px.histogram(
    sizes,
    x=['n_question_title', 'n_answer_text'],
    range_x=[0, 240],
    # A title lets the figure stand alone when the notebook is skimmed.
    title='Word counts of question titles and answer texts (0-240 words)',
)

In [None]:
# Word-count statistics of `question_text`, restricted to rows with more than
# one word, formatted with three decimals.
(
    sizes.loc[sizes['n_question_text'] > 1, 'n_question_text']
    .describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.99])
    .map("{:0.3f}".format)
)

In [None]:
# Word-count statistics of `answer_title`, restricted to rows with more than
# one word, formatted with three decimals.
(
    sizes.loc[sizes['n_answer_title'] > 1, 'n_answer_title']
    .describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.99])
    .map("{:0.3f}".format)
)

In [None]:
# `df` is another name for `sizes`, not a copy (saves RAM); every column added
# below therefore also appears on `sizes`.
df = sizes

# Attach the domain and the label from the train split.
df['domain'] = data['train']['domain']
df['is_accepted'] = data['train']['is_accepted']

# A field is flagged as present when it has more than one word.
# NOTE(review): this also flags genuine one-word texts as absent, whereas the
# NaNs section compares against '-' directly; confirm which rule is intended.
df['has_question_text'] = df['n_question_text'] > 1
df['has_answer_title'] = df['n_answer_title'] > 1

In [None]:
# Preview `df` without the word-count columns. The result is only displayed;
# `df` itself keeps every column.
df.drop(
    labels=["n_question_title", "n_question_text", "n_answer_title", "n_answer_text"],
    axis="columns",
)

In [None]:
# Pairwise correlation (DataFrame.corr default method) between the four word-count columns.
corr_num = df.loc[
    :, ["n_question_title", "n_question_text", "n_answer_title", "n_answer_text"]
].corr()
corr_num

In [None]:
# Correlation between the label and the two presence flags.
bool_df = df.loc[:, ["is_accepted", "has_question_text", "has_answer_title"]]
bool_df.corr()

In [None]:
# Contingency table: rows are `has_question_text`, columns are `is_accepted`,
# cells are row counts (0 where a combination never occurs).
count = df.pivot_table(
    index='has_question_text',
    columns='is_accepted',
    aggfunc='size',
    fill_value=0,
)
# Annotated heatmap of the counts; fmt='d' prints them as integers.
sns.heatmap(data=count, annot=True, fmt='d', cmap='Blues')