## Imports

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import warnings
warnings.filterwarnings("ignore")

## Data loading

In [None]:
def read_file(filename: str) -> pd.DataFrame:
    """Load a whitespace-delimited dataset file into a DataFrame.

    Each non-blank line is split on whitespace. Token 0 becomes the
    ``class`` column, token 1 the ``sent`` column, and tokens 3 onward are
    re-joined with single spaces into the ``text`` column. Token 2 is not
    used.

    Args:
        filename: Path of the text file to read.

    Returns:
        A DataFrame with columns ``['class', 'sent', 'text']``, one row per
        non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank line has fewer than two tokens.
    """
    rows = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(filename) as handle:
        for line in handle:
            tokens = line.split()  # split once instead of once per column
            if not tokens:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            rows.append((tokens[0], tokens[1], ' '.join(tokens[3:])))
    return pd.DataFrame(rows, columns=['class', 'sent', 'text'])

In [None]:
# Parse both dataset splits with the shared loader (train first, then test).
df_train, df_test = (read_file(path) for path in ('train.txt', 'test.txt'))

# Cell output: number of rows in each split.
tuple(map(len, (df_train, df_test)))

## Charts

In [None]:
# sns_palette = sns.color_palette("tab10")
# Build one evenly spaced HLS colour per class label found in the training
# set. Sizing the palette from the data (instead of a fixed 6) guarantees that
# zip() below never drops a class, so every class gets an entry in `palette`.
class_labels = df_train['class'].unique()
sns_palette = sns.hls_palette(len(class_labels))

# Mapping class label -> RGB tuple, shared by all charts for consistent colours.
palette = dict(zip(class_labels, sns_palette))

# Preview the swatches, then show the mapping as the cell output.
sns.palplot(list(palette.values()))
palette

In [None]:
# Frequency of each 'sent' value per split (train first, then test).
tuple(split['sent'].value_counts() for split in (df_train, df_test))

In [None]:
# Side-by-side bar charts of the number of rows per class in each split.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(11, 3))
train_axis, test_axis = ax

train_axis.title.set_text('Train Set')
test_axis.title.set_text('Test Set')

# Both panels share the class -> colour mapping so classes are comparable.
sns.countplot(x='class', data=df_train, palette=palette, ax=train_axis)
sns.countplot(x='class', data=df_test, palette=palette, ax=test_axis)

In [None]:
# Combine both splits so the charts below describe the whole corpus.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# reset_index() keeps each row's former index label in an 'index' column.
_df = pd.concat([df_train, df_test]).reset_index()

fig, ax = plt.subplots(1, 2, figsize=(11, 4))

# Left panel: number of characters per text, one row of bins per class.
ax[0].title.set_text('Text Length')
_df['text_len'] = _df['text'].apply(len)
ax[0].set_xlim(0, 5000)  # restrict the x-axis to lengths in [0, 5000]
sns.histplot(data=_df, x='text_len', y='class', hue='class', legend=False, ax=ax[0], palette=palette)
ax[0].set_xlabel('length')
ax[0].set_ylabel(None)

# Alternative right panel (disabled): number of whitespace-separated tokens.
# ax[1].title.set_text('Tokens Number')
# _df['text_tokens_number'] = _df['text'].apply(str.split).apply(len)
# ax[1].set_xlim(0, 1000)
# sns.histplot(data=_df, x='text_tokens_number', y='class', hue='class', legend=False, ax=ax[1], palette=palette)
# ax[1].set_xlabel('num')
# ax[1].set_ylabel(None)

# Right panel: mean length of the whitespace-separated tokens of each text.
ax[1].title.set_text('Tokens Length')
_df['text_tokens_length'] = _df['text'].apply(str.split).apply(lambda x: np.mean([len(i) for i in x]))
ax[1].set_xlim(2, 6.5)  # restrict the x-axis to mean lengths in [2, 6.5]
sns.histplot(data=_df, x='text_tokens_length', y='class', hue='class', legend=False, ax=ax[1], palette=palette)
ax[1].set_xlabel('length')
ax[1].set_ylabel(None)