<h3 style="text-align:center; background-color:black; color:white; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; padding: 1rem 0;">Importing Libraries</h3>

In [1]:
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from tqdm.auto import tqdm

# Register tqdm's `progress_apply` on pandas objects; the per-document
# `progress_apply` calls further down rely on this.
tqdm.pandas()

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

<h3 style="text-align:center; background-color:black; color:white; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; padding: 1rem 0;">Importing & Processing Dataset</h3>

In [2]:
# Read the train / validation / test splits from the local dataset folder.
df_train, df_Val, df_test = map(
    pd.read_csv,
    ('./dataset/train.csv', './dataset/val.csv', './dataset/test.csv'),
)

In [None]:
# Stack the three splits into one corpus-wide frame.
# `ignore_index=True` already gives the result a fresh 0..n-1 index, so a
# follow-up in-place `reset_index` would be a no-op and is not needed.
df = pd.concat([df_train, df_Val, df_test], ignore_index=True)

<h3 style="text-align:center; background-color:black; color:white; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; padding: 1rem 0;">Data Statistics</h3>

In [None]:
# One row of the combined frame corresponds to one document.
total_docs = df.shape[0]
print(f'Total number of documents: {total_docs}')

In [None]:
# Per-document token count, where a token is any whitespace-delimited chunk
# of the 'text' column. The counts are stored on the frame and then
# aggregated into a corpus total and a per-document mean.
token_counts = df['text'].progress_apply(lambda doc: len(doc.split()))
df['total_tokens'] = token_counts
total_tokens = token_counts.sum()
mean_tokens = token_counts.mean()

In [None]:
# Report the corpus-wide token total and the per-document average.
print(
    f'Total number of tokens: {total_tokens}',
    f'Mean number of tokens: {mean_tokens:.2f}',
    sep='\n',
)

In [None]:
# Approximate per-document sentence count: the number of non-blank segments
# obtained by splitting the text on '.'.
# Blank segments are skipped so that a trailing period (or a run such as
# '...') does not add phantom sentences, which a plain `len(x.split('.'))`
# would do for every document that ends with a full stop.
# NOTE: this is still a heuristic; '.' inside abbreviations or decimals is
# treated as a sentence boundary.
df['total_sentences'] = df['text'].progress_apply(
    lambda x: sum(1 for segment in x.split('.') if segment.strip())
)
total_sentences = df['total_sentences'].sum()
mean_sentences = df['total_sentences'].mean()

In [None]:
# Report the corpus-wide sentence total and the per-document average.
print(f'Total number of sentences: {total_sentences}')
# Use `.2f` (two decimal places). The previous `2f` spec only set a minimum
# field width of 2 and therefore printed the default six decimals.
print(f'Mean number of sentences: {mean_sentences:.2f}')

In [None]:
# Peek at the first row to confirm the derived count columns are present.
df.head(n=1)

In [None]:
# Percentile summary helper, reused below for the token and sentence counts.

def percentile(df, column):
    """Summarise the distribution of one numeric column.

    Args:
        df: DataFrame holding the column.
        column: Name of the column to summarise.

    Returns:
        An 8-tuple ``(p10, p25, p50, p75, p90, p95, pmin, pmax)``: the 10th,
        25th, 50th, 75th, 90th and 95th percentiles followed by the column's
        minimum and maximum.
    """
    values = df[column]
    quantile_levels = (0.1, 0.25, 0.5, 0.75, 0.9, 0.95)
    cut_points = tuple(values.quantile(level) for level in quantile_levels)
    return (*cut_points, values.min(), values.max())

In [None]:
# Percentile summary of the per-document token counts, one statistic per line.
p10, p25, p50, p75, p90, p95, pmin, pmax = percentile(df, 'total_tokens')
print(
    f'P10: {p10:.2f}',
    f'P25: {p25:.2f}',
    f'P50: {p50:.2f}',
    f'P75: {p75:.2f}',
    f'P90: {p90:.2f}',
    f'P95: {p95:.2f}',
    f'Pmin: {pmin:.2f}',
    f'Pmax: {pmax:.2f}',
    sep='\n',
)

In [None]:
# Percentile summary of the per-document sentence counts, one statistic per line.
p10, p25, p50, p75, p90, p95, pmin, pmax = percentile(df, 'total_sentences')
print(
    f'P10: {p10:.2f}',
    f'P25: {p25:.2f}',
    f'P50: {p50:.2f}',
    f'P75: {p75:.2f}',
    f'P90: {p90:.2f}',
    f'P95: {p95:.2f}',
    f'Pmin: {pmin:.2f}',
    f'Pmax: {pmax:.2f}',
    sep='\n',
)

<h3 style="text-align:center; background-color:black; color:white; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; padding: 1rem 0;">Data Visualization</h3>

In [None]:
# Pairwise distribution of the per-document token and sentence counts.
#
# `sns.pairplot` is a figure-level function: it creates its own figure, so a
# preceding `plt.figure(figsize=...)` would only leave an extra blank figure
# in the output without sizing the plot. The size is controlled through
# `height` instead (2 panels x 5 in per panel, i.e. roughly 10 x 10 in).
sns.set_style('whitegrid')
sns.pairplot(df[['total_tokens', 'total_sentences']], height=5)
# The pairplot's figure is the current figure at this point; y > 1 lifts the
# title above the top row of panels so it does not overlap them.
plt.suptitle('Distribution of Tokens and Sentences', y=1.02)
plt.show()

In [None]:
# Annotated heatmap of the correlation between the per-document token and
# sentence counts, drawn with the 'Blues' colormap on a 10 x 8.125 in figure.
corr_matrix = df[['total_tokens', 'total_sentences']].corr()
fig, ax = plt.subplots(figsize=(10, 8.125))
ax.set_title('Correlation between Tokens and Sentences')
sns.heatmap(corr_matrix, annot=True, cmap='Blues', ax=ax)
plt.show()