# EDA

In [None]:
import altair as alt
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from typing import List, Set
import random

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, accuracy_score

In [None]:
# Seed handed to randomised steps further down (e.g. the train/test split).
RANDOM_SEED = 17

# The project base is the parent directory of the resolved current working directory.
working_dir = os.path.realpath('.')
project_base = os.path.dirname(working_dir)
print(f'Project base path: {project_base}')

In [None]:
# Training data is read from <project_base>/data/WikiLarge_Train.csv.
data_path = os.path.join(project_base, 'data', 'WikiLarge_Train.csv')
full_df = pd.read_csv(filepath_or_buffer=data_path)

# Quick sanity check of what was loaded: column names and (rows, columns).
print(f'full_df column names: {list(full_df)}')
print(f'full training data df shape: {full_df.shape}')

In [None]:
# Number of rows per label value; per the original note, the two classes
# are evenly balanced.
full_df['label'].value_counts()

In [None]:
# Preview the first five rows of the raw training data.
full_df.head(n=5)

In [None]:
# Split each sentence into its individual words on whitespace.
# No cleaning, tokenization or lemmatization is applied at this stage.
# TODO: try tokenizing or lemmatizing to get more matches with external data sources
full_df['sentence_word_list'] = full_df['original_text'].str.split()

In [None]:
# Sentence length, measured as the number of whitespace-separated words.
full_df['word_count'] = full_df['sentence_word_list'].str.len()

In [None]:
# Preview the first five rows again, now including the derived
# 'sentence_word_list' and 'word_count' columns.
full_df.head(n=5)

In [None]:
# For every sentence length, aggregate the 'label' column two ways:
#   'count' -> how many sentences have that length
#   'mean'  -> average label for that length
# reset_index() turns 'word_count' back into a regular column.
sentences_by_length = full_df.groupby('word_count')
word_length_distribution_df = sentences_by_length.agg({'label': ['count', 'mean']}).reset_index()

In [None]:
# Replace the two-level column index produced by the multi-function agg()
# with flat names: length, number of sentences, mean label.
flat_column_names = ['word_count', 'sentence_count', 'label']
word_length_distribution_df.columns = flat_column_names

In [None]:
# Share of the more frequent class at each sentence length: the larger of
# the mean label p and its complement 1 - p.
mean_label = word_length_distribution_df['label']
word_length_distribution_df['majority_class_prob'] = np.maximum(mean_label, 1 - mean_label)

In [None]:
# Bar chart: mean label (formatted as a percentage) for each sentence length.
word_count_axis = alt.X('word_count', title="Word Count")
simplify_percent_axis = alt.Y(
    'label',
    title="Need To Be Simplified Percent",
    axis=alt.Axis(format='%'),
)
alt.Chart(word_length_distribution_df).mark_bar().encode(
    x=word_count_axis,
    y=simplify_percent_axis,
)

In [None]:
# Orange line: majority-class share per sentence length.
line = (
    alt.Chart(word_length_distribution_df)
    .mark_line(color='orange')
    .encode(
        x=alt.X('word_count', title='Word Count'),
        y=alt.Y('majority_class_prob', title="Majority Class Percent", axis=alt.Axis(format='%')),
    )
)

# Semi-transparent bars: number of sentences per sentence length.
bar = (
    alt.Chart(word_length_distribution_df)
    .mark_bar(opacity=0.7)
    .encode(
        x=alt.X('word_count', title='Word Count'),
        y=alt.Y('sentence_count', title="Sentence Count"),
    )
)

# Overlay both marks; each keeps its own y scale because counts and
# percentages are on different ranges.
alt.layer(bar, line).resolve_scale(y='independent')

In [None]:
# Split the full frame (features and label together) into a training part and
# a held-out part. No test_size is given, so scikit-learn's default split
# proportions apply; random_state makes the split repeatable.
X_train, X_test =  train_test_split(full_df, random_state=RANDOM_SEED)

In [None]:
# Training-split lookup table: one row per sentence length ('word_count')
# with the mean 'label' observed for that length.
# 'label' is selected *before* aggregating: calling .mean() on the whole
# grouped frame would also try to average the text and word-list columns,
# which raises TypeError on pandas >= 2.0 (numeric_only defaults to False).
model_lookup_df = X_train.groupby('word_count')['label'].mean().reset_index()

In [None]:
lower_bound = 0.45
upper_bound = 0.55
correct_count = 0

# Private, seeded RNG so the reported accuracy is reproducible between runs
# without altering the global `random` state.
baseline_rng = random.Random(RANDOM_SEED)

# word_count -> mean training label for that length. A dict gives a constant-time
# lookup per sentence instead of a boolean scan of model_lookup_df per row.
train_prob_by_length = dict(zip(model_lookup_df['word_count'], model_lookup_df['label']))

# For every held-out sentence, predict the majority class for its length.
# If the training mean label for that length lies within
# [lower_bound, upper_bound] there is no clear majority, so guess at random.
# Lengths that never occur in the training split have no estimate at all
# (previously an IndexError) and are also guessed at random.
for word_count, label in zip(X_test['word_count'], X_test['label']):
    word_1_prob = train_prob_by_length.get(word_count)
    has_clear_majority = word_1_prob is not None and (
        word_1_prob < lower_bound or word_1_prob > upper_bound
    )
    if has_clear_majority:
        prediction = int(np.round(word_1_prob))
    else:
        prediction = baseline_rng.choice([0, 1])
    if prediction == label:
        correct_count += 1

print(f'{correct_count} of {len(X_test)} guesses correct - {(correct_count / len(X_test)*100):.2f}% correct')
# the original (unseeded) run reported 60.2%

In [None]:
# Print the accuracy summary from the current value of correct_count.
# NOTE: correct_count is a notebook-level variable that the threshold sweep
# below also assigns, so the number shown depends on which cell ran last.
print(f'{correct_count} of {len(X_test)} guesses correct - {(correct_count / len(X_test)*100):.2f}% correct')

In [None]:
# Sweep the "no clear majority" band [lower_bound, upper_bound] and report the
# held-out accuracy of the length-based baseline for every combination.
# NOTE: this cell reassigns the notebook-level lower_bound, upper_bound and
# correct_count variables.
x_test_len = len(X_test)

# Private, seeded RNG so the sweep prints the same numbers on every run.
grid_rng = random.Random(RANDOM_SEED)

# The training estimate for each test sentence does not depend on the
# thresholds, so resolve it once instead of once per threshold pair.
# Lengths unseen in training map to None (previously an IndexError).
train_prob_by_length = dict(zip(model_lookup_df['word_count'], model_lookup_df['label']))
test_probs_and_labels = [
    (train_prob_by_length.get(word_count), label)
    for word_count, label in zip(X_test['word_count'], X_test['label'])
]

for lower_bound in np.linspace(0.4, 0.49, 5):
    for upper_bound in np.linspace(0.5, 0.6, 5):
        correct_count = 0

        for word_1_prob, label in test_probs_and_labels:
            has_clear_majority = word_1_prob is not None and (
                word_1_prob < lower_bound or word_1_prob > upper_bound
            )
            if has_clear_majority:
                prediction = int(np.round(word_1_prob))
            else:
                # NOTE: a guess weighted by word_1_prob was considered here as an
                # alternative; the unweighted coin flip is what is evaluated.
                prediction = grid_rng.choice([0, 1])
            if prediction == label:
                correct_count += 1
        print(f'Lower Bound: {lower_bound} - Upper Bound: {upper_bound}')
        print(f'{correct_count} of {x_test_len} guesses correct - {(correct_count / x_test_len*100):.2f}% correct')