In [1]:
import nltk
import numpy as np
import pandas as pd
import spacy
import pickle
from pathlib import Path
from typing import List
from fastai.text import SpacyTokenizer

In [2]:
# Label value treated as out-of-scope: coverage() skips test rows carrying it,
# and df_stats() reports their count under the 'oos' key.
OOS_CLASS = 'NO_NODES_DETECTED'

In [3]:
# Directories the driver loop reads '<dataset>[_subset]_train.csv' and
# '<dataset>_test.csv' from. Both are relative, so they resolve against the
# kernel's current working directory.
train_dir = Path('../train/')
test_dir = Path('../test/')

In [4]:
# spaCy pipeline used by read_file() to tokenize sentences. The 'ner' and
# 'parser' components are disabled at load time; in the cells shown here only
# the token texts are read from the resulting docs.
nlp = spacy.load('en_core_web_lg',  disable=['ner', 'parser'])

In [5]:
# NOTE(review): absolute, machine-specific path -- it will likely not resolve on
# another host; consider making the location configurable.
# NOTE(review): pickle.load can execute arbitrary code while deserializing; only
# load this file if it comes from a trusted source.
# The unpickled object is wrapped in a set, which is what df_stats() tests
# token membership against. Presumably this is the wt103 index-to-string
# vocabulary (file name 'itos_wt103') -- confirm.
with open("/home/ubuntu/gaurav/.fastai/models/wt103-fwd/itos_wt103.pkl", "rb") as input_file:
    wiki_vocab = set(pickle.load(input_file))
print(len(wiki_vocab))
# Tokenizer used by df_stats(); note this is a separate object from `nlp`,
# which read_file() uses to build the 'sentence_tokens' column.
tokenizer = SpacyTokenizer('en')

60000


In [6]:
def dist(a: List[str], b: List[str], n: int = 3) -> float:
    """N-gram overlap distance between two token sequences.

    Starting from 1.0, the Jaccard similarity (shared / combined) of the two
    sequences' distinct i-gram sets is subtracted for every order i in 1..n.
    An order for which neither sequence produces any i-gram is skipped.

    Args:
        a: Tokens of the first sentence.
        b: Tokens of the second sentence.
        n: Highest n-gram order to compare (orders 1..n are used).

    Returns:
        1.0 minus the sum of the per-order Jaccard similarities. The result
        is intentionally not clamped at zero, so it is negative whenever the
        summed overlap exceeds 1 (e.g. -2.0 for two identical sequences of
        at least three tokens with the default n).
    """
    remaining = 1.0
    for order in range(1, n + 1):
        grams_a = set(nltk.ngrams(a, order))
        grams_b = set(nltk.ngrams(b, order))
        combined = grams_a | grams_b
        if not combined:
            # Neither sequence has an n-gram of this order: nothing to compare.
            continue
        shared = grams_a & grams_b
        remaining -= len(shared) / len(combined)
    return remaining

In [7]:
def diversity(train_df: pd.DataFrame) -> float:
    """Average within-label pairwise distance of the training sentences.

    For every group of rows sharing a 'label', dist() is evaluated on every
    ordered pair of 'sentence_tokens' entries (each sentence is also paired
    with itself) and the sum is divided by the squared group size. The
    per-label values are then averaged over the number of labels.

    Args:
        train_df: Frame with a 'label' column and a 'sentence_tokens' column
            holding one token list per row.

    Returns:
        The mean of the per-label average pairwise distances.

    Raises:
        ZeroDivisionError: If the frame contains no label groups.
    """
    running_total = 0
    label_count = 0
    for _, rows in train_df.groupby('label'):
        sentences = list(rows['sentence_tokens'])
        pair_sum = 0
        # Explicit accumulation keeps the floating-point summation order fixed.
        for left in sentences:
            for right in sentences:
                pair_sum += dist(left, right)
        running_total += pair_sum / (len(sentences) ** 2)
        label_count += 1
    return running_total / label_count

In [8]:
def coverage(train_df: pd.DataFrame, test_df: pd.DataFrame) -> float:
    """Average best-match similarity of test sentences to same-label training sentences.

    For every in-scope label in `test_df` (rows labelled OOS_CLASS are
    skipped), each test sentence is scored with the highest `1.0 - dist(...)`
    over the training sentences of the same label. Scores are averaged per
    label, and the per-label averages are averaged over the in-scope labels.

    Args:
        train_df: Frame with 'label' and 'sentence_tokens' columns.
        test_df: Frame with 'label' and 'sentence_tokens' columns.

    Returns:
        The mean per-label coverage. A test label that has no training
        sentences contributes 0.0 for each of its sentences, and 0.0 is
        returned when `test_df` has no in-scope labels at all.
    """
    total, label_count = 0.0, 0
    for label, test_group in test_df.groupby('label'):
        if label == OOS_CLASS:
            continue
        label_count += 1
        # Materialize once per label instead of re-slicing for every test sentence.
        train_sentences = list(train_df[train_df['label'] == label]['sentence_tokens'])
        label_sum = 0.0
        for test_tokens in test_group['sentence_tokens']:
            # default=0.0: without training sentences for this label nothing
            # covers the test sentence (a bare max() would raise ValueError).
            label_sum += max(
                (1.0 - dist(train_tokens, test_tokens) for train_tokens in train_sentences),
                default=0.0,
            )
        total += label_sum / len(test_group)
    if label_count == 0:
        # Only out-of-scope rows (or no rows): avoid dividing by zero.
        return 0.0
    return total / label_count

In [9]:
def read_file(path: Path) -> pd.DataFrame:
    """Load a CSV and add a tokenized copy of every sentence.

    Args:
        path: Location of a CSV file that has a 'sentence' column.

    Returns:
        The loaded frame with an extra 'sentence_tokens' column: for each
        row, the token texts produced by `nlp` for the lower-cased, stripped
        sentence.
    """
    def _tokenize(sentence):
        normalized = sentence.lower().strip()
        return [token.text for token in nlp(normalized)]

    print(f'Reading file {path}')
    frame = pd.read_csv(str(path))
    frame['sentence_tokens'] = frame['sentence'].apply(_tokenize)
    return frame

In [10]:
def df_stats(df):
    """Summarize size, label, token-length and vocabulary statistics of a dataset.

    Each entry of the 'sentence' column is lower-cased, stripped and passed
    through `tokenizer.tokenizer`; the resulting tokens are checked against
    `wiki_vocab`.

    Args:
        df: Frame with 'sentence' and 'label' columns.

    Returns:
        A dict with the row count ('len'), the counts of rows whose label
        differs from / equals OOS_CLASS ('in-scope', 'oos'), the number of
        distinct in-scope labels ('labels'), min/max/mean/std of tokens per
        sentence ('tok_min', 'tok_max', 'tok_mean', 'tok_std'), and
        'oov_percentage': the share of distinct tokens missing from
        `wiki_vocab`, expressed as a fraction between 0 and 1.
    """
    vocabulary = set()
    lengths = []
    for raw_sentence in df['sentence']:
        tokens = tokenizer.tokenizer(raw_sentence.lower().strip())
        lengths.append(len(tokens))
        vocabulary.update(tokens)
    # Distinct tokens that the reference vocabulary does not contain.
    unknown = {tok for tok in vocabulary if tok not in wiki_vocab}

    in_scope = df[df['label'] != OOS_CLASS]
    out_of_scope = df[df['label'] == OOS_CLASS]
    return {
        'len': len(df),
        'in-scope': len(in_scope),
        'oos': len(out_of_scope),
        'labels': len(in_scope['label'].unique()),
        'tok_min': min(lengths),
        'tok_max': max(lengths),
        'tok_mean': np.mean(lengths),
        'tok_std': np.std(lengths),
        'oov_percentage': len(unknown) / len(vocabulary),
    }

In [11]:
# Datasets to evaluate. For each one, both the full ('') and the '_subset'
# training file are scored against the same test file.
datasets = ['sofmattress', 'powerplay11', 'curekart']
for dataset in datasets:
    for suf in ['', '_subset']:
        train_df = read_file(train_dir / f'{dataset}{suf}_train.csv')
        # The test path does not depend on `suf`, so the same test file is
        # re-read and re-tokenized (and its stats re-printed) on every pass.
        test_df = read_file(test_dir / f'{dataset}_test.csv')
        print('train stats:', df_stats(train_df))
        print('test stats:', df_stats(test_df))
        # diversity() evaluates every ordered pair of same-label training
        # sentences, so its cost grows quadratically with the label group size.
        print('Diversity:', diversity(train_df))
        print('Coverage:', coverage(train_df, test_df))

Reading file ../train/sofmattress_train.csv
Reading file ../test/sofmattress_test.csv
train stats: {'len': 328, 'in-scope': 328, 'oos': 0, 'labels': 21, 'tok_min': 1, 'tok_max': 28, 'tok_mean': 4.414634146341464, 'tok_std': 2.542090648688811, 'oov_percentage': 0.05084745762711865}
test stats: {'len': 397, 'in-scope': 231, 'oos': 166, 'labels': 20, 'tok_min': 1, 'tok_max': 53, 'tok_mean': 6.607052896725441, 'tok_std': 5.770746222932882, 'oov_percentage': 0.20937042459736457}
Diversity: 0.6168521547770577
Coverage: 0.4411860589631133
Reading file ../train/sofmattress_subset_train.csv
Reading file ../test/sofmattress_test.csv
train stats: {'len': 180, 'in-scope': 180, 'oos': 0, 'labels': 21, 'tok_min': 1, 'tok_max': 28, 'tok_mean': 5.338888888888889, 'tok_std': 2.828749014609518, 'oov_percentage': 0.049429657794676805}
test stats: {'len': 397, 'in-scope': 231, 'oos': 166, 'labels': 20, 'tok_min': 1, 'tok_max': 53, 'tok_mean': 6.607052896725441, 'tok_std': 5.770746222932882, 'oov_percentag