# CEFR Dataset Creation

Note: 

The word-to-CEFR-level maps used in this notebook come from the following sources:
- kaggle_word_cefr_map: https://www.kaggle.com/datasets/nezahatkk/10-000-english-words-cerf-labelled
- oxford_word_cefr_map: https://www.oxfordlearnersdictionaries.com/wordlists/oxford3000-5000?dataset=english&list=ox5000

In [1]:
import spacy 
import pandas as pd

In [2]:
# Shared spaCy pipeline; process_sentence() calls it to tokenize, lemmatize and tag each sentence.
nlp = spacy.load('en_core_web_trf')

  model.load_state_dict(torch.load(filelike, map_location=device))


In [3]:
# Raw word -> CEFR tables, one row per entry.
oxford_map = pd.read_csv('oxford_dataset/oxford_word_cefr_map.csv')
kaggle_map = pd.read_csv('kaggle_dataset/kaggle_word_cefr_map_filtered.csv')

# Show how many Oxford entries exist for each part-of-speech label.
oxford_pos_counts = oxford_map['pos'].value_counts()
print(oxford_pos_counts)

# (word, part of speech) -> CEFR level.
oxford_by_word_and_pos = oxford_map.set_index(['text', 'pos'])
oxford_dict = oxford_by_word_and_pos['cefr'].to_dict()

# word -> CEFR level, ignoring part of speech.
oxford_by_word = oxford_map.set_index('text')
oxford_dict_word_only = oxford_by_word['cefr'].to_dict()

# headword -> CEFR level from the Kaggle table.
kaggle_by_headword = kaggle_map.set_index('headword')
kaggle_dict = kaggle_by_headword['CEFR'].to_dict()


pos
noun                  2958
verb                  1247
adjective             1076
adverb                 366
pronoun                 77
preposition             66
determiner              42
number                  33
conjunction             32
exclamation             20
modal verb              14
ordinal number           5
auxiliary verb           3
definite article         1
indefinite article       1
linking verb             1
infinitive marker        1
Name: count, dtype: int64


In [None]:
def process_sentence(sentence):
    """Run *sentence* through the module-level ``nlp`` pipeline and describe each token.

    Returns a list with one dict per token, in document order, holding:
    ``word`` (the token text), ``lemma``, ``pos`` (the token's ``pos_`` tag)
    and ``ner`` (the token's entity type, or ``'O'`` when it has none).
    """
    return [
        {
            'word': tok.text,
            'lemma': tok.lemma_,
            'pos': tok.pos_,
            'ner': tok.ent_type_ or 'O',
        }
        for tok in nlp(sentence)
    ]

In [None]:
# Numeric weight assigned to each CEFR label; higher levels weigh
# disproportionately more than lower ones.
cefr_weights = dict(zip(
    ('A1', 'A2', 'B1', 'B2', 'C1', 'C2'),
    (1, 2, 4, 7, 15, 30),
))


def cefr_to_weighted_numeric(cefr_level):
    """Return the weight of *cefr_level*, or 1 when it is not a key of ``cefr_weights``."""
    try:
        return cefr_weights[cefr_level]
    except KeyError:
        # Labels outside the table (e.g. 'Unknown') are weighted like 'A1'.
        return 1

def numeric_to_cefr(numeric_value):
    """Map an averaged weight back to a CEFR label.

    Each label covers values up to and including its upper bound; anything
    above the last bound (8.0) is reported as ``'C2'``.
    """
    upper_bounds = (
        (1.7, 'A1'),
        (2.5, 'A2'),
        (3.5, 'B1'),
        (5.5, 'B2'),
        (8.0, 'C1'),
    )
    # First bound that the value does not exceed wins; bounds are ascending.
    return next(
        (label for bound, label in upper_bounds if numeric_value <= bound),
        'C2',
    )

def get_cefr_level(word, pos):
    """Look up the CEFR level of *word*, preferring a part-of-speech-specific entry.

    The lower-cased word is first searched in ``oxford_dict`` together with
    the lower-cased *pos*; if that yields nothing, ``oxford_dict_word_only``
    is consulted with the word alone. The matching level is returned
    upper-cased, or ``'Unknown'`` when neither table has an entry.
    """
    key = word.lower()

    # `or` only evaluates the word-only lookup when the POS-specific one misses.
    level = oxford_dict.get((key, pos.lower())) or oxford_dict_word_only.get(key)

    # NOTE: a further fallback to kaggle_dict.get(key) is intentionally
    # left disabled here.
    return level.upper() if level else 'Unknown'

In [None]:
def calculate_sentence_cefr_weighted_percentile(tokens_data):
    """Estimate a sentence's CEFR level from the weighted mean of its tokens' levels.

    Despite the name, no percentile is computed: every token's CEFR level is
    converted to a numeric weight, the weights are averaged, and the average
    is mapped back to a CEFR label.

    Args:
        tokens_data: iterable of token dicts with at least the keys
            ``'lemma'`` and ``'pos'`` (as produced by ``process_sentence``).

    Returns:
        A CEFR label (``'A1'`` ... ``'C2'``), or ``'Unknown'`` when
        *tokens_data* is empty.
    """
    # Translate the tagger's part-of-speech tags into the labels used by the
    # Oxford word list, so that the (word, pos) lookup in get_cefr_level()
    # can actually match. Previously this table was built but never applied,
    # so e.g. 'ADJ' was looked up as 'adj' and never matched 'adjective'.
    pos_map = {
        'ADJ': 'adjective',
        'ADV': 'adverb',
        'VERB': 'verb',
        'NOUN': 'noun',
        'AUX': 'auxiliary verb',
        'PRON': 'pronoun',
        'SCONJ': 'conjunction',
        'CCONJ': 'conjunction',
        'DET': 'determiner',
        # The Oxford list labels interjections as 'exclamation'.
        'INTJ': 'exclamation',
        'NUM': 'number',
        'ADP': 'preposition',
    }

    # Tags without an Oxford equivalent are passed through unchanged;
    # get_cefr_level() then falls back to its word-only lookup.
    weighted_levels = [
        cefr_to_weighted_numeric(
            get_cefr_level(token['lemma'], pos_map.get(token['pos'], token['pos']))
        )
        for token in tokens_data
    ]
    if not weighted_levels:
        return 'Unknown'

    weighted_avg = sum(weighted_levels) / len(weighted_levels)
    return numeric_to_cefr(weighted_avg)

In [None]:
def assign_cefr_to_sentence(sentence):
    """Return the CEFR label estimated for *sentence*.

    The sentence is tokenized with ``process_sentence`` and the resulting
    token data is scored by ``calculate_sentence_cefr_weighted_percentile``.
    """
    return calculate_sentence_cefr_weighted_percentile(process_sentence(sentence))

In [None]:
# sentence = f'''If one were to  apply just a hint more kinetic force to the feline's plaything, the creature might shed its lethargic nature and engage in jollity.'''
# cefr_level = assign_cefr_to_sentence(sentence)
# print(f"Sentence CEFR Level: {cefr_level}")

In [10]:
# Classify the whole dataset in fixed-size batches, writing one CSV per batch
# so that an interrupted run keeps the batches that were already finished.
df = pd.read_csv('wikisplit_dataset/wikisplit_dataset_original.csv')
print(len(df.index))

batch_size = 10000
for start in range(0, len(df), batch_size):
    # Clamp so the log reports the real end of the (possibly shorter) last batch.
    end = min(start + batch_size, len(df))
    print(f'Processing batch {start} to {end}')
    batch_df = df.iloc[start:end].copy()

    batch_df['level'] = batch_df['text'].apply(assign_cefr_to_sentence)
    # Write into wikisplit_dataset/, the directory the input is read from and
    # the one the per-batch files are globbed from when they are merged.
    batch_df.to_csv(f'wikisplit_dataset/wikisplit_dataset_original_classified_{start}.csv', index=False)
    print(f'Batch {start} to {end} has been saved')

print("Processing complete.")

  with torch.cuda.amp.autocast(self._mixed_precision):


KeyboardInterrupt: 

In [8]:
import glob
import re

# Merge the per-batch classification files into one CSV.
#
# Only files named "..._classified_<start>.csv" are accepted: the glob pattern
# alone would also pick up a previously written "..._classified_combined*.csv"
# and duplicate its rows on a rerun.
batch_name = re.compile(r'wikisplit_dataset_original_classified_(\d+)\.csv$')

numbered_files = []
for path in glob.glob('wikisplit_dataset/wikisplit_dataset_original_classified_*.csv'):
    match = batch_name.search(path)
    if match:
        numbered_files.append((int(match.group(1)), path))

if not numbered_files:
    raise FileNotFoundError(
        'No per-batch files matching '
        'wikisplit_dataset/wikisplit_dataset_original_classified_<start>.csv were found'
    )

# glob returns paths in arbitrary order; sort by the numeric batch start so
# the first/last numbers and the row order of the merged file are well defined.
numbered_files.sort()
csv_files = [path for _, path in numbered_files]

first_number = numbered_files[0][0]
last_number = numbered_files[-1][0]

dataframes = [pd.read_csv(file) for file in csv_files]

combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.to_csv(f'wikisplit_dataset/wikisplit_dataset_original_classified_combined{first_number}_{last_number}.csv', index=False)

In [11]:
df = pd.read_csv('wikisplit_dataset/wikisplit_dataset_original_classified_combined0_90000.csv')

# Draw the same number of sentences (2000, fixed seed) from every CEFR level
# so that the resulting dataset is balanced across levels.
df_a1, df_a2, df_b1, df_b2, df_c1, df_c2 = [
    df.loc[df['level'] == level].sample(n=2000, random_state=42)
    for level in ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')
]

# Stack the six samples and renumber the rows from zero.
combined_df = pd.concat(
    [df_a1, df_a2, df_b1, df_b2, df_c1, df_c2]
).reset_index(drop=True)

def split_level(df, level, train_size=1600, test_size=200, random_state=42):
    """Split the rows of one CEFR level into train, test and validation sets.

    Args:
        df: DataFrame with a ``'level'`` column.
        level: value of ``'level'`` selecting the rows to split.
        train_size: number of rows sampled for the training set.
        test_size: number of rows sampled for the test set from what remains
            after the training rows are removed.
        random_state: seed used for both sampling steps.

    Returns:
        A ``(train, test, validation)`` tuple of DataFrames; validation is
        every selected row that went into neither train nor test.

    Raises:
        ValueError: raised by ``DataFrame.sample`` when fewer rows are
            available than requested.
    """
    df_level = df[df['level'] == level]
    train = df_level.sample(train_size, random_state=random_state)
    # Rows are removed by index label, so the three sets are only guaranteed
    # to be disjoint when df's index labels are unique.
    remaining = df_level.drop(train.index)
    test = remaining.sample(test_size, random_state=random_state)
    validation = remaining.drop(test.index)
    return train, test, validation

# One (train, test, validation) triple per CEFR level, in level order.
level_splits = [
    split_level(combined_df, level)
    for level in ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')
]
(
    (train_a1, test_a1, val_a1),
    (train_a2, test_a2, val_a2),
    (train_b1, test_b1, val_b1),
    (train_b2, test_b2, val_b2),
    (train_c1, test_c1, val_c1),
    (train_c2, test_c2, val_c2),
) = level_splits

# Regroup by split kind and stack the six levels of each kind.
train_parts, test_parts, val_parts = zip(*level_splits)
train_df = pd.concat(list(train_parts)).reset_index(drop=True)
test_df = pd.concat(list(test_parts)).reset_index(drop=True)
val_df = pd.concat(list(val_parts)).reset_index(drop=True)

# Persist each split without the row index.
for split_frame, split_path in (
    (train_df, 'train.csv'),
    (test_df, 'test.csv'),
    (val_df, 'validation.csv'),
):
    split_frame.to_csv(split_path, index=False)

In [2]:
from transformers import AutoTokenizer

# Sentence used to demonstrate tokenization
text = "Tokenization is the process of breaking down text into smaller units called tokens."

# Pretrained tokenizer for the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Split the sentence into the tokenizer's token strings
tokens = tokenizer.tokenize(text)



In [3]:
# Show the token strings, then look up the vocabulary id of each one.
print(tokens)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

['token', '##ization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'text', 'into', 'smaller', 'units', 'called', 'token', '##s', '.']


In [4]:

# Show the ids produced by tokenizer.convert_tokens_to_ids(tokens), in token order.
print(token_ids)

[19204, 3989, 2003, 1996, 2832, 1997, 4911, 2091, 3793, 2046, 3760, 3197, 2170, 19204, 2015, 1012]


In [5]:
# Create a DataFrame to display
import pandas as pd

# Pair every token with its id, one row per token, and display the table.
df_tokens = pd.DataFrame(
    list(zip(tokens, token_ids)),
    columns=['Token', 'Token ID'],
)
print(df_tokens)

        Token  Token ID
0       token     19204
1   ##ization      3989
2          is      2003
3         the      1996
4     process      2832
5          of      1997
6    breaking      4911
7        down      2091
8        text      3793
9        into      2046
10    smaller      3760
11      units      3197
12     called      2170
13      token     19204
14        ##s      2015
15          .      1012
