In [None]:
!pip install textstat
!pip install lftk
!pip install spacy
!pip install evaluate
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]



In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0


In [None]:
import spacy
import lftk
import transformers
import textstat
import torch
import pandas as pd
import accelerate
from evaluate import load
import evaluate
from datasets import load_metric
import xgboost as xgb
import numpy as np
import re, statistics, string
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
import nltk
from nltk.util import ngrams
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
nltk.download('punkt')
from nltk.tokenize import word_tokenize
np.random.seed(0)
%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
bert_model = AutoModel.from_pretrained("bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]



---

## Utility Functions

In [None]:
def clean(text):
  text = str(text)
  text = re.sub('\n',' ',text)
  text = re.sub(' +',' ',text)
  return text

In [None]:
def remove_outliers(data):
    """
    Removes outliers from a list of numbers using the Interquartile Range (IQR) method.

    Parameters:
    - data: list of numbers (integers or floats)

    Returns:
    - list of numbers with outliers removed
    """
    # Calculate Q1, Q3, and IQR
    Q1 = np.percentile(data, 25)
    Q3 = np.percentile(data, 75)
    IQR = Q3 - Q1

    # Define bounds for non-outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter out outliers
    filtered_data = [x for x in data if lower_bound <= x <= upper_bound]

    return filtered_data

In [None]:
def remove_prompt_format(prompt, filename):

  prompt = clean(prompt)

  if "llama" in filename:
    if "[/INST]" in prompt:
      splitted_prompt = prompt.split("[/INST]")
      prompt = splitted_prompt[-1]

  elif "longform" in filename:
    if "Output the generated story directly." in prompt:
      splitted_prompt = prompt.split("Output the generated story directly.")
      prompt = splitted_prompt[-1]

  elif "openchat" in filename:
    if "Output the generated story directly." in prompt:
      splitted_prompt = prompt.split("Output the generated story directly.")
      prompt = splitted_prompt[-1]

  return prompt

In [None]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

In [None]:
def get_text_features(text):
  nlp = spacy.load("en_core_web_sm")
  doc = nlp(text)
  LFTK = lftk.Extractor(docs = doc)
  LFTK.customize(stop_words=False, punctuations=False, round_decimal=5)

  surface = lftk.search_features(domain = 'surface', language ='en', return_format = 'list_key')
  syntax = lftk.search_features(domain = 'syntax', language ='en',return_format = 'list_key')
  lexico_semantics = lftk.search_features(domain = 'lexico-semantics', language ='en', return_format = 'list_key')
  discourse = lftk.search_features(domain = 'discourse', language ='en',return_format = 'list_key')

  feature_list = surface + syntax + lexico_semantics + discourse
  extracted_features = LFTK.extract(features = feature_list)

  extracted_feature_list = list(extracted_features.values())

  # returns a list containing the extracted features
  return extracted_feature_list

In [None]:
def get_sbert_embeddings(text):
  encoded_input = bert_tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')

  #Compute token embeddings
  with torch.no_grad():
      model_output = bert_model(**encoded_input)

  #Perform pooling. In this case, mean pooling
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
  sentence_embeddings_np = sentence_embeddings.numpy()
  return sentence_embeddings_np[0].tolist()

In [None]:
def get_text_features_xgb(text):
  nlp = spacy.load("en_core_web_sm")
  doc = nlp(text)
  LFTK = lftk.Extractor(docs = doc)
  LFTK.customize(stop_words=False, punctuations=False, round_decimal=5)

  feature_list = ['t_word', 't_stopword', 't_syll', 't_syll2', 't_syll3', 't_sent','t_char', 'a_word_ps', 'fkre', 'fkgl', 'fogi', 'smog', 'cole', 'auto','a_adj_ps', 'a_adp_ps', 'a_adv_ps', 't_kup', 't_bry','t_subtlex_us_zipf', 'a_kup_ps', 'a_bry_ps', 'a_subtlex_us_zipf_pw','a_subtlex_us_zipf_ps', 'corr_noun_var']
  extracted_features = LFTK.extract(features = feature_list)

  extracted_feature_list = list(extracted_features.values())

  sbert_embeds = get_sbert_embeddings(text)

  # returns a list containing the extracted features
  #print(extracted_feature_list)
  combined_features = sbert_embeds + extracted_feature_list
  return combined_features

In [None]:
def get_level(text, model):
  text_features = get_text_features(text)
  text_features_np = np.array([text_features])  # Convert to 2D array
  predictions = model.predict(text_features_np)
  return predictions[0]

In [None]:
def get_level_xgb(text, model, column_names):
  text_features = get_text_features_xgb(text)
  text_features_df = pd.DataFrame([text_features], columns=column_names)
  text_data_dmatrix = xgb.DMatrix(text_features_df)
  model_prob = model.predict(text_data_dmatrix)

  #gives a string which is the label directly
  return int(model_prob[0])

In [None]:
def get_distinct_n(text, n):

  words = word_tokenize(text)
  ngram_lists = list(ngrams(words, n))
  ngrams_items = [item for sublist in ngram_lists for item in sublist]  # flatten

  # returns real number
  return len(set(ngrams_items)) / len(ngrams_items) if len(ngrams_items) > 0 else 0.



---

## Main Script

In [None]:
# CEFR - cambridge_features.csv
# CCS - commoncore_10_features_bin.csv
features_df = pd.read_csv("commoncore_10_features_bin_with_sbert.csv")
features_df.head()

Unnamed: 0,sbert_1,sbert_2,sbert_3,sbert_4,sbert_5,sbert_6,sbert_7,sbert_8,sbert_9,sbert_10,...,a_adv_ps,t_kup,t_bry,t_subtlex_us_zipf,a_kup_ps,a_bry_ps,a_subtlex_us_zipf_pw,a_subtlex_us_zipf_ps,corr_noun_var,level
0,0.3,-0.255,-0.0445,0.244,0.513,0.035,-0.22,0.0327,0.0845,0.0738,...,0.81553,5757.81,5197.94127,7259.26996,55.90107,50.46545,13.36882,70.47835,5.99645,1
1,0.119,0.0903,-0.13,0.247,0.34,0.0817,0.0111,0.0693,0.116,0.0088,...,0.37931,1001.62,909.90801,1217.22093,34.53862,31.37614,12.54867,41.97314,2.53898,1
2,0.327,-0.368,-0.245,0.0728,0.415,-0.195,-0.0368,-0.0511,-0.108,-0.00567,...,1.04082,4752.32,4052.65006,5217.80707,96.98612,82.70714,14.86555,106.48586,5.98215,1
3,0.194,-0.116,-0.123,0.352,0.177,-0.0724,0.12,0.000226,0.153,-0.119,...,0.42857,1034.07,870.80984,1102.5776,73.86214,62.2007,14.8997,78.75554,3.83761,1
4,0.2,-0.157,-0.0504,0.339,0.301,-0.142,-0.101,0.000105,0.0728,0.0562,...,0.61905,927.49,776.53675,1073.50981,44.16619,36.97794,11.54312,51.11951,3.05505,1


In [None]:
y = features_df['level']
X = features_df.drop('level', axis=1)

y = y - 1

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Create and train the XGBoost model for CCS

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {
    'objective': 'multi:softmax',
    'num_class': 2,  # Number of unique classes
    'max_depth': 3,  # Depth of the trees
    'eta': 0.1,      # Learning rate
    'verbosity': 1   # Verbosity of printing messages
}
num_round = 100  # Number of boosting rounds

xgb_model = xgb.train(params, dtrain, num_round)
predictions = xgb_model.predict(dtest)
accuracy = accuracy_score(y_test, predictions)
print(f"XGBoost Accuracy: {accuracy}")

XGBoost Accuracy: 0.9166666666666666


In [None]:
# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train.values, y_train.values)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy}")

Random Forest Accuracy: 0.9117647058823529




In [None]:
# Plus prompt and generation data here
# CEFR - elg_data.csv
# CCS - coca_data.csv
prompt_file_name = "coca_data.csv"
generation_file_name = "coca_longform_3b_aspect.csv"

prompt_df = pd.read_csv(prompt_file_name)
gen_df = pd.read_csv(generation_file_name)

prompt_list = prompt_df['prompt'].tolist()
level_list = gen_df['level'].tolist()
level_list = [x - 1 for x in level_list]
generation_list_raw = gen_df['generated_story'].tolist()

In [None]:
generation_list = []
counter = 1
for i in generation_list_raw:
  #print(counter)
  cleaned_generation = remove_prompt_format(i, generation_file_name)
  generation_list.append(cleaned_generation)
  counter += 1

In [None]:
prediction_labels = []
distinct_n_scores = []
for prompt, generation in zip(prompt_list, generation_list):

  # Calculate distinct_n
  distinct_n = (get_distinct_n(remove_prompt_format(generation,generation_file_name),3))
  distinct_n_scores.append(distinct_n)

  # Calculate accuracy
  print(generation)
  print(len(X_train.columns))
  prediction = get_level_xgb(generation, xgb_model, X_train.columns) # change to get_level_xgb and xgb_model for XGBoost and CCS
  #prediction = get_level(generation, rf_model)
  print(prediction)
  prediction_labels.append(prediction)


 A river of flames ran down the middle of the street, burning cars and people. It had been raining for a while, so the streets were covered with thick mud. All of a sudden, the rain stopped, and the sun came out. The fire moved from the middle of the street to the other side, and then the rain came back again. After that, the sun stayed out, so the mud disappeared again, and people woke up the next morning to a beautiful day. The story ends on this beautiful day.Mr. Anderson had been on holiday for the last few weeks, and on his return he noticed that his house had become rather messy. All of the furniture was turned upside down, the bed was pushed to the wall and Mr. Anderson had lost all of his valuable books. The police were informed, and it seemed that it was a case of robbery, as all of the valuable stuff had been stolen. But no-one was found who had done it. The house was cleaned up and Mr. Anderson found that everything was in order again. He was grateful that no-one had broken 



0
 Two people are sitting at a café, drinking coffee and eating pancakes. The first guy orders a cappuccino, but the waitress mistakes it for a cappuccino and gives him a latte. To prove it to the waitress, he orders another cappuccino and asks the waitress for a latte as well. Then the waitress takes out the right cappuccino and gives him the wrong one. The second guy, watching this, laughs and says: "I can't believe that!" The first guy looks sad and says, "We're in trouble now!" To which the other guy replies: "No, no, no, I just made a mistake. Don't worry about it!" They both laugh and take the cappuccino/latte mix and put it back. Then, with a sip of their coffee, they take another bite of their pancakes and sip their coffee, and continue laughing together. Now, the first guy asks: "So what were you doing?" The other guy says: "I was eating pancakes." The first guy responds: "Pancakes have a mystery, too. They're hard to eat, because they're long and have lots of short words in t

In [None]:
report = classification_report(level_list, prediction_labels, digits=3)
print(report)

              precision    recall  f1-score   support

           0      0.472     0.840     0.604        50
           1      0.273     0.060     0.098        50

    accuracy                          0.450       100
   macro avg      0.372     0.450     0.351       100
weighted avg      0.372     0.450     0.351       100



In [None]:
# print distinct n
print(statistics.mean(distinct_n_scores))
print(statistics.pstdev(distinct_n_scores))

0.1454259559246542
0.05186151006065162


In [None]:
def map_labels_to_numbers(labels, label_order):
    return np.array([label_order.index(label) for label in labels])

def adjacent_accuracy(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    correct = np.abs(y_true - y_pred) <= 1
    return np.mean(correct)

# Define the order of your class labels
label_order = ['A2', 'B1', 'B2', 'C1', 'C2']

# Map labels to numbers
mapped_gold_standard = map_labels_to_numbers(level_list, label_order)
mapped_predicted_labels = map_labels_to_numbers(prediction_labels, label_order)

# Calculate adjacent accuracy
accuracy = adjacent_accuracy(mapped_gold_standard, mapped_predicted_labels)
print("Adjacent Accuracy:", accuracy)

ValueError: 0 is not in list

In [None]:
metric = load_metric('frugalscore.py')
frugal_dict = metric.compute(predictions = prompt_list, references = generation_list, batch_size=4, max_length=300)
frugal_scores = frugal_dict['scores']
frugal_score_mean = statistics.mean(frugal_scores)
frugal_score_std = statistics.pstdev(frugal_scores)
print("Frugalscore mean:",frugal_score_mean)
print("Frugalscore std:",frugal_score_std)

In [None]:
filtered_generation_list = [seq for seq in generation_list if len(seq.split()) >= 20]

In [None]:
perplexity_scores = []
perplexity = load("perplexity", module_type="metric")
perplexity_result = perplexity.compute(model_id='gpt2', add_start_token=False, predictions=filtered_generation_list)
perplexity_scores = perplexity_result['perplexities']

In [None]:
perplexity_scores = remove_outliers(perplexity_scores)
perplexity_mean = statistics.mean(perplexity_scores)
perplexity_std = statistics.pstdev(perplexity_scores)
print("Perplexity mean:",perplexity_mean)
print("Perplexity std:",perplexity_std)