In [1]:
# Libraries used across the notebook: readability metrics (textstat),
# tokenisation (nltk), TF-IDF (scikit-learn), dataset loading (datasets),
# and array / table handling (numpy, pandas).
import textstat
import spacy
import pandas as pd
import numpy as np
import nltk
# Download the 'punkt' and 'punkt_tab' NLTK packages.
nltk.download('punkt')
nltk.download('punkt_tab')
# NOTE(review): absolute, machine-specific path. The download log shows the
# packages already land in this directory, so this append is presumably
# redundant on the author's machine and wrong on any other -- confirm and
# consider removing or making it configurable.
nltk.data.path.append('/Users/ihsanullah/nltk_data')
from nltk.tokenize import word_tokenize, sent_tokenize
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook;
# confirm whether the spaCy model load is still needed.
nlp = spacy.load("en_core_web_sm")
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import load_from_disk

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ihsanullah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ihsanullah/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Load and prepare the Review-5k train and test sets

In [2]:
def load_train_and_test_dataset(dataset_path="../ReproduceCycleReviewer/review-5k-dataset"):
    """Load the dataset from disk and reduce every paper to the fields used here.

    Args:
        dataset_path: Directory passed to ``load_from_disk``. Defaults to the
            location the notebook originally hard-coded, so existing calls
            without arguments behave as before.

    Returns:
        A ``(train_set, test_set)`` tuple. Each element is a list with one
        dict per row of the corresponding split, holding:

        * ``"paper_content"``: the first message whose role is ``"user"`` and
          whose content starts with ``"Title"`` (the whole message dict), or
          ``None`` when no message matches.
        * ``"paper_ratings"``: the row's ``"rates"`` value, unchanged.
        * ``"paper_decision"``: ``1`` if the lower-cased ``"decision"`` string
          contains ``"accept"``, otherwise ``0``.
    """

    def _extract_content_rating_decision_from_paper(paper):
        """Pick the paper message, ratings and binary decision out of one row."""
        # First matching message wins; None is kept when nothing matches.
        # NOTE: the feature functions in this notebook index
        # paper["paper_content"]["content"], so a None here will fail there.
        paper_content = next(
            (
                msg
                for msg in paper["messages"]
                if msg["role"] == "user" and msg["content"].startswith("Title")
            ),
            None,
        )
        return {
            "paper_content": paper_content,
            "paper_ratings": paper["rates"],
            "paper_decision": 1 if "accept" in paper["decision"].lower() else 0
        }

    dataset = load_from_disk(dataset_path)

    # Both splits go through exactly the same per-row extraction.
    train_set = [_extract_content_rating_decision_from_paper(row) for row in dataset["train"]]
    test_set = [_extract_content_rating_decision_from_paper(row) for row in dataset["test"]]

    return train_set, test_set

### Compute TF-IDF features for the 2000 most frequent terms

In [3]:
def compute_tfidf(train_papers, test_papers, max_features=2000):
    """Turn paper texts into dense TF-IDF matrices.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts, so both matrices share one vocabulary.

    Args:
        train_papers: Papers whose ``["paper_content"]["content"]`` is the text
            used to fit the vectorizer.
        test_papers: Papers transformed with the already-fitted vectorizer.
        max_features: Passed through to ``TfidfVectorizer(max_features=...)``.

    Returns:
        ``(train_tfidf, test_tfidf, feature_names)``: the two ``.toarray()``
        results and the vectorizer's ``get_feature_names_out()`` output.
    """
    def _texts(papers):
        # Pull the raw text out of every paper record.
        return [paper["paper_content"]["content"] for paper in papers]

    train_texts = _texts(train_papers)
    test_texts = _texts(test_papers)

    tfidf = TfidfVectorizer(max_features=max_features)
    train_matrix = tfidf.fit_transform(train_texts).toarray()
    test_matrix = tfidf.transform(test_texts).toarray()

    return train_matrix, test_matrix, tfidf.get_feature_names_out()

### Compute macroscopic features
Reference (AI-assisted peer review paper): https://www.nature.com/articles/s41599-020-00703-8.pdf

In [4]:
def lix_index(text):
    """Compute the LIX readability index of ``text``.

    LIX = words / sentences + 100 * long_words / words, where every token
    returned by ``word_tokenize`` counts as a word and a long word is a token
    of more than six characters.

    Args:
        text: The text to score.

    Returns:
        The LIX value, or ``0.0`` when the text yields no words or no
        sentences (instead of raising ``ZeroDivisionError``), matching the
        zero fallback used for the other ratios in ``extract_macro_features``.
    """
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    # Empty / whitespace-only input: both denominators would be zero.
    if not words or not sentences:
        return 0.0
    long_word_count = sum(1 for w in words if len(w) > 6)
    return len(words) / len(sentences) + (100 * long_word_count) / len(words)

def rix_index(text):
    """Compute the RIX readability index of ``text``.

    RIX = long_words / sentences, where a long word is a token returned by
    ``word_tokenize`` with more than six characters.

    Args:
        text: The text to score.

    Returns:
        The RIX value, or ``0.0`` when the text yields no sentences (instead
        of raising ``ZeroDivisionError``), matching the zero fallback used for
        the other ratios in ``extract_macro_features``.
    """
    sentences = sent_tokenize(text)
    # No sentences means nothing to average over.
    if not sentences:
        return 0.0
    long_word_count = sum(1 for w in word_tokenize(text) if len(w) > 6)
    return long_word_count / len(sentences)

def extract_macro_features(papers):
    """Build the matrix of 22 macroscopic (readability / size) features.

    Each paper's text is tokenised once with ``word_tokenize`` and
    ``sent_tokenize``; the resulting counts feed every count-based feature,
    including LIX and RIX, so the text is not re-tokenised per index. All
    ratios computed here fall back to ``0`` when their denominator is zero,
    so an empty text yields zeros for them rather than ``ZeroDivisionError``.

    Args:
        papers: Iterable of paper dicts; the text is read from
            ``paper["paper_content"]["content"]``.

    Returns:
        ``np.array`` with one row per paper and the columns, in order:
        ariIndex, alpwIndex, acpwIndex, aslIndex, asspwIndex, ccIndex,
        cliIndex, dcrsIndex, dwIndex, fkgIndex, freIndex, gfIndex, lcIndex,
        llcIndex, lwfIndex, lixIndex, pscIndex, rixIndex, scIndex, siIndex,
        sscIndex, txtlength. Note that ccIndex and txtlength are both
        ``len(text)``.
    """
    macro_features = []

    for paper in papers:
        text = paper["paper_content"]["content"]

        # Tokenise once; every count below derives from these two lists.
        words = word_tokenize(text)
        sentences = sent_tokenize(text)
        word_count = len(words)
        sentence_count = len(sentences)
        long_word_count = sum(1 for w in words if len(w) > 6)  # tokens longer than 6 chars

        char_count = len(text)
        letter_count = sum(c.isalpha() for c in text)
        syllable_count = textstat.syllable_count(text)
        polysyllable_count = textstat.polysyllabcount(text)

        # LIX = words/sentences + 100*long_words/words; RIX = long_words/sentences.
        # Same formulas as lix_index / rix_index, guarded against empty text.
        if word_count and sentence_count:
            lix = word_count / sentence_count + (100 * long_word_count) / word_count
        else:
            lix = 0
        rix = long_word_count / sentence_count if sentence_count else 0

        features = [
            textstat.automated_readability_index(text),  # ariIndex
            letter_count / word_count if word_count else 0,  # alpwIndex
            char_count / word_count if word_count else 0,  # acpwIndex
            word_count / sentence_count if sentence_count else 0,  # aslIndex
            textstat.avg_syllables_per_word(text),  # asspwIndex
            char_count,  # ccIndex
            textstat.coleman_liau_index(text),  # cliIndex
            textstat.dale_chall_readability_score(text),  # dcrsIndex
            textstat.difficult_words(text) / word_count if word_count else 0,  # dwIndex
            textstat.flesch_kincaid_grade(text),  # fkgIndex
            textstat.flesch_reading_ease(text),  # freIndex
            textstat.gunning_fog(text),  # gfIndex
            letter_count,  # lcIndex
            textstat.lexicon_count(text, removepunct=True),  # llcIndex
            textstat.linsear_write_formula(text),  # lwfIndex
            lix,  # lixIndex
            polysyllable_count,  # pscIndex
            rix,  # rixIndex
            sentence_count,  # scIndex
            textstat.smog_index(text),  # siIndex
            syllable_count,  # sscIndex
            len(text)  # txtlength
        ]
        macro_features.append(features)

    return np.array(macro_features)

### Combine TFIDF and Macro features

In [5]:
def combine_features_to_dataframe(papers, tfidf_matrix, macro_matrix):
    """Join TF-IDF features, macro features and the label into one DataFrame.

    Row ``i`` of the result is row ``i`` of ``tfidf_matrix`` followed by row
    ``i`` of ``macro_matrix`` followed by ``papers[i]["paper_decision"]``.

    Args:
        papers: Sequence of paper dicts providing ``"paper_decision"``.
        tfidf_matrix: 2-D array; its second dimension sets how many
            ``tfidf_<i>`` columns are created.
        macro_matrix: 2-D array whose columns follow the fixed macro-feature
            order named below.

    Returns:
        ``pd.DataFrame`` with the ``tfidf_*`` columns, the 22 macro-feature
        columns and a final ``"label"`` column.
    """
    # One flat record per paper: tfidf values, macro values, then the label.
    records = [
        [*tfidf_matrix[idx], *macro_matrix[idx], paper["paper_decision"]]
        for idx, paper in enumerate(papers)
    ]

    tfidf_names = [f"tfidf_{col}" for col in range(tfidf_matrix.shape[1])]
    macro_names = [
        "ariIndex", "alpwIndex", "acpwIndex", "aslIndex", "asspwIndex", "ccIndex",
        "cliIndex", "dcrsIndex", "dwIndex", "fkgIndex", "freIndex", "gfIndex",
        "lcIndex", "llcIndex", "lwfIndex", "lixIndex", "pscIndex", "rixIndex",
        "scIndex", "siIndex", "sscIndex", "txtlength"
    ]

    return pd.DataFrame(records, columns=[*tfidf_names, *macro_names, "label"])

### Run Code

In [6]:
# Step 1: read both splits as lists of per-paper dicts
train_set, test_set = load_train_and_test_dataset()

# Step 2: TF-IDF matrices for both splits plus the vocabulary terms
train_tfidf, test_tfidf, feature_names = compute_tfidf(train_set, test_set)

# Step 3: macro (readability / size) features, train split first, then test
train_macro, test_macro = [
    extract_macro_features(split) for split in (train_set, test_set)
]

# Step 4: one DataFrame per split holding TF-IDF + macro features + label
train_df, test_df = [
    combine_features_to_dataframe(papers, tfidf, macro)
    for papers, tfidf, macro in (
        (train_set, train_tfidf, train_macro),
        (test_set, test_tfidf, test_macro),
    )
]

In [7]:
# Display the training frame: tfidf_* columns, macro-feature columns, then label.
train_df

Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,llcIndex,lwfIndex,lixIndex,pscIndex,rixIndex,scIndex,siIndex,sscIndex,txtlength,label
0,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,6145.0,16.400000,48.127049,1360.0,5.728324,346.0,14.201968,11025.0,39557.0,0
1,0.02334,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,5671.0,10.285714,53.936136,1052.0,7.065789,228.0,13.148455,9505.0,36474.0,1
2,0.00000,0.000000,0.000000,0.0,0.007687,0.004152,0.086146,0.0,0.000000,0.000000,...,5078.0,18.500000,55.127060,952.0,7.328358,201.0,14.081607,8458.0,31532.0,0
3,0.00000,0.005448,0.007081,0.0,0.163182,0.019585,0.035558,0.0,0.000000,0.000000,...,5167.0,16.400000,61.580211,1154.0,9.407080,226.0,13.747880,9156.0,36219.0,0
4,0.00000,0.000000,0.000000,0.0,0.004982,0.000000,0.000000,0.0,0.004994,0.002831,...,7007.0,14.800000,53.087723,1439.0,6.990476,315.0,14.290140,11939.0,45717.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4184,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,6394.0,24.000000,50.670433,1134.0,6.380952,294.0,13.900102,10742.0,40566.0,1
4185,0.00000,0.010238,0.000000,0.0,0.000000,0.003067,0.000000,0.0,0.000000,0.000000,...,5574.0,14.400000,49.800363,929.0,6.193050,259.0,11.686713,9003.0,35625.0,1
4186,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,5616.0,20.000000,45.005810,1231.0,5.062162,370.0,13.842811,9956.0,39082.0,0
4187,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,6178.0,20.250000,51.725331,1142.0,6.647687,281.0,13.583541,10131.0,40208.0,1


In [8]:
# Display the test frame; it has the same column layout as train_df.
test_df

Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,llcIndex,lwfIndex,lixIndex,pscIndex,rixIndex,scIndex,siIndex,sscIndex,txtlength,label
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,6340.0,16.75,40.748415,790.0,4.148990,396.0,10.591319,9920.0,38059.0,1
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003026,0.000000,...,5391.0,17.00,49.222270,1071.0,6.049296,284.0,12.273422,9280.0,35024.0,1
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003180,0.000000,...,5037.0,16.80,51.171587,1004.0,6.409910,222.0,12.366424,8477.0,32040.0,1
3,0.000000,0.000000,0.000000,0.000000,0.002930,0.000000,0.003283,0.000000,0.002937,0.000000,...,5816.0,21.50,49.762097,1383.0,6.098101,316.0,14.617062,10510.0,38621.0,0
4,0.000000,0.000000,0.019705,0.005615,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,4880.0,19.25,54.597719,862.0,7.172973,185.0,14.093646,8083.0,31036.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776,0.009067,0.000000,0.000000,0.000000,0.000000,0.009488,0.009844,0.000000,0.011742,0.009985,...,5269.0,20.75,49.592122,1096.0,6.135135,259.0,13.152734,9025.0,34494.0,1
777,0.004040,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004388,0.000000,0.004449,...,4563.0,20.50,51.348570,783.0,6.437811,201.0,12.614740,7559.0,29250.0,1
778,0.000000,0.000000,0.005855,0.006674,0.007496,0.004049,0.000000,0.000000,0.000000,0.000000,...,5370.0,18.75,55.566193,1111.0,7.701754,228.0,12.125380,9318.0,35893.0,0
779,0.003986,0.055694,0.000000,0.000000,0.007723,0.012514,0.004328,0.004329,0.011614,0.004389,...,5829.0,20.00,61.371622,1303.0,9.375566,221.0,13.215333,10759.0,41280.0,0


### Save the train and test DataFrames and the TF-IDF feature names

In [9]:
# Write both feature tables as CSV without the index column.
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)

# Store the TF-IDF vocabulary one term per line, in feature_names order.
with open("tfidf_features.txt", "w", encoding="utf-8") as vocab_file:
    vocab_file.writelines(word + "\n" for word in feature_names)