In [98]:
from pathlib import Path

import pandas as pd

In [99]:
# Load the review table that already carries the precomputed NLP feature columns.
reviews_csv = 'datasets/processed/goodreads_reviews_with_nlp_features_substantiveness.csv'
df = pd.read_csv(reviews_csv)

# Report how many rows were loaded, then preview the first few of them.
print(f"Post-Data Clean row count: {len(df)} \n")
print(df.head())

Post-Data Clean row count: 1685280 

                            user_id                         review_id  \
0  8842281e1d1347389f2ab93d60773d4d  5e212a62bced17b4dbe41150e5bb9037   
1  8842281e1d1347389f2ab93d60773d4d  2ede853b14dc4583f96cf5d120af636f   
2  8842281e1d1347389f2ab93d60773d4d  022bb6daffa49adc27f6b20b6ebeb37d   
3  8842281e1d1347389f2ab93d60773d4d  0e317947e1fd341f573192111bb2921d   
4  8842281e1d1347389f2ab93d60773d4d  4276918357312212384ac6415ceb9159   

                                         review_text  rating  \
0  I haven't read a fun mystery book in a while a...       3   
1  A fun, fast paced science fiction thriller. I ...       3   
2  An amazing and unique creation: JJ Abrams and ...       4   
3  The Name of the Rose is a thrilling Dan Brown-...       3   
4  ** spoiler alert ** \n Hooked me equally as we...       3   

                       date_added  n_votes  contains_link  sentence_count  \
0  Mon Jul 24 02:48:17 -0700 2017        6          False     

### Since this is tier two, remove all entries that contain a link and focus on link-free entries for quality detection

In [100]:
# Tier 1 filter: keep only the reviews whose contains_link flag is False.
# .copy() detaches the result so later column assignments act on an
# independent frame rather than on a slice of the loaded one.
has_no_link = df['contains_link'].eq(False)
df = df.loc[has_no_link].copy()
print(f"After removing link-containing entries: {len(df)}")

# Summary statistics for the numeric columns used by the labeling step.
summary_columns = [
    "n_votes",
    "sentence_count",
    "avg_words_per_sentence",
    "word_count",
    "lexical_diversity",
]
print(df[summary_columns].describe())

        

After removing link-containing entries: 1633900
            n_votes  sentence_count  avg_words_per_sentence    word_count  \
count  1.633900e+06    1.633900e+06            1.633900e+06  1.633900e+06   
mean   1.042780e+00    7.103820e+00            1.703895e+01  1.319058e+02   
std    6.195860e+00    8.344157e+00            8.904341e+00  1.711560e+02   
min   -3.000000e+00    1.000000e+00            1.000000e+00  1.000000e+00   
25%    0.000000e+00    2.000000e+00            1.166667e+01  3.000000e+01   
50%    0.000000e+00    4.000000e+00            1.625000e+01  7.100000e+01   
75%    0.000000e+00    9.000000e+00            2.111765e+01  1.660000e+02   
max    9.040000e+02    3.940000e+02            1.537000e+03  4.641000e+03   

       lexical_diversity  
count       1.633900e+06  
mean        7.718246e-01  
std         1.512935e-01  
min         0.000000e+00  
25%         6.567164e-01  
50%         7.727273e-01  
75%         8.936170e-01  
max         1.000000e+00  


In [101]:
# Derive combined features from the base text statistics for modeling.
sentences = df['sentence_count']
words = df['word_count']
diversity = df['lexical_diversity']

# Small offset on the denominator so the ratio features never divide by zero.
sentences_eps = sentences + 1e-5

# Interaction (product) features.
df['sentence_word_interaction'] = sentences * words
# NOTE(review): if avg_words_per_sentence was computed as word_count / sentence_count,
# this product is close to word_count itself - confirm it adds information.
df['sentence_avgword_interaction'] = sentences * df['avg_words_per_sentence']
df['lexical_sentence_interaction'] = diversity * sentences

# Ratio features, normalised by the (offset) sentence count.
df['words_per_sentence_ratio'] = words / sentences_eps
df['unique_words_per_sentence'] = (diversity * words) / sentences_eps


In [102]:
def assign_substantiveness_label(row):
    """Map one review's text statistics to a substantiveness tier from 1 to 5.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide 'sentence_count', 'avg_words_per_sentence',
        'word_count', 'lexical_diversity' and 'n_votes'.

    Returns
    -------
    int
        5 for the strictest tier down to 1 for the fallback. Tiers are
        tested from strictest to loosest and the first match wins.

    Notes
    -----
    A NaN in a metric fails every comparison that uses it, so such a row
    cannot match any tier that tests that metric.

    NOTE(review): compared with the describe() summary printed earlier in
    this notebook, the tier-5 cut-offs fall between the 25th percentile and
    the median of each metric, and the lower tiers relax them further. The
    tiers are therefore looser than "above the 75th percentile".
    """
    n_sentences = row['sentence_count']
    words_per_sentence = row['avg_words_per_sentence']
    n_words = row['word_count']
    diversity = row['lexical_diversity']
    votes = row['n_votes']

    if (n_sentences > 3 and n_words > 60 and words_per_sentence > 13
            and diversity > 0.675 and votes >= 0):
        # Tier 5: strictest thresholds on every metric.
        tier = 5
    elif (n_sentences > 3 and n_words > 40 and words_per_sentence > 11
            and diversity > 0.6 and votes >= 0):
        # Tier 4: same sentence floor, relaxed length and diversity.
        tier = 4
    elif (n_sentences >= 2 and n_words >= 35 and words_per_sentence >= 9
            and diversity > 0.575 and votes >= 0):
        # Tier 3: at least two sentences with moderate length.
        tier = 3
    elif (n_sentences >= 2 and n_words >= 17 and words_per_sentence >= 7
            and diversity > 0.50 and votes > -.05):
        # Tier 2: minimal substance. For whole-number vote counts the
        # "> -.05" bound accepts the same values as ">= 0".
        tier = 2
    else:
        # Tier 1: everything that matched none of the tiers above.
        tier = 1

    return tier

# Label every review row by row (axis=1 hands each row to the function),
# then show how many reviews landed in each tier.
label_column = 'substantiveness_label'
df[label_column] = df.apply(assign_substantiveness_label, axis=1)
label_counts = df[label_column].value_counts()
print(label_counts)

substantiveness_label
1    354058
3    332018
2    325457
5    315572
4    306795
Name: count, dtype: int64


In [103]:
# Save the dataset, now including the engineered features and the
# substantiveness_label column, for the training step.
output_path = "datasets/processed_and_labeled_for_training/goodreads_reviews_substantiveness.csv"

# to_csv does not create missing parent folders, so make sure the target
# directory exists first; otherwise a clean run on a fresh checkout fails here.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the pandas row index out of the written file.
df.to_csv(output_path, index=False)
print(f"Saved dataset: {output_path}")

Saved dataset: datasets/processed_and_labeled_for_training/goodreads_reviews_substantiveness.csv
