In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import mannwhitneyu
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_recall_curve
import numpy as np

In [3]:
# Labeled corpus for the Science Fiction & Fantasy genre (columns shown below: asin, document, label)
CORPUS_CSV = '../data/corpus/Science_Fiction_&_Fantasy.csv'
corpus_df = pd.read_csv(CORPUS_CSV)

# Quick sanity check of the first few rows
print(corpus_df.head())

         asin                                           document  label
0  B00D2WFL2U  but candles were scarcer than food then and i ...      1
1  B015NL50YQ  jellicoe his voice was clipped precise my coun...      1
2  B002V5CVJU  equivocal worshipped in theatre and boudoir sc...      0
3  B07R7432SV  although she'd been clean for two years after ...      1
4  B07B3YMKXD  in the early nineteen forties this little boy ...      0


# TF-IDF

## Rank Sum Approach

### Calculate TF-IDF score

In [4]:
# Text and target columns used throughout the notebook
corpus = corpus_df['document']
labels = corpus_df['label']

# Fit TF-IDF on the whole corpus: English stop words removed, terms must occur in
# at least 10 documents and at most half of them, and the vocabulary is capped at 1000 terms
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    max_df=0.5,
    max_features=1000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
terms_tfidf = tfidf_vectorizer.get_feature_names_out()

# Dense document-by-term frame with the label attached as a 'Genre' column
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=terms_tfidf).assign(Genre=labels)

In [5]:
# Display the TF-IDF frame: one row per document, one column per term, plus the 'Genre' label
tfidf_df

Unnamed: 0,ability,able,action,actually,added,age,ago,ahead,air,alien,...,written,wrong,yeah,year,years,yellow,yes,york,young,Genre
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.054711,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,1
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,1
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.051806,0.071916,0.000000,0.0,0.0,0.000000,0
3,0.0,0.0,0.0,0.0,0.070097,0.0,0.051583,0.000000,0.000000,0.000000,...,0.0,0.054104,0.0,0.000000,0.175044,0.000000,0.0,0.0,0.000000,1
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.066982,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,1
176,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.038709,0.000000,0.0,0.0,0.205438,0
177,0.0,0.0,0.0,0.0,0.000000,0.0,0.078431,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.102157,0.0,0.0,0.141252,1
178,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.068177,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.117065,0


### Assign & normalize word ranks

In [6]:
def assign_flipped_ranks(row):
    """Convert one document's TF-IDF scores into normalized, flipped ranks.

    Scores are ranked in descending order with ties sharing the smallest rank
    (``method='min'``), so the highest score gets rank 1. Ranks are then
    flipped and divided by the largest rank, which puts the best term at 1.0
    and lower-scoring terms closer to 0. Terms whose score is exactly 0 (not
    present in the document) are forced to 0.

    Parameters
    ----------
    row : pd.Series
        TF-IDF scores for a single document, indexed by term.

    Returns
    -------
    pd.Series
        New series with the same index holding the normalized ranks; the
        input series is not modified.
    """
    positions = row.rank(method='min', ascending=False)
    worst_position = positions.max()

    # Skip normalization when there is no positive maximum (e.g. an empty row yields NaN)
    if not worst_position > 0:
        flipped = positions
    else:
        flipped = (worst_position + 1 - positions) / worst_position

    # Absent terms all tie for last place; pin them to 0 instead of a small positive value
    flipped[row == 0] = 0
    return flipped

# Rank the terms within each document (row-wise), leaving the label column out of the ranking
term_scores = tfidf_df.drop(columns=['Genre'])
tfidf_ranks_df = term_scores.apply(assign_flipped_ranks, axis=1)

# Re-attach the label so the ranks can be split by genre
tfidf_ranks_df['Genre'] = tfidf_df['Genre']

### Aggregate word ranks

In [7]:
# Split the per-document ranks by label (1 = SF, 0 = non-SF) and drop the label column
genre_flag = tfidf_ranks_df['Genre']
sf_ranks = tfidf_ranks_df.loc[genre_flag == 1].drop(columns=['Genre'])
non_sf_ranks = tfidf_ranks_df.loc[genre_flag == 0].drop(columns=['Genre'])

# Total normalized rank of every word within each genre, largest first
sf_agg_ranks = sf_ranks.sum(axis=0).sort_values(ascending=False)
non_sf_agg_ranks = non_sf_ranks.sum(axis=0).sort_values(ascending=False)

# Turn each series into a two-column frame keyed by 'Word'
sf_agg_ranks_df = sf_agg_ranks.rename_axis('Word').reset_index(name='SF Total Rank')
non_sf_agg_ranks_df = non_sf_agg_ranks.rename_axis('Word').reset_index(name='Non-SF Total Rank')

# Side-by-side totals; a positive 'Difference' means the word ranks higher in SF documents
agg_ranks_df = sf_agg_ranks_df.merge(non_sf_agg_ranks_df, on='Word', how='outer')
agg_ranks_df['Difference'] = agg_ranks_df['SF Total Rank'] - agg_ranks_df['Non-SF Total Rank']

### Mann–Whitney U Test

In [8]:
# Two-sided Mann–Whitney U test per word: compares the word's per-document ranks
# in SF documents against its ranks in non-SF documents.
# NOTE(review): the p-values are stored as-is, with no multiple-comparison adjustment.
p_values = [
    mannwhitneyu(
        sf_ranks[term].values,
        non_sf_ranks[term].values,
        alternative='two-sided',
    ).pvalue
    for term in agg_ranks_df['Word']
]

# One p-value per row, in the same order as agg_ranks_df['Word']
agg_ranks_df['P-Value'] = p_values

In [104]:
# Keep words that lean towards SF (rank-sum difference above 3) and whose
# rank distributions differ between genres at the 0.05 level
leans_sf = agg_ranks_df['Difference'] > 3
is_significant = agg_ranks_df['P-Value'] < 0.05

filter_list = (
    agg_ranks_df[leans_sf & is_significant]
    .sort_values(by='Difference', ascending=False)
    .reset_index(drop=True)
)
filter_list

Unnamed: 0,Word,SF Total Rank,Non-SF Total Rank,Difference,P-Value
0,walls,11.094503,0.000000,11.094503,0.000005
1,light,12.777807,3.381029,9.396778,0.000584
2,dark,15.456163,6.455115,9.001048,0.004652
3,gods,8.555831,0.000000,8.555831,0.001197
4,shadow,8.536379,0.000000,8.536379,0.001197
...,...,...,...,...,...
67,voice,9.329313,5.174995,4.154318,0.021374
68,land,7.222127,3.220168,4.001959,0.023573
69,laughed,6.791881,2.949879,3.842002,0.044620
70,edge,6.834207,2.992428,3.841779,0.046654


In [523]:
# Average SF-minus-non-SF rank-sum difference across the selected words
filter_list['Difference'].mean()

5.89045446075564

# Logistic Regression Model

### Set up model

In [105]:
# Words that passed the rank-sum / significance filter become the model's vocabulary
top_terms = filter_list['Word'].values

# NOTE(review): top_terms were selected using the labels of the full corpus, and the
# IDF weights below are fit before the train/test split, so the held-out metrics
# reported later are likely optimistic.
vectorizer = TfidfVectorizer(vocabulary=top_terms)  # Restrict vectorization to top terms
selected_features = vectorizer.fit_transform(corpus_df['document'])

# A document whose TF-IDF row sums to zero contains none of the selected terms
row_totals = np.asarray(selected_features.sum(axis=1)).ravel()
zero_vector_docs = np.where(row_totals == 0)[0]  # positional indices of all-zero documents

if len(zero_vector_docs) > 0:
    print(f"Dropping documents with all-zero vectorization (no matching terms in top list): {zero_vector_docs}")
    # Filter by position with a boolean mask so the result does not depend on corpus_df's index labels
    select_corpus_df = corpus_df.loc[row_totals != 0].reset_index(drop=True)
    selected_features = vectorizer.fit_transform(select_corpus_df['document'])  # Re-vectorize without dropped docs
else:
    print("No documents with all-zero vectorization found.")
    # Nothing to drop: still define select_corpus_df so the code below works on this path too
    select_corpus_df = corpus_df.reset_index(drop=True)

# Dense feature frame, one column per selected term
selected_df = pd.DataFrame(selected_features.toarray(), columns=top_terms)

# Attach labels positionally; rows of selected_df and select_corpus_df are in the same order
selected_df['label'] = select_corpus_df['label'].to_numpy()

# Prepare X and y for training
X = selected_df.drop(columns='label')
y = selected_df['label']

# Stratified 70/30 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=60)

# Check the balanced split
print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])
print("Class distribution in training set:", y_train.value_counts(normalize=True))
print("Class distribution in test set:", y_test.value_counts(normalize=True))

Dropping documents with all-zero vectorization (no matching terms in top list): [13 33 83 84 90]
Training set size: 122
Test set size: 53
Class distribution in training set: label
1    0.508197
0    0.491803
Name: proportion, dtype: float64
Class distribution in test set: label
1    0.509434
0    0.490566
Name: proportion, dtype: float64


### Train model

In [151]:
# Fit the baseline classifier on the training split (fit returns the estimator itself)
logreg = LogisticRegression(C=2, max_iter=1000).fit(X_train, y_train)

# Hard predictions at the default decision threshold
y_pred = logreg.predict(X_test)

# Per-class precision / recall / F1 on the held-out split
genre_names = ["Non-Science Fiction", "Science Fiction & Fantasy"]
print(classification_report(y_test, y_pred, target_names=genre_names))

                           precision    recall  f1-score   support

      Non-Science Fiction       0.71      0.77      0.74        26
Science Fiction & Fantasy       0.76      0.70      0.73        27

                 accuracy                           0.74        53
                macro avg       0.74      0.74      0.74        53
             weighted avg       0.74      0.74      0.74        53



### Best Threshold

In [150]:
# Train the logistic regression model using the training set
final_model = LogisticRegression(max_iter=1000, random_state=42, C=2)
final_model.fit(X_train, y_train)

# Probability of class 1 (Science Fiction & Fantasy) for each test document
y_probs = final_model.predict_proba(X_test)[:, 1]

# Precision/recall at every candidate threshold
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)

# precision_recall_curve returns one more precision/recall pair than thresholds (the
# final point has no threshold), so only the first len(thresholds) pairs are scored.
candidate_precisions = precisions[:-1]
candidate_recalls = recalls[:-1]

# F1 per threshold. Where precision + recall == 0 the ratio would be 0/0 = NaN, and
# np.argmax would then select that NaN entry; treat F1 as 0 there instead.
f1_denominator = candidate_precisions + candidate_recalls
f1_scores = np.divide(
    2 * candidate_precisions * candidate_recalls,
    f1_denominator,
    out=np.zeros_like(f1_denominator, dtype=float),
    where=f1_denominator > 0,
)

# Choose the threshold with the highest F1
best_threshold_index = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_index]
print("Best threshold:", best_threshold)

# Make predictions using the new threshold
y_pred_adj = (y_probs >= best_threshold).astype(int)

# NOTE(review): the threshold is tuned on y_test and then evaluated on the same y_test,
# so the report below is an optimistic estimate of out-of-sample performance.
print("Adjusted Threshold Test Set Performance:")
print(classification_report(y_test, y_pred_adj, target_names=["Non-Science Fiction", "Science Fiction & Fantasy"]))


Best threshold: 0.4824012416738424
Adjusted Threshold Test Set Performance:
                           precision    recall  f1-score   support

      Non-Science Fiction       0.86      0.69      0.77        26
Science Fiction & Fantasy       0.75      0.89      0.81        27

                 accuracy                           0.79        53
                macro avg       0.80      0.79      0.79        53
             weighted avg       0.80      0.79      0.79        53



In [108]:
# Pair every selected term with its fitted coefficient, strongest positive weight first.
# Positive coefficients push the prediction towards class 1.
feature_importance = (
    pd.DataFrame({'Term': X.columns, 'Coefficient': logreg.coef_[0]})
    .sort_values(by='Coefficient', ascending=False)
    .reset_index(drop=True)
)

print("Top Predictive Terms for Science Fiction & Fantasy:")
print(feature_importance.head(20))

Top Predictive Terms for Science Fiction & Fantasy:
       Term  Coefficient
0      gods     1.257359
1      body     0.937182
2   watched     0.907494
3    joined     0.861240
4     magic     0.854315
5     metal     0.850970
6    shadow     0.834027
7     walls     0.775238
8      wind     0.736976
9    planet     0.698555
10     skin     0.694276
11     arms     0.680673
12    swept     0.652652
13      air     0.637564
14    flesh     0.592339
15     hurt     0.590782
16    enemy     0.586606
17    mouth     0.577916
18     blue     0.574546
19    sword     0.530381


## Predict Proba

In [179]:
# Read the reserved test set once and keep an untouched copy; reserved_df is the
# working frame that later cells filter, original_reserved_df keeps every row
original_reserved_df = pd.read_csv('../data/corpus/reserved_test_set.csv')

reserved_df = original_reserved_df.copy()

In [180]:
# Project the reserved documents onto the selected-term vocabulary with the already-fitted vectorizer
reserved_features = vectorizer.transform(reserved_df['document'])

# Positional indices of documents that contain none of the selected terms
zero_vector_docs = np.where(reserved_features.toarray().sum(axis=1) == 0)[0]

# Remember which documents are about to be excluded
removed_asins = reserved_df['asin'].iloc[zero_vector_docs].tolist()

if len(zero_vector_docs) == 0:
    print("No documents with all-zero vectorization found.")
else:
    print(f"Removing {len(zero_vector_docs)} documents with all-zero vectorization.")
    # Drop the empty documents, renumber the rows, and vectorize what is left
    reserved_df = reserved_df.drop(index=zero_vector_docs).reset_index(drop=True)
    reserved_features = vectorizer.transform(reserved_df['document'])

# Probability of class 1 (the main genre) from the final logistic regression model
reserved_probs = final_model.predict_proba(reserved_features)[:, 1]

Removing 7 documents with all-zero vectorization.




In [181]:
# Name of the output column that holds the predicted probability
main_genre = "Science_Fiction"
prob_col = f"{main_genre}_Prob"

# Scores for the documents that survived the all-zero filter, rounded to two decimals
reserved_df[prob_col] = reserved_probs.round(2)

# Start from the full reserved set with an empty probability column, then fill in
# the rows whose asin was scored; removed documents keep the None placeholder.
# NOTE(review): this relies on asin being unique and on row order being preserved
# between original_reserved_df and reserved_df — confirm for the input file.
original_reserved_df[prob_col] = None
scored_rows = original_reserved_df['asin'].isin(reserved_df['asin'])
original_reserved_df.loc[scored_rows, prob_col] = reserved_df[prob_col].values

# Persist the full set, scored and unscored rows alike
original_reserved_df.to_csv('../data/corpus/reserved_test_pred.csv', index=False)

In [187]:
import matplotlib.pyplot as plt