In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import mannwhitneyu
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_recall_curve
import numpy as np

In [3]:
# Read the labelled corpus (asin, document, label) and preview its first five rows
corpus_df = pd.read_csv('../data/corpus/History.csv')
print(corpus_df.head(n=5))

         asin                                           document  label
0  B00F8JE9ZA  our nation and the world breathed a sigh of re...      1
1  B07BN1459L  he took a far more active role in saving both ...      1
2  B07KL64VDH  chapter two identifying what you want the impo...      0
3  B00QFSRWWU  next time you're in the city so nice they name...      1
4  B002ZP8M44  i have been asked why i as a latina am writing...      0


# TF-IDF

## Rank Sum Approach

### Calculate TF-IDF score

In [4]:
# Pull the raw text and its 0/1 genre label out of the corpus frame
corpus = corpus_df['document']
labels = corpus_df['label']

# Fit TF-IDF on the corpus. English stop words are removed, terms appearing in
# fewer than 10 documents or in more than half of them are ignored, and the
# vocabulary is capped at 1000 terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    max_df=0.5,
    max_features=1000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
terms_tfidf = tfidf_vectorizer.get_feature_names_out()

# Dense documents-by-terms score table with the label appended as 'Genre'
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=terms_tfidf).assign(Genre=labels)

In [5]:
# Inspect the TF-IDF table: one row per document, one column per term, plus 'Genre'
tfidf_df

Unnamed: 0,able,according,act,action,actions,actually,added,advance,age,ago,...,written,wrong,wrote,yeah,year,years,yes,york,young,Genre
0,0.000000,0.049671,0.046961,0.099341,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.023704,0.000000,0.000000,0.036491,1
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.040958,0.000000,0.000000,0.000000,1
2,0.035199,0.000000,0.000000,0.000000,0.000000,0.039249,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.059613,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.033985,0.000000,0.252403,0.000000,1
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.037536,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.118209,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
176,0.000000,0.086250,0.163091,0.000000,0.000000,0.000000,0.086250,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.057957,0.000000,0.000000,0.000000,0.000000,0
177,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.192724,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
178,0.000000,0.000000,0.000000,0.000000,0.073845,0.000000,0.075305,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.080407,0.000000,0.000000,0.066727,0.000000,0.000000,0


### Assign & normalize word ranks

In [6]:
def assign_flipped_ranks(row):
    """Convert one document's TF-IDF scores into normalized ranks on a 0-1 scale.

    Scores are ranked in descending order (ties share the smallest rank), then
    flipped and divided by the largest rank so that the top-scoring term maps
    to 1. Terms whose score is exactly 0, i.e. absent from the document, are
    forced to 0.

    Parameters
    ----------
    row : pandas.Series
        Scores for a single document, indexed by term.

    Returns
    -------
    pandas.Series
        Normalized ranks carrying the same index as ``row``.
    """
    positions = row.rank(method='min', ascending=False)  # 1 = highest score
    worst_position = positions.max()
    if worst_position > 0:
        # Flip so that a better rank gives a value closer to 1
        positions = (worst_position + 1 - positions) / worst_position
    # Words not present in the document get rank 0
    return positions.mask(row == 0, 0)

# Rank each document's terms row by row, then carry the genre label across
tfidf_ranks_df = (
    tfidf_df
    .drop(columns='Genre')
    .apply(assign_flipped_ranks, axis='columns')
    .assign(Genre=tfidf_df['Genre'])
)

### Aggregate word ranks

In [7]:
# Split the rank table by class label (1 vs 0) and discard the label column
sf_ranks = tfidf_ranks_df.loc[tfidf_ranks_df['Genre'].eq(1)].drop(columns='Genre')
non_sf_ranks = tfidf_ranks_df.loc[tfidf_ranks_df['Genre'].eq(0)].drop(columns='Genre')

# Per-word rank totals within each class, largest first
sf_agg_ranks = sf_ranks.sum(axis='index').sort_values(ascending=False)
non_sf_agg_ranks = non_sf_ranks.sum(axis='index').sort_values(ascending=False)

# Two-column tables (word, total) for each class
sf_agg_ranks_df = sf_agg_ranks.rename_axis('Word').reset_index(name='SF Total Rank')
non_sf_agg_ranks_df = non_sf_agg_ranks.rename_axis('Word').reset_index(name='Non-SF Total Rank')

# Join the two tables on the word and record how far apart the totals are
agg_ranks_df = sf_agg_ranks_df.merge(non_sf_agg_ranks_df, on='Word', how='outer')
agg_ranks_df['Difference'] = agg_ranks_df['SF Total Rank'].sub(agg_ranks_df['Non-SF Total Rank'])

### Mann–Whitney U Test

In [8]:
# Two-sided Mann–Whitney U test for every word, comparing its per-document
# ranks in the label-1 group against the label-0 group; only the p-value is kept
p_values = [
    mannwhitneyu(
        sf_ranks[term].to_numpy(),
        non_sf_ranks[term].to_numpy(),
        alternative='two-sided',
    )[1]
    for term in agg_ranks_df['Word']
]

# Attach the p-values; they are in the same word order as agg_ranks_df
agg_ranks_df['P-Value'] = p_values

In [9]:
# Keep words whose rank total is more than 4 higher in the label-1 group and
# whose Mann–Whitney p-value is below 0.03, ordered by the size of the gap
filter_list = (
    agg_ranks_df
    .loc[agg_ranks_df['Difference'].gt(4) & agg_ranks_df['P-Value'].lt(0.03)]
    .sort_values(by='Difference', ascending=False)
    .reset_index(drop=True)
)
filter_list

Unnamed: 0,Word,SF Total Rank,Non-SF Total Rank,Difference,P-Value
0,war,29.103534,4.812152,24.291382,6.258497e-09
1,history,20.002698,2.344067,17.658631,6.382283e-07
2,american,20.939631,3.661125,17.278506,1.322949e-05
3,great,21.192140,5.886474,15.305665,8.562058e-05
4,century,15.164339,1.086311,14.078027,2.817258e-06
...,...,...,...,...,...
110,vast,7.171570,2.019517,5.152053,1.572039e-02
111,ninety,8.906931,3.824966,5.081965,1.934698e-02
112,caused,6.412800,1.375403,5.037397,2.969431e-02
113,number,9.055500,4.225113,4.830388,9.739089e-03


In [39]:
# Average rank-total difference across the words that passed the filter
filter_list['Difference'].mean()

7.240091802088993

# Logistic Regression Model

### Set up model

In [10]:
# Words that passed the rank-difference / p-value filter become the model vocabulary.
# NOTE(review): these words were selected using the labels of the full corpus, so the
# held-out split created below is not independent of the feature-selection step.
top_terms = filter_list['Word'].values

# Vectorize the original corpus, restricted to the selected terms
vectorizer = TfidfVectorizer(vocabulary=top_terms)
selected_features = vectorizer.fit_transform(corpus_df['document'])

# Row positions of documents that contain none of the selected terms
zero_vector_docs = np.where(selected_features.toarray().sum(axis=1) == 0)[0]

if len(zero_vector_docs) > 0:
    print(f"Dropping documents with all-zero vectorization (no matching terms in top list): {zero_vector_docs}")
    # zero_vector_docs holds row positions; translate them to index labels so the
    # drop is correct even if corpus_df does not carry a default 0..n-1 index
    select_corpus_df = corpus_df.drop(index=corpus_df.index[zero_vector_docs]).reset_index(drop=True)
    # Re-fit so the IDF weights reflect only the retained documents
    selected_features = vectorizer.fit_transform(select_corpus_df['document'])
else:
    print("No documents with all-zero vectorization found.")
    # Nothing was dropped, but select_corpus_df must still exist for the label
    # lookup below (it was previously undefined on this path -> NameError)
    select_corpus_df = corpus_df.reset_index(drop=True)

# Feature table: one row per retained document, one column per selected term
selected_df = pd.DataFrame(selected_features.toarray(), columns=top_terms)

# Add genre labels (both frames share a fresh 0..n-1 index, so rows line up)
selected_df['label'] = select_corpus_df['label']

# Prepare X and y for training
X = selected_df.drop(columns='label')
y = selected_df['label']

# Stratified 70/30 train-test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Check the balanced split
print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])
print("Class distribution in training set:", y_train.value_counts(normalize=True))
print("Class distribution in test set:", y_test.value_counts(normalize=True))

Dropping documents with all-zero vectorization (no matching terms in top list): [ 56 140]
Training set size: 124
Test set size: 54
Class distribution in training set: label
1    0.508065
0    0.491935
Name: proportion, dtype: float64
Class distribution in test set: label
0    0.5
1    0.5
Name: proportion, dtype: float64


### Train model

In [21]:
# Train a logistic regression model on the training split
logreg = LogisticRegression(max_iter=1000, C = 2)
logreg.fit(X_train, y_train)

# Predict hard labels for the held-out split (default 0.5 decision threshold)
y_pred = logreg.predict(X_test)

# Evaluate the model. This notebook models the History corpus, where label 1 is
# the History genre, so the report names the classes accordingly (the previous
# "Science Fiction & Fantasy" names were left over from another genre).
print(classification_report(y_test, y_pred, target_names=["Non-History", "History"]))

                           precision    recall  f1-score   support

      Non-Science Fiction       0.89      0.89      0.89        27
Science Fiction & Fantasy       0.89      0.89      0.89        27

                 accuracy                           0.89        54
                macro avg       0.89      0.89      0.89        54
             weighted avg       0.89      0.89      0.89        54



### Best Threshold

In [23]:
# Train the logistic regression model using the training set
final_model = LogisticRegression(max_iter=1000, random_state=42, C = 2)
final_model.fit(X_train, y_train)

# Predicted probability of class 1 for every test document
y_probs = final_model.predict_proba(X_test)[:, 1]

# Precision-recall pairs for each candidate threshold
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)

# precision_recall_curve returns one more (precision, recall) pair than thresholds:
# the final pair (precision=1, recall=0) has no threshold, so it is excluded.
# Points where precision + recall == 0 get F1 = 0 instead of NaN, because
# np.argmax would otherwise select the NaN entry.
pr_sum = precisions[:-1] + recalls[:-1]
f1_scores = np.divide(
    2 * precisions[:-1] * recalls[:-1],
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# Choose the threshold with the highest F1.
# NOTE(review): the threshold is tuned on the same test split it is evaluated on
# below, so the adjusted-threshold report is an optimistic estimate.
best_threshold_index = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_index]
print("Best threshold:", best_threshold)

# Make predictions using the new threshold
y_pred_adj = (y_probs >= best_threshold).astype(int)

# Evaluate performance using the adjusted threshold; label 1 is the History genre
print("Adjusted Threshold Test Set Performance:")
print(classification_report(y_test, y_pred_adj, target_names=["Non-History", "History"]))

Best threshold: 0.4689019400101556
Adjusted Threshold Test Set Performance:
                           precision    recall  f1-score   support

      Non-Science Fiction       0.96      0.85      0.90        27
Science Fiction & Fantasy       0.87      0.96      0.91        27

                 accuracy                           0.91        54
                macro avg       0.91      0.91      0.91        54
             weighted avg       0.91      0.91      0.91        54



In [27]:
# Analyze feature importance: pair every selected term with its fitted
# logistic-regression coefficient (positive values push towards label 1)
feature_importance = pd.DataFrame({
    'Term': X.columns,
    'Coefficient': logreg.coef_[0]
})

# Largest positive coefficients first
feature_importance = feature_importance.sort_values(by='Coefficient', ascending=False).reset_index(drop=True)

# Label 1 is the History genre in this notebook, so the heading names it
# (it previously read "Science Fiction & Fantasy")
print("Top Predictive Terms for History:")
print(feature_importance.head(20))

Top Predictive Terms for Science Fiction & Fantasy:
         Term  Coefficient
0      german     1.686211
1         war     1.612857
2       roman     1.558903
3        rome     1.453823
4      europe     1.428229
5      modern     1.400102
6      eighty     1.353663
7       south     1.283349
8    virginia     1.261024
9    eighteen     1.203458
10     nation     1.190980
11       west     1.185050
12     france     1.138338
13    western     1.106020
14      enemy     1.075418
15   directly     1.019639
16        lee     1.006930
17     empire     0.977718
18  americans     0.968687
19      april     0.953855


## Predict Probabilities

In [24]:
# Read the reserved prediction sheet; this same file is overwritten at the end
# of the notebook once the new probability column has been added
reserved_df = pd.read_csv('../data/corpus/reserved_test_pred.csv')

# Untouched snapshot, so rows later removed from reserved_df can still be written back
original_reserved_df = reserved_df.copy(deep=True)

In [25]:
# Project the reserved documents onto the vocabulary and IDF weights fitted earlier
reserved_features = vectorizer.transform(reserved_df['document'])

# Row positions of documents that contain none of the vocabulary terms
zero_vector_docs = np.flatnonzero(reserved_features.toarray().sum(axis=1) == 0)

# Remember which ASINs are left out of scoring
removed_asins = reserved_df['asin'].iloc[zero_vector_docs].tolist()

if len(zero_vector_docs) == 0:
    print("No documents with all-zero vectorization found.")
else:
    print(f"Removing {len(zero_vector_docs)} documents with all-zero vectorization.")
    # Drop the empty documents, renumber the rows, and vectorize what remains
    reserved_df = reserved_df.drop(index=zero_vector_docs).reset_index(drop=True)
    reserved_features = vectorizer.transform(reserved_df['document'])

# Probability of class 1 (the main genre) according to the final model
reserved_probs = final_model.predict_proba(reserved_features)[:, 1]

Removing 2 documents with all-zero vectorization.




In [26]:
# Name of the probability column for this genre
main_genre = "History"
prob_column = f"{main_genre}_Prob"

# Scores for the documents that were kept, rounded to two decimals
reserved_df[prob_column] = reserved_probs.round(2)

# Start every row of the full sheet with a placeholder, then fill in the rows
# whose ASIN was scored; documents that were removed keep the placeholder
original_reserved_df[prob_column] = None
scored_rows = original_reserved_df['asin'].isin(reserved_df['asin'])
original_reserved_df.loc[scored_rows, prob_column] = reserved_df[prob_column].to_numpy()

# Write the sheet back over the file it was loaded from
original_reserved_df.to_csv('../data/corpus/reserved_test_pred.csv', index=False)