In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import mannwhitneyu
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_recall_curve
import numpy as np

In [3]:
# Load the labelled corpus (the preview shows `asin`, `document` and `label` columns)
corpus_df = pd.read_csv(filepath_or_buffer='../data/corpus/Romance.csv')
print(corpus_df.head(n=5))

         asin                                           document  label
0  B01I5J99HW  pinching his bearded cheek his brown hair was ...      1
1  B012DWPABG  the blare of a car horn drag jake from a sound...      1
2  B002V1C1NK  screens and keyboards though that may sound di...      0
3  B07LF84VVR  what in the name of hell is on your head arriv...      1
4  0008322651  it's meals on silver trays we went there to me...      0


# TF-IDF

## Rank Sum Approach

### Calculate TF-IDF score

In [4]:
# Pull the raw text and the class label out of the corpus frame
corpus = corpus_df['document']
labels = corpus_df['label']

# Fit TF-IDF over the whole corpus. The vocabulary is capped at 1000 terms;
# terms occurring in more than half of the documents or in fewer than 10
# documents are discarded, as are English stop words.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    max_df=0.5,
    max_features=1000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
terms_tfidf = tfidf_vectorizer.get_feature_names_out()

# One row per document, one column per term, plus the class label as 'Genre'
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=terms_tfidf)
tfidf_df['Genre'] = labels

In [5]:
# Display the document-by-term TF-IDF frame (last column is the 'Genre' label)
tfidf_df

Unnamed: 0,able,action,actual,actually,added,admit,afraid,age,ago,ahead,...,wouldn,writing,wrong,yeah,year,years,yes,york,young,Genre
0,0.000000,0.071464,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.051985,0.000000,0.000000,0.094968,0.0,0.000000,1
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,1
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.086782,0.000000,0.000000,0.050370,0.000000,0.000000,0.0,0.130739,0
3,0.000000,0.000000,0.000000,0.000000,0.071325,0.0,0.0,0.0,0.0,0.073977,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.051169,0.0,0.000000,1
4,0.059977,0.000000,0.077706,0.064093,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.063202,0.000000,0.000000,0.072992,0.000000,0.0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0.065002,0.082402,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.055273,0.000000,0.000000,0.000000,0.195525,0.000000,0.000000,0.0,0.000000,1
176,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.085019,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.058807,0.0,0.000000,0
177,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.068926,0.0,0.000000,1
178,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.059438,0.048096,0.000000,0.0,0.000000,0


### Assign & normalize word ranks

In [6]:
# Per-document rank normalisation of TF-IDF scores
def assign_flipped_ranks(row):
    """Convert one document's TF-IDF scores into normalised ranks.

    Scores are ranked in descending order with ties sharing the smallest rank,
    then flipped and divided by the largest rank so that the best-scoring term
    gets 1.0 and lower-scoring terms get values closer to 0. Terms whose score
    is exactly 0 are forced to 0.

    Parameters
    ----------
    row : pandas.Series
        TF-IDF scores of a single document, indexed by term.

    Returns
    -------
    pandas.Series
        Normalised ranks with the same index as ``row``.
    """
    descending = row.rank(ascending=False, method='min')  # rank 1 = highest score
    worst = descending.max()
    if worst > 0:
        # Flip so that rank 1 maps to 1.0 and the worst rank maps to 1 / worst
        flipped = (worst + 1 - descending) / worst
    else:
        flipped = descending
    # Terms absent from the document must not contribute any rank mass
    flipped[row == 0] = 0
    return flipped

# Rank every document's terms row by row, then re-attach the class label
tfidf_ranks_df = (
    tfidf_df
    .drop(columns=['Genre'])
    .apply(assign_flipped_ranks, axis='columns')
)
tfidf_ranks_df['Genre'] = tfidf_df['Genre']

### Aggregate word ranks

In [7]:
# Split the per-document ranks by class label (1 vs 0) and drop the label column.
# NOTE(review): the "SF" naming looks carried over from another genre's notebook;
# here label 1 is presumably the Romance class — confirm before renaming.
sf_ranks = tfidf_ranks_df.loc[tfidf_ranks_df['Genre'].eq(1)].drop(columns=['Genre'])
non_sf_ranks = tfidf_ranks_df.loc[tfidf_ranks_df['Genre'].eq(0)].drop(columns=['Genre'])

# Total normalised rank of every word within each class, largest first
sf_agg_ranks = sf_ranks.sum(axis=0).sort_values(ascending=False)
non_sf_agg_ranks = non_sf_ranks.sum(axis=0).sort_values(ascending=False)

# Turn each aggregate into a two-column frame keyed by 'Word'
sf_agg_ranks_df = sf_agg_ranks.rename_axis('Word').reset_index(name='SF Total Rank')
non_sf_agg_ranks_df = non_sf_agg_ranks.rename_axis('Word').reset_index(name='Non-SF Total Rank')

# Join the two classes on the word and compute how much more rank mass label 1 has
agg_ranks_df = sf_agg_ranks_df.merge(non_sf_agg_ranks_df, on='Word', how='outer')
agg_ranks_df['Difference'] = agg_ranks_df['SF Total Rank'].sub(agg_ranks_df['Non-SF Total Rank'])

### Mann–Whitney U Test

In [8]:
# Two-sided Mann–Whitney U test per word, comparing the word's per-document
# ranks in the label-1 documents against those in the label-0 documents.
# Only the p-value is kept; results follow the row order of agg_ranks_df.
p_values = [
    mannwhitneyu(
        sf_ranks[term].values,
        non_sf_ranks[term].values,
        alternative='two-sided',
    ).pvalue
    for term in agg_ranks_df['Word']
]

# Attach the p-values next to the aggregated ranks
agg_ranks_df['P-Value'] = p_values

In [9]:
# Keep words whose rank difference exceeds 2 and whose p-value is below 0.009,
# ordered from the largest difference downwards
filter_list = (
    agg_ranks_df
    .loc[agg_ranks_df['Difference'].gt(2) & agg_ranks_df['P-Value'].lt(0.009)]
    .sort_values(by='Difference', ascending=False)
    .reset_index(drop=True)
)
filter_list

Unnamed: 0,Word,SF Total Rank,Non-SF Total Rank,Difference,P-Value
0,eyes,33.959137,8.367844,25.591293,9.304057e-11
1,wasn,23.735778,6.481327,17.254451,1.015394e-06
2,hand,20.608794,5.714857,14.893937,1.654684e-06
3,face,22.536238,9.031123,13.505115,1.351366e-03
4,heart,15.429550,2.806908,12.622642,3.003597e-05
...,...,...,...,...,...
76,let,12.524833,7.446021,5.078812,3.388412e-03
77,gone,8.088036,3.078665,5.009371,3.279152e-03
78,voice,11.099826,6.716390,4.383436,2.900038e-03
79,ago,8.364998,5.163137,3.201861,7.203511e-03


In [9]:
# Average rank difference across the selected words
filter_list.loc[:, 'Difference'].mean()

6.431518026277569

# Logistic Regression Model

### Set up model

In [10]:
# Words that passed the rank-difference / p-value filter become the feature set.
# NOTE(review): filter_list was computed from the full corpus, i.e. before the
# train/test split below, so the held-out documents influenced feature selection.
top_terms = filter_list['Word'].values

# Vectorize the original corpus, restricted to the selected terms
vectorizer = TfidfVectorizer(vocabulary=top_terms)
selected_features = vectorizer.fit_transform(corpus_df['document'])

# Row positions of documents that contain none of the selected terms
zero_vector_docs = np.where(selected_features.toarray().sum(axis=1) == 0)[0]

# Drop documents with all-zero vectorization
if len(zero_vector_docs) > 0:
    print(f"Dropping documents with all-zero vectorization (no matching terms in top list): {zero_vector_docs}")
    # zero_vector_docs holds positions; translate them to index labels so the
    # drop is correct even if corpus_df does not carry a default RangeIndex.
    select_corpus_df = corpus_df.drop(index=corpus_df.index[zero_vector_docs]).reset_index(drop=True)
    selected_features = vectorizer.fit_transform(select_corpus_df['document'])  # Re-vectorize without dropped docs
else:
    print("No documents with all-zero vectorization found.")
    # Nothing was dropped: keep every document. Without this assignment the
    # label lookup below would fail with a NameError on a fresh kernel.
    select_corpus_df = corpus_df.reset_index(drop=True)

# Create DataFrame with selected features (one column per top term)
selected_df = pd.DataFrame(selected_features.toarray(), columns=top_terms)

# Add genre labels (both frames share a fresh 0..n-1 index, so rows line up)
selected_df['label'] = select_corpus_df['label']

# Prepare X and y for training
X = selected_df.drop(columns='label')
y = selected_df['label']

# Stratified 70/30 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=55)

# Check the balanced split
print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])
print("Class distribution in training set:", y_train.value_counts(normalize=True))
print("Class distribution in test set:", y_test.value_counts(normalize=True))

Dropping documents with all-zero vectorization (no matching terms in top list): [114 151]
Training set size: 124
Test set size: 54
Class distribution in training set: label
0    0.5
1    0.5
Name: proportion, dtype: float64
Class distribution in test set: label
1    0.5
0    0.5
Name: proportion, dtype: float64


### Train model

In [29]:
# Fit a logistic-regression classifier on the selected-term TF-IDF features
logreg = LogisticRegression(max_iter=1000, C = 3)
logreg.fit(X_train, y_train)

# Predict hard labels for the held-out split
y_pred = logreg.predict(X_test)

# Evaluate the model. target_names follow sorted label order (0, then 1).
# This notebook reads Romance.csv and later stores the class-1 probability as
# "Romance_Prob", so class 1 is reported as Romance rather than the
# "Science Fiction & Fantasy" wording left over from another genre's notebook.
print(classification_report(y_test, y_pred, target_names=["Non-Romance", "Romance"]))

                           precision    recall  f1-score   support

      Non-Science Fiction       0.77      0.89      0.83        27
Science Fiction & Fantasy       0.87      0.74      0.80        27

                 accuracy                           0.81        54
                macro avg       0.82      0.81      0.81        54
             weighted avg       0.82      0.81      0.81        54



In [26]:
# Train the logistic regression model using the training set
final_model = LogisticRegression(max_iter=1000, random_state=42, C = 3)
final_model.fit(X_train, y_train)

# Predicted probability of class 1 for every test document
y_probs = final_model.predict_proba(X_test)[:, 1]

# Precision/recall at every candidate threshold
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)

# precision_recall_curve returns one more precision/recall point than it
# returns thresholds: the final (precision=1, recall=0) point has no threshold,
# so it is excluded to keep the argmax a valid index into `thresholds`.
candidate_precisions = precisions[:-1]
candidate_recalls = recalls[:-1]

# F1 per threshold. Where precision + recall == 0 the score is defined as 0;
# a plain division would yield NaN there, and np.argmax would pick that NaN.
f1_denominator = candidate_precisions + candidate_recalls
f1_scores = np.divide(
    2 * candidate_precisions * candidate_recalls,
    f1_denominator,
    out=np.zeros_like(f1_denominator, dtype=float),
    where=f1_denominator > 0,
)

# Choose the threshold with the highest F1.
# NOTE(review): the threshold is tuned on the same test split it is evaluated
# on below, so the adjusted scores are optimistic.
best_threshold_index = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_index]
print("Best threshold:", best_threshold)

# Make predictions using the new threshold
y_pred_adj = (y_probs >= best_threshold).astype(int)

# Evaluate performance using the adjusted threshold. Class 1 is the Romance
# class in this notebook (Romance.csv / "Romance_Prob"), so it is labelled so.
print("Adjusted Threshold Test Set Performance:")
print(classification_report(y_test, y_pred_adj, target_names=["Non-Romance", "Romance"]))

Best threshold: 0.45494394726628934
Adjusted Threshold Test Set Performance:
                           precision    recall  f1-score   support

      Non-Science Fiction       0.86      0.89      0.87        27
Science Fiction & Fantasy       0.88      0.85      0.87        27

                 accuracy                           0.87        54
                macro avg       0.87      0.87      0.87        54
             weighted avg       0.87      0.87      0.87        54



In [30]:
# Analyze feature importance: pair every feature column with its coefficient
# from the fitted `logreg` model
feature_importance = pd.DataFrame({
    'Term': X.columns,
    'Coefficient': logreg.coef_[0]
})

# Largest positive coefficients first, i.e. terms pushing hardest towards class 1
feature_importance = feature_importance.sort_values(by='Coefficient', ascending=False).reset_index(drop=True)

# Class 1 is the Romance class in this notebook (Romance.csv / "Romance_Prob");
# the previous heading named a different genre.
print("Top Predictive Terms for Romance:")
print(feature_importance.head(20))

Top Predictive Terms for Science Fiction & Fantasy:
          Term  Coefficient
0         eyes     2.219343
1         hell     2.197138
2         love     2.174293
3      laughed     1.763326
4         arms     1.466400
5        shook     1.418833
6         wasn     1.396065
7        tears     1.310844
8      sitting     1.306171
9   expression     1.296119
10        tell     1.269924
11      opened     1.268675
12         hey     1.249319
13        kept     1.245219
14         eye     1.239224
15      tongue     1.210864
16        kiss     1.194850
17        yeah     1.179498
18       happy     1.174306
19          oh     1.150359


## Predict Proba

In [31]:
# Read the reserved test set; this same file is overwritten with the
# predictions at the end of the notebook
reserved_df = pd.read_csv(filepath_or_buffer='../data/corpus/reserved_test_pred.csv')

# Keep an untouched copy so rows filtered out later can still be written back
original_reserved_df = reserved_df.copy(deep=True)

In [32]:
# Vectorize the reserved test set with the already-fitted vectorizer
# (fixed top-term vocabulary, IDF weights from the training corpus)
reserved_features = vectorizer.transform(reserved_df['document'])

# Row positions of reserved documents that contain none of the selected terms
zero_vector_docs = np.where(reserved_features.toarray().sum(axis=1) == 0)[0]

# Store ASINs of removed documents
removed_asins = reserved_df.iloc[zero_vector_docs]['asin'].tolist()

# Drop documents with all-zero vectorization
if len(zero_vector_docs) > 0:
    print(f"Removing {len(zero_vector_docs)} documents with all-zero vectorization.")
    # zero_vector_docs holds positions; translate them to index labels so the
    # drop is correct even if reserved_df does not carry a default RangeIndex.
    reserved_df = reserved_df.drop(index=reserved_df.index[zero_vector_docs]).reset_index(drop=True)
    reserved_features = vectorizer.transform(reserved_df['document'])  # Re-vectorize remaining documents
else:
    print("No documents with all-zero vectorization found.")

# final_model was fitted on a DataFrame with named columns (X). Predicting on
# a frame with the same column names keeps the input consistent with training
# and lets scikit-learn check the feature names instead of warning about them.
reserved_X = pd.DataFrame(reserved_features.toarray(), columns=X.columns)

# Predict probabilities using the final logistic regression model
reserved_probs = final_model.predict_proba(reserved_X)[:, 1]  # Probability for class 1 (main genre)

Removing 1 documents with all-zero vectorization.




In [33]:
# Add predictions to the filtered reserved_df
main_genre = "Romance"
prob_column = f"{main_genre}_Prob"
reserved_df[prob_column] = reserved_probs.round(2)

# Rows of the untouched copy that survived the all-zero filter, identified by
# position (zero_vector_docs holds the dropped positions). Matching on 'asin'
# instead would misalign or fail if an ASIN occurs more than once.
kept_rows = np.ones(len(original_reserved_df), dtype=bool)
kept_rows[zero_vector_docs] = False
if kept_rows.sum() != len(reserved_df):
    # Typically means the vectorization cell was re-run after rows were dropped
    raise ValueError(
        "Filtered reserved set no longer lines up with the original; "
        "re-run the notebook from the cell that loads the reserved test set."
    )

# Reinsert removed documents with a missing-value placeholder (NaN keeps the
# column numeric and is written to CSV as an empty field, like None was)
original_reserved_df[prob_column] = np.nan
original_reserved_df.loc[kept_rows, prob_column] = reserved_df[prob_column].to_numpy()

# Save the updated DataFrame to CSV (overwrites the file the set was read from)
original_reserved_df.to_csv('../data/corpus/reserved_test_pred.csv', index=False)