In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import mannwhitneyu
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_recall_curve
import numpy as np

In [2]:
# Load the labelled corpus for this genre; the preview printed below shows the
# 'asin', 'document' and 'label' columns that the rest of the notebook relies on.
corpus_df = pd.read_csv('../data/corpus/Business_&_Careers.csv')
# Print the first rows as a quick sanity check of the load
print(corpus_df.head())

         asin                                           document  label
0  B07MM6CM86  or that many individual contributors rely on t...      1
1  B01N7J3RQ6  country given that business and india has tend...      1
2  B00IPNN1XW  a lifetime is embedded it does not float free ...      0
3  B072KL8WX6  written by juliet adams f c i p d technical ed...      1
4  B00C3GZ8Y2  nowhere we chatted for awhile all i wanted to ...      0


# TF-IDF

## Rank Sum Approach

### Calculate TF-IDF score

In [3]:
# Pull the raw text and its class label out of the corpus frame
corpus = corpus_df['document']
labels = corpus_df['label']

# Fit TF-IDF on the whole corpus. English stop words are removed, and a term
# is kept only if it occurs in at least 10 documents and in at most half of
# them; the vocabulary is capped at 1000 terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    max_df=0.5,
    max_features=1000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
terms_tfidf = tfidf_vectorizer.get_feature_names_out()

# Dense document-by-term frame of scores, with the class label appended as 'Genre'
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=terms_tfidf)
tfidf_df['Genre'] = labels

In [4]:
# Inspect the TF-IDF frame: one row per document, one column per retained term,
# plus the trailing 'Genre' label column.
tfidf_df

Unnamed: 0,ability,able,absolutely,account,achieve,act,action,actually,advantage,advice,...,writing,written,wrong,wrote,yeah,year,years,yes,young,Genre
0,0.000000,0.035576,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,...,0.000000,0.042452,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,1
1,0.000000,0.048405,0.000000,0.0,0.063921,0.00000,0.0,0.050046,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,1
2,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,...,0.159333,0.000000,0.000000,0.0,0.00000,0.000000,0.042907,0.074272,0.000000,0
3,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.061474,...,0.056594,0.056594,0.000000,0.0,0.00000,0.000000,0.091443,0.000000,0.000000,1
4,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.071503,...,0.065828,0.000000,0.000000,0.0,0.00000,0.101739,0.035454,0.000000,0.057036,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0.000000,0.000000,0.000000,0.0,0.000000,0.06491,0.0,0.055385,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,1
176,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.06678,0.000000,0.033112,0.000000,0.053269,0
177,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.055152,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,1
178,0.000000,0.000000,0.076328,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.120872,0.056168,0


### Assign & normalize word ranks

In [5]:
# Convert one document's TF-IDF scores into normalized, flipped ranks
def assign_flipped_ranks(row):
    """Rank the terms of a single document and rescale the ranks to [0, 1].

    Parameters
    ----------
    row : pandas.Series
        TF-IDF scores of one document, indexed by term.

    Returns
    -------
    pandas.Series
        Same index as ``row``. The highest-scoring term gets 1.0, lower
        scores get proportionally smaller values, and every term whose
        score is exactly 0 (absent from the document) gets 0.

    Notes
    -----
    Ties share the smallest rank of their group (``method='min'``). The
    zero-score terms take part in the ranking as one tied group at the
    bottom, so when any are present the normalizing constant is that
    group's rank.
    """
    descending_ranks = row.rank(method='min', ascending=False)  # rank 1 = highest score
    worst_rank = descending_ranks.max()

    if worst_rank > 0:
        # Flip so that larger means better, then scale into (0, 1]
        flipped = (worst_rank + 1 - descending_ranks) / worst_rank
    else:
        flipped = descending_ranks

    # Terms that do not occur in the document carry no rank at all
    absent_terms = row == 0
    flipped[absent_terms] = 0
    return flipped

# Rank the terms within every document (row-wise), then put the label column back
term_scores = tfidf_df.drop(columns=['Genre'])
tfidf_ranks_df = term_scores.apply(assign_flipped_ranks, axis=1)
tfidf_ranks_df['Genre'] = tfidf_df['Genre']

### Aggregate word ranks

In [6]:
# Split the per-document ranks by class label.
# The `sf_` prefix denotes label 1 and `non_sf_` denotes label 0.
label_one_rows = tfidf_ranks_df['Genre'] == 1
label_zero_rows = tfidf_ranks_df['Genre'] == 0
sf_ranks = tfidf_ranks_df[label_one_rows].drop(columns=['Genre'])
non_sf_ranks = tfidf_ranks_df[label_zero_rows].drop(columns=['Genre'])

# Per-term rank sum within each class, largest first
sf_agg_ranks = sf_ranks.sum(axis=0).sort_values(ascending=False)
non_sf_agg_ranks = non_sf_ranks.sum(axis=0).sort_values(ascending=False)

# Two-column frames (term, total) so the classes can be joined on the term
sf_agg_ranks_df = pd.DataFrame({
    'Word': sf_agg_ranks.index,
    'SF Total Rank': sf_agg_ranks.values,
})
non_sf_agg_ranks_df = pd.DataFrame({
    'Word': non_sf_agg_ranks.index,
    'Non-SF Total Rank': non_sf_agg_ranks.values,
})

# Side-by-side totals; a positive Difference means the term ranks higher in label-1 documents
agg_ranks_df = sf_agg_ranks_df.merge(non_sf_agg_ranks_df, on='Word', how='outer')
agg_ranks_df['Difference'] = agg_ranks_df['SF Total Rank'] - agg_ranks_df['Non-SF Total Rank']

### Mann–Whitney U Test

In [7]:
# Two-sided Mann–Whitney U test per term: do the per-document ranks of the
# term differ between the label-1 and label-0 documents?
p_values = [
    mannwhitneyu(
        sf_ranks[term].values,
        non_sf_ranks[term].values,
        alternative='two-sided',
    ).pvalue
    for term in agg_ranks_df['Word']
]

# Store the p-values next to the aggregated ranks (same row order as 'Word')
agg_ranks_df['P-Value'] = p_values

In [121]:
# Keep terms that rank clearly higher in label-1 documents (Difference > 5)
# and whose rank distributions differ significantly (p < 0.05), strongest first.
clearly_higher = agg_ranks_df['Difference'] > 5
significant = agg_ranks_df['P-Value'] < 0.05
filter_list = (
    agg_ranks_df[clearly_higher & significant]
    .sort_values(by='Difference', ascending=False)
    .reset_index(drop=True)
)
filter_list

Unnamed: 0,Word,SF Total Rank,Non-SF Total Rank,Difference,P-Value
0,business,29.596785,2.244200,27.352585,3.612127e-11
1,book,25.883688,3.881947,22.001742,1.105147e-09
2,want,24.942569,6.531943,18.410627,1.395350e-03
3,companies,18.418359,1.011325,17.407035,3.929268e-09
4,company,18.883007,2.383675,16.499332,6.617022e-08
...,...,...,...,...,...
166,driving,6.426507,1.257387,5.169120,2.720250e-02
167,cases,6.809036,1.643788,5.165248,3.021557e-02
168,reach,7.000274,1.910331,5.089943,2.075244e-02
169,example,8.428997,3.370034,5.058963,3.262913e-03


In [9]:
# Mean rank-sum difference across the retained terms. A stricter cut-off
# (Difference > 10 or 12) was considered, but it leaves too short a term list.
filter_list['Difference'].mean()

7.116505802783105

# Logistic Regression Model

### Set up model

In [122]:
# Use (at most) the 100 terms with the largest rank-sum difference as model features
top_terms = filter_list['Word'].head(100).values

# Vectorize the original corpus, restricting the vocabulary to the selected terms.
# NOTE(review): the term list and the IDF weights are both derived from the full
# corpus before the train/test split below, so the held-out scores are likely
# optimistic. Consider selecting terms and fitting the vectorizer on the
# training rows only.
vectorizer = TfidfVectorizer(vocabulary=top_terms)
selected_features = vectorizer.fit_transform(corpus_df['document'])

# Row positions of documents that contain none of the selected terms
zero_vector_docs = np.where(selected_features.toarray().sum(axis=1) == 0)[0]

# Start from the full corpus with a positional index so that
#   (a) `select_corpus_df` is defined even when nothing has to be dropped
#       (previously it was only assigned inside the `if` branch), and
#   (b) the positions in `zero_vector_docs` are valid index labels for `drop`.
select_corpus_df = corpus_df.reset_index(drop=True)

if len(zero_vector_docs) > 0:
    print(f"Dropping documents with all-zero vectorization (no matching terms in top list): {zero_vector_docs}")
    select_corpus_df = select_corpus_df.drop(index=zero_vector_docs).reset_index(drop=True)
    # Re-fit without the dropped documents so the IDF weights reflect the kept rows only
    selected_features = vectorizer.fit_transform(select_corpus_df['document'])
else:
    print("No documents with all-zero vectorization found.")

# Dense feature frame (one column per selected term) plus the class label
selected_df = pd.DataFrame(selected_features.toarray(), columns=top_terms)
selected_df['label'] = select_corpus_df['label']

# Prepare X and y for training
X = selected_df.drop(columns='label')
y = selected_df['label']

# Stratified 70/30 split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=45)

# Check the balanced split
print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])
print("Class distribution in training set:", y_train.value_counts(normalize=True))
print("Class distribution in test set:", y_test.value_counts(normalize=True))

Dropping documents with all-zero vectorization (no matching terms in top list): [ 65 149]
Training set size: 124
Test set size: 54
Class distribution in training set: label
1    0.508065
0    0.491935
Name: proportion, dtype: float64
Class distribution in test set: label
1    0.5
0    0.5
Name: proportion, dtype: float64


### Train model

In [123]:
# Fit a logistic-regression classifier on the training split
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Hard class predictions for the held-out split (default 0.5 decision threshold)
y_pred = logreg.predict(X_test)

# Evaluate the model. Class 1 is this notebook's main genre (Business & Careers)
# and class 0 is everything else; the report previously carried labels for a
# different genre ("Science Fiction & Fantasy").
print(classification_report(y_test, y_pred, target_names=["Non-Business", "Business & Careers"]))

                           precision    recall  f1-score   support

      Non-Science Fiction       1.00      0.89      0.94        27
Science Fiction & Fantasy       0.90      1.00      0.95        27

                 accuracy                           0.94        54
                macro avg       0.95      0.94      0.94        54
             weighted avg       0.95      0.94      0.94        54



### Best Threshold

In [132]:
# Train the logistic regression model using the training set
final_model = LogisticRegression(max_iter=1000, random_state=42)
final_model.fit(X_train, y_train)

# Probability of class 1 for every test document
y_probs = final_model.predict_proba(X_test)[:, 1]

# Precision/recall at every candidate threshold
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)

# precision_recall_curve returns one more precision/recall value than it
# returns thresholds: the final (precision=1, recall=0) point has no threshold,
# so it is excluded to keep the argmax index valid for `thresholds`.
threshold_precisions = precisions[:-1]
threshold_recalls = recalls[:-1]

# F1 per threshold. Where precision + recall is 0 the plain formula yields
# NaN, and np.argmax would then pick that NaN entry; define F1 as 0 there.
pr_sum = threshold_precisions + threshold_recalls
f1_scores = np.divide(
    2 * threshold_precisions * threshold_recalls,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# Choose the threshold with the highest F1.
# NOTE(review): the threshold is tuned and then evaluated on the same test
# split, so the adjusted report below is an optimistic estimate.
best_threshold_index = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_index]
print("Best threshold:", best_threshold)

# Make predictions using the new threshold
y_pred_adj = (y_probs >= best_threshold).astype(int)

# Evaluate performance using the adjusted threshold. Class 1 is this
# notebook's main genre (Business & Careers); class 0 is everything else.
print("Adjusted Threshold Test Set Performance:")
print(classification_report(y_test, y_pred_adj, target_names=["Non-Business", "Business & Careers"]))

Best threshold: 0.5063867586882749
Adjusted Threshold Test Set Performance:
                           precision    recall  f1-score   support

      Non-Science Fiction       1.00      0.89      0.94        27
Science Fiction & Fantasy       0.90      1.00      0.95        27

                 accuracy                           0.94        54
                macro avg       0.95      0.94      0.94        54
             weighted avg       0.95      0.94      0.94        54



In [120]:
# Rank the selected terms by their logistic-regression coefficient.
# Positive coefficients push a document towards class 1, the main genre.
feature_importance = (
    pd.DataFrame({
        'Term': X.columns,
        'Coefficient': logreg.coef_[0],
    })
    .sort_values(by='Coefficient', ascending=False)
    .reset_index(drop=True)
)

# The heading previously named a different genre ("Science Fiction & Fantasy");
# this notebook models Business & Careers.
print("Top Predictive Terms for Business & Careers:")
print(feature_importance.head(20))

Top Predictive Terms for Science Fiction & Fantasy:
          Term  Coefficient
0   leadership     1.310379
1     business     1.283987
2        sales     0.982422
3    companies     0.876136
4   management     0.810843
5      leaders     0.800416
6      product     0.751723
7     creative     0.701459
8        learn     0.694222
9        topic     0.674340
10   marketing     0.653128
11     chapter     0.648998
12      number     0.648658
13     success     0.645157
14     purpose     0.598042
15     section     0.582516
16    customer     0.563541
17    examples     0.540824
18    industry     0.538454
19       value     0.525903


## Predict Probabilities

In [133]:
# Read the reserved test set once and keep that frame untouched, so that every
# row can be written back later even if some are dropped before scoring.
original_reserved_df = pd.read_csv('../data/corpus/reserved_test_pred.csv')

# Independent working copy that the following cells are free to filter
reserved_df = original_reserved_df.copy()

In [134]:
# Project the reserved documents onto the fitted vocabulary (transform only, no re-fit)
reserved_features = vectorizer.transform(reserved_df['document'])

# Row positions of documents that contain none of the selected terms
row_totals = reserved_features.toarray().sum(axis=1)
zero_vector_docs = np.where(row_totals == 0)[0]

# Remember which ASINs are about to be removed
removed_asins = reserved_df.iloc[zero_vector_docs]['asin'].tolist()

if len(zero_vector_docs) == 0:
    print("No documents with all-zero vectorization found.")
else:
    print(f"Removing {len(zero_vector_docs)} documents with all-zero vectorization.")
    # Drop the empty rows, renumber, and vectorize what is left
    reserved_df = reserved_df.drop(index=zero_vector_docs).reset_index(drop=True)
    reserved_features = vectorizer.transform(reserved_df['document'])

# Probability of class 1 (the main genre) from the final logistic-regression model
reserved_probs = final_model.predict_proba(reserved_features)[:, 1]

No documents with all-zero vectorization found.




In [135]:
# Attach the rounded class-1 probabilities to the scored documents
main_genre = "Business"
prob_column = f"{main_genre}_Prob"
reserved_df[prob_column] = reserved_probs.round(2)

# Write the scores back onto the untouched copy, matching rows by ASIN.
# Documents that were removed before scoring keep the None placeholder.
original_reserved_df[prob_column] = None
was_scored = original_reserved_df['asin'].isin(reserved_df['asin'])
original_reserved_df.loc[was_scored, prob_column] = reserved_df[prob_column].values

# Persist the result; this overwrites the file the reserved set was read from
original_reserved_df.to_csv('../data/corpus/reserved_test_pred.csv', index=False)