In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
from helper_functions import load_data
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import spacy
# Load the large English spaCy pipeline once at module level; count_direct_objects
# below calls this shared `nlp` object for every text it parses.
nlp = spacy.load("en_core_web_lg")

def count_chars(text):
    """Return the length of `text` in characters (whitespace included)."""
    n_chars = len(text)
    return n_chars

def count_words(text):
    """Return how many whitespace-separated tokens `text` contains."""
    tokens = text.split()
    return len(tokens)

def count_capital_words(text):
    """Return the number of whitespace-separated tokens for which str.isupper() is true."""
    return sum(1 for token in text.split() if token.isupper())

def count_sent(text):
    """Return the number of sentences NLTK's sentence tokenizer finds in `text`."""
    sentences = nltk.sent_tokenize(text)
    return len(sentences)

def count_unique_words(text):
    """Return the number of distinct whitespace-separated tokens (case-sensitive)."""
    distinct_tokens = {token for token in text.split()}
    return len(distinct_tokens)

# Lazily-filled cache of the English stopword list, so the corpus is read once
# instead of on every count_stopwords call.
_ENGLISH_STOPWORDS = None


def count_stopwords(text):
    """Count the English stopwords in `text`.

    The text is tokenized with NLTK's word_tokenize and each token is compared
    case-insensitively against NLTK's English stopword list.

    Args:
        text: The string to analyse.

    Returns:
        The number of tokens that are English stopwords.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    # The NLTK list is lowercase, so tokens are lowercased before the lookup;
    # otherwise sentence-initial words ("The", "He") and "I" are never counted.
    return sum(1 for token in word_tokenize(text) if token.lower() in _ENGLISH_STOPWORDS)

def preprocess(sent):
    """Normalise a piece of text: lower-case it, drop digits, collapse whitespace.

    Args:
        sent: The string to normalise.

    Returns:
        The lower-cased string with every run of ASCII digits removed and every
        run of whitespace replaced by a single space. Leading/trailing spaces
        are not stripped.
    """
    sent = sent.lower()
    # Digits are removed before whitespace is collapsed: deleting a number that
    # sits between two spaces would otherwise leave a double space behind.
    sent = re.sub(r'[0-9]+', '', sent)
    # Raw string avoids the invalid-escape warning that '\s' triggers.
    sent = re.sub(r'\s+', ' ', sent)
    return sent

def count_direct_objects(text):
    """Count the noun chunks in `text` whose root token has the dependency label 'dobj'.

    The text is parsed with the module-level spaCy pipeline `nlp`. Two progress
    messages are printed on every call.
    """
    print("Processing ...")
    parsed = nlp(text)
    print("POS tagging and counting direct objects ...")
    dobj_chunks = [chunk for chunk in parsed.noun_chunks if chunk.root.dep_ == 'dobj']
    return len(dobj_chunks)

# Load the data: one row per text sample with its label
text_data, labels = load_data('data')
df = pd.DataFrame(list(zip(text_data, labels)), columns=['text_data', 'label'])

# Split the data into training and test sets.
# A fixed random_state makes the split (and every metric derived from it)
# reproducible across Restart & Run All.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Feature columns are added to both frames further down; take explicit copies
# so those assignments write to independent frames instead of to subsets of
# `df` (which can trigger pandas' SettingWithCopyWarning).
train_df = train_df.copy()
test_df = test_df.copy()

# Engineered features, keyed by the column name the model code expects.
feature_extractors = {
    'feat_0': count_chars,           # number of characters
    'feat_1': count_words,           # number of whitespace-separated words
    'feat_2': count_capital_words,   # number of all-uppercase words
    'feat_3': count_sent,            # number of sentences
    'feat_4': count_unique_words,    # number of distinct words
    'feat_5': count_stopwords,       # number of stopwords
    'feat_6': count_direct_objects,  # number of direct objects
}

# Add every engineered column to the training frame first, then to the test frame.
for frame in (train_df, test_df):
    for column, extractor in feature_extractors.items():
        frame[column] = frame['text_data'].apply(extractor)

# Learn the TF-IDF vocabulary and weights from the training texts only, then
# reuse the fitted vectorizer on the test texts so both share the same columns.
vectorizer = TfidfVectorizer()
train_tf_idf = vectorizer.fit_transform(train_df['text_data']).toarray()
test_tf_idf = vectorizer.transform(test_df['text_data']).toarray()

# Final design matrices: TF-IDF columns first, engineered columns appended last.
engineered_features = ['feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6']
train_X = np.hstack([train_tf_idf, train_df[engineered_features].to_numpy()])
test_X = np.hstack([test_tf_idf, test_df[engineered_features].to_numpy()])

# Extract the labels for the training and test sets
train_Y = train_df['label']
test_Y = test_df['label']

# Train a random forest classifier.
# random_state pins the bootstrap sampling and feature sub-sampling so the
# fitted model, its predictions and its feature importances are reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_X, train_Y)

# Predict the labels for the test set
pred_Y = rf.predict(test_X)

# Print the classification report and confusion matrix
print(classification_report(test_Y, pred_Y))
print(confusion_matrix(test_Y, pred_Y))

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Processing file: fitzgerald_all_the_sad.txt
Processing file: fitzgerald_bablyon_revisited.txt
Processing file: fitzgerald_beautiful_and_damned.txt
Processing file: fitzgerald_flappers_and_philosophers.txt
Processing file: fitzgerald_gatsby.txt
Processing file: fitzgerald_tales_jazz_age.txt
Processing file: fitzgerald_tender_is.txt
Processing file: fitzgerald_the_vegtable.txt
Processing file: fitzgerald_this_side.txt
Processing file: hemingway_across_the_river.txt
Processing file: hemingway_bell_tolls.txt
Processing file: hemingway_farewell.txt
Processing file: hemingway_green_hills_africa.txt
Processing file: hemingway_in_our_time.txt
Processing file: hemingway_men_without_women.txt
Processing file: hemingway_old_man.txt
Processing file: hemingway_sun_also.txt
Processing file: hemingway_three_stories_ten_poems.txt
Processing file: hemingway_winner_take_nothing.txt


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of true vs. predicted labels on the test set
cm = confusion_matrix(test_Y, pred_Y)

# Tick labels for both axes.
# NOTE(review): assumes the label order produced by confusion_matrix is
# fitzgerald, hemingway — confirm against the labels returned by load_data.
classes = ['fitzgerald', 'hemingway']

# Draw the annotated heatmap and label it through the returned Axes
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
importances = rf.feature_importances_

# Column indices ordered from most to least important
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

# Iterate over every ranked index. The model was trained on the TF-IDF columns
# plus the engineered columns, so bounding the loop by the TF-IDF width alone
# would drop the final entries of the ranking.
for rank, feature_idx in enumerate(indices, start=1):
    print(f"{rank}. feature {feature_idx} ({importances[feature_idx]})")

In [None]:
def get_top_n_features(feature_importances, feature_names, n):
    """Return the `n` highest-scoring (name, importance) pairs.

    Names and scores are paired positionally (pairing stops at the shorter
    input), ordered by score from high to low with ties kept in input order,
    and the first `n` pairs are returned as a list of tuples.
    """
    ranked = sorted(
        zip(feature_names, feature_importances),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# The model's columns are the TF-IDF terms followed by the engineered features,
# so the name list must cover both, in that order. Passing only the vectorizer
# names would silently exclude the engineered features from the ranking.
all_feature_names = list(vectorizer.get_feature_names_out()) + list(engineered_features)
top_features = get_top_n_features(rf.feature_importances_, all_feature_names, n=25)
print(top_features)

In [None]:
feature_names = vectorizer.get_feature_names_out()

# Map each TF-IDF column index to the term it represents
feature_mapping = dict(enumerate(feature_names))

# Print out the feature mapping
print(feature_mapping)

In [None]:
feature_importances = rf.feature_importances_
desired_features = ['feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6']

# The engineered columns were appended AFTER the TF-IDF columns when the
# training matrix was built, so their importances are the trailing entries of
# the array. Pairing the names with the leading entries would report the
# importances of the first few TF-IDF terms instead.
engineered_importances = feature_importances[-len(desired_features):]
feature_importance_dict = dict(zip(desired_features, engineered_importances))

# Print the feature importances
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Column 1 of predict_proba holds the probability of rf.classes_[1]; treat that
# class as the positive one throughout.
positive_class = rf.classes_[1]
probs = rf.predict_proba(test_X)[:, 1]

# calculate ROC curve
# pos_label is passed explicitly so this also works when the labels are not
# 0/1 or -1/1 (e.g. author-name strings), where roc_curve cannot infer it.
fpr, tpr, thresholds = roc_curve(test_Y, probs, pos_label=positive_class)

# calculate AUC score
auc = roc_auc_score(test_Y, probs)

# plot ROC curve
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (AUC = {:.3f})'.format(auc))
plt.show()