In [1]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import spacy
import os
from helper_functions import load_data
from scipy.sparse import csr_matrix
nlp = spacy.load("en_core_web_lg")

def calculate_features(text_data):
    """Compute simple stylometric features for each text.

    Every text is parsed with the module-level spaCy pipeline ``nlp``. No
    tokens are filtered out, so every token in the parsed document counts
    as a "word".

    Args:
        text_data: Iterable of strings to analyse.

    Returns:
        A float ``numpy.ndarray`` of shape ``(n_texts, 4)``. The columns are,
        in order: sentence count, token count, average token length, and
        average number of tokens per sentence. Empty input yields an array
        of shape ``(0, 4)``.
    """
    rows = []

    for text in text_data:
        doc = nlp(text)

        sentence_count = sum(1 for _ in doc.sents)
        word_count = len(doc)

        # A text with no tokens also has no sentences; report 0.0 for both
        # averages instead of raising ZeroDivisionError.
        if word_count:
            average_word_length = sum(len(token) for token in doc) / word_count
        else:
            average_word_length = 0.0
        if sentence_count:
            average_sentence_length = word_count / sentence_count
        else:
            average_sentence_length = 0.0

        rows.append((
            sentence_count,
            word_count,
            average_word_length,
            average_sentence_length,
        ))

    # One row per text, one column per feature. Concatenating the four 1-D
    # lists along axis=1 is invalid (1-D arrays have no axis 1); the reshape
    # also keeps the result 2-D when there are no texts at all.
    return np.asarray(rows, dtype=float).reshape(-1, 4)

# Load the raw texts and their labels.
text_data, labels = load_data('data')

# Hand-crafted per-text features (dense and sparse forms).
# NOTE(review): the TF-IDF pipeline below does not consume these yet; they
# appear to be intended for a later combination step - confirm.
text_data_features = calculate_features(text_data)
text_data_features_sparse = csr_matrix(text_data_features)

# Split the raw texts, not the numeric feature matrix: the pipeline's first
# step is a TfidfVectorizer, which must be fed strings.
text_data_train, text_data_test, labels_train, labels_test = train_test_split(
    text_data, labels, test_size=0.2, random_state=42
)

# Model: TF-IDF vectorisation, reduction to 2 SVD components, then a
# random forest classifier on the reduced representation.
pipeline = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer()),
        ('svd', TruncatedSVD(n_components=2)),
        ('classifier', RandomForestClassifier()),
    ]
)

# Fit on the training split only.
pipeline.fit(text_data_train, labels_train)

# Score the held-out split and report it.
score = pipeline.score(text_data_test, labels_test)
print("Test accuracy:", score)

# Hard class predictions for the held-out split, kept for later reporting.
predictions = pipeline.predict(text_data_test)

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
# Per-class precision / recall / F1 from the hard predictions.
print(classification_report(labels_test, predictions))

# ROC-AUC needs a continuous score to rank samples; hard class predictions
# collapse the curve to a single threshold. Use the predicted probability of
# the second class in the fitted classifier's class order (binary task assumed,
# as the original 1-D score call already required).
positive_class_scores = pipeline.predict_proba(text_data_test)[:, 1]
print("ROC-AUC:", roc_auc_score(labels_test, positive_class_scores))

# Pull the fitted forest out of the pipeline and chart its importances.
# The forest is trained on the SVD output, so each bar corresponds to one
# SVD component rather than to an original vocabulary term.
classifier = pipeline.named_steps['classifier']
importances = classifier.feature_importances_

bar_positions = range(len(importances))
plt.bar(bar_positions, importances)
plt.show()


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Processing file: fitzgerald_all_the_sad.txt
Processing file: fitzgerald_beautiful_and_damned.txt
Processing file: fitzgerald_flappers_and_philosophers.txt
Processing file: fitzgerald_gatsby.txt
Processing file: fitzgerald_tales_jazz_age.txt
Processing file: fitzgerald_tender_is.txt
Processing file: fitzgerald_the_vegtable.txt
Processing file: fitzgerald_this_side.txt
Processing file: hemingway_across_the_river.txt
Processing file: hemingway_bell_tolls.txt
Processing file: hemingway_farewell.txt
Processing file: hemingway_green_hills_africa.txt
Processing file: hemingway_in_our_time.txt
Processing file: hemingway_men_without_women.txt
Processing file: hemingway_old_man.txt
Processing file: hemingway_sun_also.txt
Processing file: hemingway_three_stories_ten_poems.txt
Processing file: hemingway_winner_take_nothing.txt
Processing file: fitzgerald_all_the_sad.txt
Processing file: fitzgerald_beautiful_and_damned.txt
Processing file: fitzgerald_flappers_and_philosophers.txt
Processing file: f

MemoryError: Unable to allocate 81.2 MiB for an array with shape (221816, 96) and data type float32