# In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from helper_functions import load_data

# Text-classification workflow: TF-IDF -> 2-component truncated SVD ->
# random forest, evaluated on a held-out 20% test split.

# Load the text data from multiple files.
text_data_list, labels = load_data('data')

# Combine the per-file text arrays into a single flat array of documents.
# NOTE(review): assumes `labels` is already aligned one-to-one with the
# concatenated documents -- confirm against `load_data`.
text_data = np.concatenate(text_data_list)

# Create the pipeline. The vectorizer is the first step, so the pipeline
# must always be fed RAW text, never an already-vectorized matrix.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('svd', TruncatedSVD(n_components=2)),
    ('classifier', RandomForestClassifier())
])

# Split the RAW documents into training and test sets *before* any fitting,
# so the vocabulary/IDF weights and the SVD basis are learned from the
# training portion only (no leakage from the test set).
text_data_train, text_data_test, labels_train, labels_test = train_test_split(
    text_data, labels, test_size=0.2, random_state=42
)

# Fit the whole pipeline on the training data and predict the test labels.
pipeline.fit(text_data_train, labels_train)
predictions = pipeline.predict(text_data_test)

# Retrieve the vocabulary learned by the fitted TfidfVectorizer.
vectorizer = pipeline.named_steps['vectorizer']
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out()
else:
    # Older scikit-learn releases only provide the legacy accessor.
    features = vectorizer.get_feature_names()

# TF-IDF matrix of the full corpus (kept for inspection); its column count
# is the vocabulary size.
text_data_transformed = vectorizer.transform(text_data)
num_features = text_data_transformed.shape[1]

# Print the classification report and ROC-AUC score.
print(classification_report(labels_test, predictions))

# ROC-AUC must be computed from scores/probabilities, not hard predictions.
classifier = pipeline.named_steps['classifier']
probabilities = pipeline.predict_proba(text_data_test)
if probabilities.shape[1] == 2:
    # Binary case: score is the probability of the greater class label,
    # which is the second column of predict_proba (ordered by classes_).
    roc_auc = roc_auc_score(labels_test, probabilities[:, 1])
else:
    # Multiclass case: one-vs-rest averaging over the per-class
    # probabilities, with the column order given explicitly.
    roc_auc = roc_auc_score(
        labels_test,
        probabilities,
        multi_class='ovr',
        labels=classifier.classes_,
    )
print("ROC-AUC:", roc_auc)

# Plot the feature importances. The classifier sees the SVD output, so each
# bar corresponds to one SVD component, not to a vocabulary term.
importances = classifier.feature_importances_
plt.bar(range(importances.shape[0]), importances)
plt.xlabel("feature index")
plt.ylabel("feature importance")
plt.show()
