In [66]:
import pandas as pd

df = pd.read_csv('rest_data.csv')
df.head()


Unnamed: 0,category,text
0,arts,you are using an older browser version. please...
1,arts,on 31 march two of classical music s most acco...
2,arts,bpt after a year of being locked away at home ...
3,arts,pilot uninjured plane hit sandbar while landin...
4,arts,colleen distin photo by facebook toronto sun ....


In [67]:
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer, word_tokenize
import re
# Download NLTK resources (run once)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Cleans and preprocesses the input text.
    """
    # Step 1: Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Step 2: Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Step 3: Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Step 4: Convert to lowercase
    text = text.lower()
    
    # Step 5: Tokenize text into words
    tokens = word_tokenize(text)
    
    # Step 6: Remove stopwords and non-English words
    tokens = [word for word in tokens if word not in stop_words and len(word) >=3]
    
    # Step 7: Lemmatize words
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Step 8: Join tokens back into a single string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

# Example usage
df = pd.read_csv("rest_data.csv")
df["cleaned_text"] = df["text"].apply(clean_text)

df.head()
    

    

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karennurlybekov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/karennurlybekov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/karennurlybekov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,category,text,cleaned_text
0,arts,you are using an older browser version. please...,using older browser version please use support...
1,arts,on 31 march two of classical music s most acco...,march two classical music accomplished well kn...
2,arts,bpt after a year of being locked away at home ...,bpt year locked away home world eager reopen e...
3,arts,pilot uninjured plane hit sandbar while landin...,pilot uninjured plane hit sandbar landing floa...
4,arts,colleen distin photo by facebook toronto sun ....,colleen distin photo facebook toronto sun lost...


In [68]:
import spacy
from nltk import pos_tag
from collections import Counter
import nltk
from tqdm import tqdm
tqdm.pandas()

# Load SpaCy model (disable unused components for speed)
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

pos_tags = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON']

# Extract POS features
features = []
for text in df["cleaned_text"]:
    doc = nlp(text)
    total_tokens = len(doc)
    pos_counts = {tag: 0 for tag in pos_tags}
    for token in doc:
        pos = token.pos_
        if pos in pos_counts:
            pos_counts[pos] += 1
    if total_tokens > 0:
        for tag in pos_counts:
            pos_counts[tag] /= total_tokens
    features.append(pos_counts)

In [71]:
from scipy.sparse import csr_matrix
from numpy import hstack
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


# Encode labels
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["category"])
df = df.dropna(subset=['label_encoded'])
y = df["label_encoded"]


# Extract POS features (already a DataFrame)
X_pos = pd.DataFrame(features)
X_pos_sparse = csr_matrix(X_pos.values)  # Convert to sparse matrix

# TF-IDF features
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df["cleaned_text"])

# # Word embeddings (convert to sparse)
# nlp = spacy.load("en_core_web_lg", disable=["parser", "ner"])
# X_emb = np.array([nlp(text).vector for text in df["cleaned_text"]])
# X_emb_sparse = csr_matrix(X_emb)  # Convert to sparse

# # Combine all features
# X_combined = hstack([X_tfidf, X_pos_sparse,]) #X_emb_sparse

import scipy
X_combined = scipy.sparse.hstack([X_pos, X_tfidf])


In [None]:
from sklearn.utils import parallel_backend
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid
import numpy as np
from tqdm import tqdm


class tqdm_joblib:
    def __init__(self, total=None):
        self.pbar = None
        self.total = total
        
    def __enter__(self):
        self.pbar = tqdm(total=self.total)
        return self
    
    def __exit__(self, *args):
        self.pbar.close()
        
    def update(self, _):
        self.pbar.update(1)

# ---------------------------------------------------
# 2. Modified Grid Search
# ---------------------------------------------------
# Load your data here
# X_combined = ...
# y = ...

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42
)

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5],
    "class_weight": ["balanced", None]
}

# Calculate total combinations
n_combinations = len(ParameterGrid(param_grid))
total_fits = n_combinations * 5  # cv=5

# Run grid search with progress bar
with tqdm_joblib(total=total_fits) as progress:
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=5,
        verbose=0,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

# ---------------------------------------------------
# 3. Evaluation with Threshold Filtering
# ---------------------------------------------------
best_clf = grid_search.best_estimator_

# Predict probabilities
probs = best_clf.predict_proba(X_test)
max_probs = np.max(probs, axis=1)
pred_labels = best_clf.classes_[np.argmax(probs, axis=1)]

# Set a confidence threshold
confidence_threshold = 0.5  # Adjust as needed

# Replace low-confidence predictions with "Unknown"
final_preds = [
    "Unknown" if max_probs[i] < confidence_threshold else pred_labels[i]
    for i in range(len(pred_labels))
]

# Convert y_test to a NumPy array to avoid index issues
y_test = y_test.values

# Evaluate ignoring "Unknown" predictions
valid_idx = [i for i in range(len(final_preds)) if final_preds[i] != "Unknown"]
filtered_y_test = y_test[valid_idx]  # Direct slicing
filtered_preds = np.array(final_preds)[valid_idx]

# Print overall accuracy and filtered accuracy
print(f"Original Accuracy: {accuracy_score(y_test, pred_labels):.2f}")
if valid_idx:
    print(f"Filtered Accuracy (excluding 'Unknown' cases): {accuracy_score(filtered_y_test, filtered_preds):.2f}")
else:
    print("All predictions were below the threshold, no valid predictions for evaluation.")

  0%|          | 0/180 [00:00<?, ?it/s]

In [26]:
from sklearn.dummy import DummyClassifier
dummy = DummyClassifier(strategy="stratified").fit(X_train, y_train)
print("Baseline accuracy:", dummy.score(X_test, y_test))

Baseline accuracy: 0.04627006610009443


ValueError: y must have at least two dimensions for multi-output regression but has only one.