In [3]:
import sklearn
import pandas as pd
import nltk
import numpy as np
import spacy
import re
import cleanlab
from nltk.corpus import words, stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics
from tqdm.auto import tqdm
tqdm.pandas()  # Enable tqdm for pandas

In [12]:
#import data
df = pd.read_csv("rest_data_last.csv")
print(df.head())

  category                                               text
0     arts  rob delaney vir das galen hopper samson kayo g...
1     arts  andris nelsons conducts a joint concert of the...
2     arts  warner music group has brought on sherry tan t...
3     arts  adele will explore what she s been going throu...
4     arts  you are using an older browser version. please...


In [13]:
# Text Cleaning Step

import nltk
from nltk.corpus import words, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

nltk.download("punkt_tab")
nltk.download("words")
nltk.download("wordnet")
nltk.download("stopwords")

# Load resources
english_vocab = set(words.words())
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english")) | {"said", "year", "one", "also", "time", "like",
                                                "new", "people", "state", "june", "say", "get",
                                                "may", "many", "would", "day", "year", "two",
                                                "last", "three", "first", "home", "city", "jun",
                                                "could", "country", "county", "child", "care",
                                                "kayo", "guz"}

def lemmatize_text(text):
    # Step 1: Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Step 2: Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Step 3: Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Step 4: Convert to lowercase
    text = text.lower()
    # Step 5: Tokenize text into words
    tokens = word_tokenize(text)
    # Step 6: Remove non-alphabetical words
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # Step 7: Remove all words contained in stop words (set and custom)
    tokens = [word for word in tokens if word not in stop_words]
    # Step 8: Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Step 9: Remove words that are less than 3 characters
    tokens = [word for word in tokens if len(word) >= 3]
    # Step 10: Only use what's in english vocabulary
    tokens = [word for word in tokens if word in english_vocab]
    return " ".join(tokens)

df["clean_text"] = df["text"].apply(lemmatize_text)
# Remove "other" category from dataset
df = df.query("category != 'other'")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Code inherited from Karen N
# This code basically discovers falsely categorized rows and removes them
# Using cleanlab and CleanLearning

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["category_encoded"] = label_encoder.fit_transform(df["category"].values)

# Display the updated DataFrame to verify the new column
print(df.head())

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from cleanlab import Datalab
from cleanlab.internal.multilabel_utils import int2onehot, onehot2int
import matplotlib.pyplot as plt

text = df["clean_text"].tolist()

label_encoder = LabelEncoder()
# y = pd.Series(df["category_encoded"])
y = df["category_encoded"].values

# 2. TF-IDF vectorization
vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'(?u)\b\w\w+\b',
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
    sublinear_tf=True,
    norm='l2',
    smooth_idf=True
)
X = vectorizer.fit_transform(text)


# lab = Datalab(df, label_column='category', task='multilabel')
# lab.find_issues(pred_probs=pred_probs, features=features)
#
# lab.report()

from cleanlab.classification import CleanLearning
from sklearn.linear_model import LogisticRegression

# 4. Use CleanLearning

clf = LogisticRegression(max_iter=300)
cleanl = CleanLearning(clf=clf, cv_n_folds=3, seed=42)

cleanl.fit(X, y)

# 5. Identify potential label issues

ranked_indices = cleanl.find_label_issues(labels=y, X=X)

# ranked_indices = ranked_indices.flatten()  # or list(ranked_indices.ravel())
ranked_indices = ranked_indices.iloc[:, 0].to_numpy().flatten()

# 6. Inspect suspicious examples
df_suspects = df.loc[ranked_indices].copy()
print("Potential label issues:")
print(df_suspects)

# Create a new DataFrame without the suspect rows
df_clean = df.drop(index=df_suspects.index)

# Optional: Reset the index if needed
df_clean = df_clean.reset_index(drop=True)

print("Clean DataFrame:")
print(df_clean)

  category                                               text  \
0     arts  rob delaney vir das galen hopper samson kayo g...   
1     arts  andris nelsons conducts a joint concert of the...   
2     arts  warner music group has brought on sherry tan t...   
3     arts  adele will explore what she s been going throu...   
4     arts  you are using an older browser version. please...   

                                          clean_text  category_encoded  
0  rob hopper samson khan nick ross lee harry com...                 0  
1  nelson conduct joint concert boston symphony o...                 0  
2  warner music group brought sherry tan head mus...                 0  
3  explore going album set explore going album ol...                 0  
4  older browser version please use version best ...                 0  
Potential label issues:
     category                                               text  \
4        arts  you are using an older browser version. please...   
5        ar

In [15]:
# Feature Extraction
# (Did not perform as well as without it)
nltk.download("averaged_perceptron_tagger_eng")

def extract_pos(text):
  tokens = word_tokenize(text)
  pos_tags = nltk.pos_tag(tokens)
  return " ".join([word for word, tag in pos_tags if tag in ["NN", "NNS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "JJ", "RB", "IN"]])

df_clean["features"] = df_clean["clean_text"].apply(extract_pos)
print(df_clean)
df_clean.to_csv("data_features.csv", index=False)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


     category                                               text  \
0        arts  rob delaney vir das galen hopper samson kayo g...   
1        arts  andris nelsons conducts a joint concert of the...   
2        arts  warner music group has brought on sherry tan t...   
3        arts  adele will explore what she s been going throu...   
4        arts  performers include ellie diamond finalist in d...   
...       ...                                                ...   
3684  weather  orangeburg . this evening in orangeburg cloudy...   
3685  weather  national basketball association north american...   
3686  weather  las vegas ap the southwest continued to bake s...   
3687  weather  here is today s weather outlook for jun. 25 20...   
3688  weather  singapore about 10000 years ago sea levels in ...   

                                             clean_text  category_encoded  \
0     rob hopper samson khan nick ross lee harry com...                 0   
1     nelson conduct joint co

In [16]:
#assign columns to variables
X = df_clean["clean_text"]
Y = df_clean["category"]
print(Y.unique())

['arts' 'crime' 'disaster' 'economy' 'education' 'environmental' 'health'
 'humanInterest' 'labour' 'lifestyle' 'politics' 'religion' 'science'
 'social' 'sport' 'unrest' 'weather']


In [17]:
# Set up Vecotrizers

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.8, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [27]:
from ast import mod
#split data
x_train, x_test, y_train, y_test = train_test_split(X_tfidf, Y, test_size=0.2, random_state=42)

# Logistic Regression Model
log_model = LogisticRegression(C=0.5, max_iter=5000, class_weight="balanced", solver="saga")
log_model.fit(x_train, y_train)

# Support Vector Machine Model
svm_model = SVC(C=1.5, kernel="linear", class_weight="balanced", random_state=42)
svm_model.fit(x_train, y_train)

In [28]:
#test the model
y_pred_log = log_model.predict(x_test)
y_pred_svm = svm_model.predict(x_test)

In [29]:
#print test results
print("-" * 40)
print("Logistic Regression")
print("-" * 40)
print(metrics.classification_report(y_test, y_pred_log))
accuracy_log = log_model.score(x_test, y_test)
print("-" * 40)
print("LinearSVC")
print("-" * 40)
print(metrics.classification_report(y_test, y_pred_svm))
accuracy_rf = svm_model.score(x_test, y_test)
print("-" * 40)

----------------------------------------
Logistic Regression
----------------------------------------
               precision    recall  f1-score   support

         arts       0.68      0.65      0.67        52
        crime       0.80      0.84      0.82        61
     disaster       0.62      0.71      0.66        41
      economy       0.69      0.74      0.72        39
    education       0.89      0.87      0.88        54
environmental       0.78      0.82      0.80        44
       health       0.76      0.87      0.81        47
humanInterest       0.69      0.69      0.69        29
       labour       0.83      0.72      0.77        40
    lifestyle       0.67      0.79      0.72        43
     politics       0.78      0.60      0.68        48
     religion       0.57      0.48      0.52        33
      science       0.78      0.78      0.78        37
       social       0.73      0.42      0.53        38
        sport       0.76      0.88      0.81        50
       unrest    

In [None]:
#Test with string

import builtins

def predict_category(text):
    text_tfidf = tfidf_vectorizer.transform([text])
    pred = svm_model.predict(text_tfidf)[0]
    return pred

textin = builtins.input("Enter text: ")
print(predict_category(textin))


Enter text: we put in a text and it spits out a prediction
arts


In [None]:
# This code is unused
# Found falsely categorized rows based on each category's top 25 words
# Worked well, but not as well as cleanlab

checkWords = {
    'arts': [
        'music', 'show', 'event', 'series', 'park', 'group', 'best', 'take', 'year', 'star', 'back', 'life', 'million',
        'community', 'work', 'make', 'way', 'art', 'book', 'school', 'season', 'world', 'road', 'rock', 'know'
    ],
    'crime': [
        'police', 'court', 'right', 'law', 'case', 'officer', 'year', 'member', 'charge', 'crime', 'prison', 'old',
        'found', 'death', 'hearing', 'justice', 'security', 'federal', 'prosecutor', 'drug', 'group', 'told', 'report',
        'man', 'president'
    ],
    'disaster': [
        'fire', 'water', 'area', 'police', 'air', 'rescue', 'service', 'community', 'official', 'flight', 'year',
        'road', 'near', 'department', 'report', 'crew', 'help', 'car', 'according', 'ship', 'around', 'say', 'vehicle',
        'week', 'national'
    ],
    'economy': [
        'company', 'price', 'market', 'per', 'million', 'tax', 'store', 'product', 'share', 'year', 'business',
        'industry', 'billion', 'service', 'month', 'group', 'government', 'plan', 'rate', 'investment', 'cent',
        'report', 'since', 'statement', 'increase'
    ],
    'education': [
        'school', 'student', 'university', 'education', 'high', 'college', 'teacher', 'program', 'community',
        'learning', 'class', 'year', 'work', 'support', 'help', 'degree', 'need', 'member', 'board', 'government',
        'president', 'child', 'district', 'covid', 'service'
    ],
    'environmental': [
        'park', 'energy', 'water', 'waste', 'power', 'company', 'market', 'project', 'plant', 'group', 'area', 'north',
        'management', 'solar', 'plan', 'river', 'need', 'wind', 'industry', 'renewable', 'community', 'local', 'year',
        'system', 'world'
    ],
    'health': [
        'health', 'covid', 'patient', 'vaccine', 'hospital', 'market', 'case', 'medical', 'service', 'disease',
        'pandemic', 'week', 'food', 'company', 'help', 'system', 'month', 'report', 'public', 'need', 'virus',
        'global', 'work', 'well', 'government'
    ],
    'humanInterest': [
        'award', 'plant', 'question', 'team', 'size', 'winner', 'act', 'statement', 'back', 'winning', 'win', 'flower',
        'game', 'best', 'security', 'second', 'service', 'year', 'good', 'world', 'well', 'ownership', 'high',
        'beneficial', 'top'
    ],
    'labour': [
        'work', 'employee', 'worker', 'health', 'job', 'security', 'business', 'working', 'retirement', 'need',
        'pandemic', 'company', 'make', 'covid', 'plan', 'benefit', 'help', 'month', 'service', 'week', 'union',
        'social', 'market', 'office', 'employer'
    ],
    'lifestyle': [
        'product', 'travel', 'make', 'need', 'bridge', 'summer', 'life', 'beauty', 'business', 'customer', 'brand',
        'food', 'want', 'wedding', 'restaurant', 'way', 'month', 'take', 'work', 'service', 'back', 'million', 'come',
        'year', 'covid'
    ],
    'other': [
        'game', 'market', 'covid', 'school', 'government', 'week', 'team', 'rub', 'make', 'high', 'second', 'season',
        'report', 'company', 'business', 'pandemic', 'health', 'area', 'need', 'help', 'case', 'year', 'back',
        'service', 'contact'
    ],
    'politics': [
        'government', 'president', 'minister', 'security', 'data', 'public', 'service', 'right', 'official',
        'international', 'support', 'state', 'community', 'year', 'election', 'national', 'help', 'news', 'united',
        'plan', 'group', 'law', 'covid', 'house', 'pandemic'
    ],
    'religion': [
        'church', 'life', 'school', 'family', 'community', 'year', 'world', 'way', 'come', 'work', 'make', 'child',
        'say', 'covid', 'back', 'old', 'week', 'god', 'show', 'spiritual', 'right', 'see', 'government', 'well',
        'pandemic'
    ],
    'science': [
        'market', 'research', 'company', 'report', 'space', 'device', 'system', 'science', 'global', 'product', 'data',
        'technology', 'year', 'study', 'industry', 'work', 'make', 'medical', 'service', 'program', 'design',
        'business', 'restaurant', 'patient', 'life'
    ],
    'social': [
        'health', 'case', 'life', 'social', 'abortion', 'government', 'community', 'work', 'need', 'service', 'baby',
        'woman', 'covid', 'program', 'support', 'child', 'week', 'year', 'right', 'death', 'make', 'number', 'group',
        'month', 'family'
    ],
    'sport': [
        'team', 'game', 'season', 'match', 'run', 'player', 'back', 'second', 'world', 'final', 'win', 'coach', 'play',
        'league', 'championship', 'four', 'sport', 'tournament', 'best', 'right', 'open', 'round', 'event', 'fan',
        'year'
    ],
    'unrest': [
        'war', 'police', 'shooting', 'government', 'violence', 'force', 'attack', 'protest', 'say', 'gun', 'month',
        'official', 'group', 'military', 'weapon', 'year', 'place', 'president', 'part', 'right', 'come', 'old',
        'security', 'officer', 'world'
    ],
    'weather': [
        'climate', 'change', 'heat', 'weather', 'temperature', 'area', 'water', 'high', 'risk', 'carbon', 'emission',
        'degree', 'global', 'national', 'storm', 'service', 'warning', 'part', 'level', 'record', 'report', 'rain',
        'world', 'according', 'plan'
    ]
}


def check_word_match(row):
    category = row['category']
    words = set(row['text'].lower().split())  # Convert text to lowercase and split into words

    if category in checkWords:
        match_count = sum(1 for word in checkWords[category] if word in words)  # Count matches
        return match_count >= 3

    return False  # If category is not in the mapping, return False

# Apply function to create new column
df['matches'] = df.apply(check_word_match, axis=1)

def remove_false_entries(df):
  return df[df['matches'] == True]

df = remove_false_entries(df)

#This removes the other category completely
#df = df[df['category'] != 'other']

In [None]:
2#THIS CODE BLOCK COUNTS WORDS, USED TO FIND COMMON WORDS IN CATEGORIES
from collections import Counter
import pandas as pd

def count_common_words(df):
    category_word_counts = {}

    for category in df["category"].unique():
        # Get all text for this category
        category_text = " ".join(df[df["category"] == category]["clean_text"])

        # Tokenize and count word frequencies
        words = category_text.split()
        word_counts = Counter(words)

        # Store the word frequency dictionary
        category_word_counts[category] = dict(word_counts)

    return category_word_counts

# Get word counts per category
word_counts_by_category = count_common_words(df)

# Display the top 10 most common words for each category
for category, word_counts in word_counts_by_category.items():
    print(f"Category: {category}")
    print(dict(Counter(word_counts).most_common(10)))  # Show top 10 words
    print("-" * 40)

print(df)

Category: arts
{'show': 269, 'music': 250, 'event': 238, 'series': 205, 'park': 188, 'group': 163, 'take': 158, 'million': 158, 'community': 154, 'art': 151}
----------------------------------------
Category: crime
{'police': 553, 'court': 383, 'case': 255, 'law': 239, 'right': 222, 'officer': 222, 'year': 179, 'charge': 171, 'member': 170, 'prison': 170}
----------------------------------------
Category: disaster
{'fire': 590, 'water': 369, 'area': 225, 'police': 214, 'air': 202, 'rescue': 162, 'service': 157, 'community': 153, 'near': 136, 'official': 135}
----------------------------------------
Category: economy
{'company': 349, 'market': 245, 'price': 230, 'per': 220, 'million': 199, 'tax': 194, 'share': 163, 'store': 163, 'billion': 150, 'business': 147}
----------------------------------------
Category: education
{'student': 1319, 'school': 1294, 'university': 774, 'education': 521, 'high': 322, 'college': 302, 'teacher': 301, 'program': 299, 'community': 271, 'learning': 268}
-