In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from imblearn.pipeline import Pipeline as IMBPipeline
from imblearn.over_sampling import SMOTE

# Load data
data = pd.read_excel(r"C:\Users\HEEMA SAMEERA\OneDrive\Desktop\newsarticleswithcategories.xlsx")

# Preprocessing function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    tokens = text.split()  # Tokenization
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]  # Remove stop words and lemmatize
    return ' '.join(tokens)

data['text'] = data['text'].apply(preprocess)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_features = vectorizer.fit_transform(data['text'])

# Sentiment analysis using NLTK VADER
analyzer = SentimentIntensityAnalyzer()

def sentiment_analysis(text):
    sentiment_scores = analyzer.polarity_scores(text)
    return sentiment_scores

# Apply sentiment analysis
data['sentiment'] = data['text'].apply(sentiment_analysis)

# Convert sentiment scores to separate columns
data = data.join(pd.json_normalize(data['sentiment']))

# Drop the original 'sentiment' column
data = data.drop(columns=['sentiment'])

# Combine TF-IDF features with sentiment features
sentiment_features = data[['neg', 'neu', 'pos', 'compound']].values
X = np.hstack((tfidf_features.toarray(), sentiment_features))
y = data['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define pipeline
pipeline = IMBPipeline([
    ('sampling', SMOTE(random_state=42)),  # Oversampling using SMOTE
    ('classifier', RandomForestClassifier(random_state=42))  # Random Forest classifier
])

# Define parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

# Perform Randomized Search for hyperparameter tuning
random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=20, cv=5, scoring='f1_weighted', random_state=42)
random_search.fit(X_train, y_train)

# Print best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

# Generate the classification report
y_pred = random_search.predict(X_test)
print(classification_report(y_test, y_pred))

# Sample text to test
sample_text = "Religion is a basic good for all human beings everywhere, therefore religious freedom is a universal human right. It is neither unfair nor parochial, but a requirement of justice."

# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([[sample_sentiment['neg'], sample_sentiment['neu'], sample_sentiment['pos'], sample_sentiment['compound']]])

# Combine TF-IDF features with sentiment features for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features))

# Predict the label of the sample text
predicted_label = random_search.predict(sample_features)[0]
print("Predicted Label:", predicted_label)


Best Parameters: {'classifier__n_estimators': 200, 'classifier__min_samples_split': 2, 'classifier__max_depth': 30}
Best Score: 0.8784806994743569
                     precision    recall  f1-score   support

ANTI GOVERNMENT ACT       0.86      0.86      0.86         7
              ARSON       1.00      1.00      1.00         7
 COMMUNAL/RELIGIOUS       0.50      0.57      0.53         7
        CYBER CRIME       0.67      0.67      0.67         3
              JAILS       1.00      0.67      0.80         6
              MAFIA       1.00      0.75      0.86         4
             MURDER       0.60      0.75      0.67         4
               NDPS       0.83      1.00      0.91         5
               RAPE       0.67      1.00      0.80         2
          TERRORISM       1.00      0.75      0.86         8
      THEFT/ROBBERY       0.67      1.00      0.80         2

           accuracy                           0.80        55
          macro avg       0.80      0.82      0.80        

In [3]:
from sklearn.metrics import confusion_matrix

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[6 0 1 0 0 0 0 0 0 0 0]
 [0 7 0 0 0 0 0 0 0 0 0]
 [1 0 4 1 0 0 1 0 0 0 0]
 [0 0 1 2 0 0 0 0 0 0 0]
 [0 0 0 0 4 0 1 0 1 0 0]
 [0 0 0 0 0 3 0 1 0 0 0]
 [0 0 0 0 0 0 3 0 0 0 1]
 [0 0 0 0 0 0 0 5 0 0 0]
 [0 0 0 0 0 0 0 0 2 0 0]
 [0 0 2 0 0 0 0 0 0 6 0]
 [0 0 0 0 0 0 0 0 0 0 2]]


In [4]:
# Sample text to test
sample_text = "Cyber attack everywhere."
# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([[sample_sentiment['neg'], sample_sentiment['neu'], sample_sentiment['pos'], sample_sentiment['compound']]])

# Combine TF-IDF features with sentiment features for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features))

# Predict the label of the sample text
predicted_label = random_search.predict(sample_features)[0]
print("Predicted Label:", predicted_label)

Predicted Label: CYBER CRIME


In [5]:
# Sample text to test
sample_text = "18 year old gang raped in hyd "
# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([[sample_sentiment['neg'], sample_sentiment['neu'], sample_sentiment['pos'], sample_sentiment['compound']]])

# Combine TF-IDF features with sentiment features for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features))

# Predict the label of the sample text
predicted_label = random_search.predict(sample_features)[0]
print("Predicted Label:", predicted_label)

Predicted Label: RAPE


In [6]:
# Sample text to test
sample_text = "knife kills uncle "
# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([[sample_sentiment['neg'], sample_sentiment['neu'], sample_sentiment['pos'], sample_sentiment['compound']]])

# Combine TF-IDF features with sentiment features for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features))

# Predict the label of the sample text
predicted_label = random_search.predict(sample_features)[0]
print("Predicted Label:", predicted_label)

Predicted Label: MURDER


In [7]:
# Sample text to test
sample_text = "drugs found in the hitech city tunnel along with 12 other weapons "
# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([[sample_sentiment['neg'], sample_sentiment['neu'], sample_sentiment['pos'], sample_sentiment['compound']]])

# Combine TF-IDF features with sentiment features for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features))

# Predict the label of the sample text
predicted_label = random_search.predict(sample_features)[0]
print("Predicted Label:", predicted_label)

Predicted Label: NDPS


In [8]:
# Sample text to test
sample_text = """
An Israeli strike on a school sheltering displaced Palestinians in central Gaza killed at least 33 people on Thursday. Following the attack, the Israel Defense Forces said Hamas terrorists were operating from within the school. Meanwhile, Qatar Foreign Ministry has said that Hamas has not yet handed mediators its response to the latest ceasefire proposal and is still studying it. Amid the ongoing war, Israeli Prime Minister Benjamin Netanyahu is set to address a joint meeting of Congress on July 24, The Associated Press reported, citing two people familiar with the matter. The war began after Hamas attacked Israel on October 7, killing around 1,200 people and capturing more than 250 hostages.At least 33 people were killed after Israel hit a Gaza school on Thursday. Video footage showed Palestinians hauling away bodies and scores of injured in a hospital after the attack, which took place at a sensitive moment in mediated talks on a ceasefire that would involve releasing hostages held by Hamas and some of the Palestinians held in Israeli jails. Following the strike, the Israeli military said the central Gaza compound was being used by terrorists. Addressing a press conference, an IDF spokesperson said those targeted were members of Hamas's elite Nukhba force and of the Palestinian Islamic Jihad terror group who "directed terror attacks from the area of the school while exploiting it as a civilian location and as a shelter. The terrorists inside this school were planning more attacks against Israelis, some of them imminent." Meanwhile, the US State Department has said the country has been in contact with Israel about the strike at Gaza school. State Department spokesperson Matthew Miller also said that Washington expects Israel to be fully transparent in making information about the strike public. On Thursday, Qatari foreign ministry spokesperson Majed Al-Ansari said Hamas has not yet handed mediators its response to the latest ceasefire proposal and is still studying it, adding that Qatari, Egyptian and US mediators were still making efforts. Talks began on Wednesday when Central Intelligence Agency (CIA) director William Burns met senior officials from Qatar and Egypt in Doha to discuss a proposal that US President Joe Biden publicly endorsed last week. Russia and China, which hold veto powers in the United Nations Security Council, raised concerns on Thursday with a US draft resolution that would back a proposal - outlined by President Joe Biden - for a ceasefire between Israel and Hamas. The council's only Arab member, Algeria, also signalled it was not ready to back the text, diplomats said. A resolution needs at least nine votes in favour and no vetoes by the US, France, Britain, China or Russia to pass. Benjamin Netanyahu is set to address a joint meeting of Congress on July 24, The Associated Press quoted two people familiar with the matter as saying. Last week, Congressional leaders formally invited Netanyahu to come speak. However, the date of the speech had been in flux. According to news agency Reuters, Netanyahu, over his upcoming speech, said he was "very moved to have the privilege of representing Israel before both Houses of Congress and to present the truth about our just war." On Thursday, Israeli forces killed three Palestinians and injured at least 13 others in a raid on the occupied West Bank city of Jenin, Reuters quoted the Palestinian Health Ministry and medics as saying. The Palestine Red Crescent Society said it was treating at least six people who
"""
# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([[sample_sentiment['neg'], sample_sentiment['neu'], sample_sentiment['pos'], sample_sentiment['compound']]])

# Combine TF-IDF features with sentiment features for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features))

# Predict the label of the sample text
predicted_label = random_search.predict(sample_features)[0]
print("Predicted Label:", predicted_label)

Predicted Label: TERRORISM


In [9]:
# Sample text to test
sample_text = """
A group of hackers launched an attack on the government's website yesterday. The attack caused significant disruption to online services, leading to concerns about the security of sensitive information.
"""
# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([[sample_sentiment['neg'], sample_sentiment['neu'], sample_sentiment['pos'], sample_sentiment['compound']]])

# Combine TF-IDF features with sentiment features for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features))

# Predict the label of the sample text
predicted_label = random_search.predict(sample_features)[0]
print("Predicted Label:", predicted_label)

Predicted Label: CYBER CRIME


In [10]:
# Sample text to test
sample_text = '''Gaza, June 7 (UNI) The Palestinian mayor of central Gaza's Nuseirat refugee camp, Iyad al-Mughari, was killed in an Israeli airstrike on Thursday, Palestinian medical and security sources said. Palestinian security sources told Xinhua that the mayor was killed, along with a number of his family members, as an Israeli attack targeted a building in the camp. Medical sources said al-Mughari's body was transferred to al-Aqsa Hospital in Deir al-Balah city in central Gaza. Al-Mughari, one of the cadres of the Hamas movement, was appointed mayor by acclamation, the sources noted. The killing of al-Mughari came hours after the killing of about 35 Palestinians in an Israeli attack on a school affiliated with the United Nations Relief and Works Agency for Palestine Refugees in the Near East, which was housing displaced people in the Nuseirat camp. Israel said Hamas and Islamic Jihad "terrorists" were embedded themselves inside the school, and a number of steps had been taken to reduce the risk of harming uninvolved civilians. The Israeli army has been conducting a large-scale offensive on Gaza since Oct. 7, 2023, after Hamas carried out an unprecedented attack on the Israeli towns adjacent to the strip, during which approximately 1,200 people were killed and about 250 others were taken hostage. The Palestinian death toll from the ongoing Israeli attacks in the enclave has risen to 36,654, with 83,309 people injured, updated the Gaza health authorities on Thursday. UNI XINHUA GNK'''
# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([[sample_sentiment['neg'], sample_sentiment['neu'], sample_sentiment['pos'], sample_sentiment['compound']]])

# Combine TF-IDF features with sentiment features for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features))

# Predict the label of the sample text
predicted_label = random_search.predict(sample_features)[0]
print("Predicted Label:", predicted_label)

Predicted Label: TERRORISM
