In [2]:
# enter your python code here
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load data
data = pd.read_excel(r"C:\Users\HEEMA SAMEERA\OneDrive\Desktop\newsarticleswithcategories.xlsx")

# Preprocessing function
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    tokens = text.split()  # Tokenization
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]  # Remove stop words and lemmatize
    return ' '.join(tokens)

data['text'] = data['text'].apply(preprocess)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(data['text'])

# Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()

def sentiment_analysis(text):
    sentiment_scores = analyzer.polarity_scores(text)
    return [sentiment_scores['neg'], sentiment_scores['neu'], sentiment_scores['pos'], sentiment_scores['compound']]

data['sentiment'] = data['text'].apply(sentiment_analysis)

# Define custom patterns for each category
patterns = {
    'ANTI_GOVERNMENT_ACT': r'anti-government|protest|demonstration|riot|rebellion|sedition',
    'ARSON': r'arson|fire|burn',
    'COMMUNAL_RELIGIOUS': r'communal|religious|sectarian|sectarianism',
    'CYBER_CRIME': r'cybercrime|cyber crime|online crime|hacking|phishing',
    'JAILS': r'jail|prison|detention center',
    'MAFIA': r'mafia|gang|organized crime|mob',
    'MURDER': r'murder|kill|homicide|killed',
    'NDPS': r'NDPS|narcotic|drug|psychoactive substance',
    'RAPE': r'rape|sexual assault|sexual violence',
    'TERRORISM': r'terrorism|terrorist|bombing|attack',
    'THEFT_ROBBERY': r'theft|robbery|burglary|steal'
}

# Function to identify categories based on custom patterns
def identify_category(text):
    categories = []
    for category, pattern in patterns.items():
        if re.search(pattern, text, re.IGNORECASE):
            categories.append(category)
    return categories

# Apply custom pattern matching to identify categories
data['custom_categories'] = data['text'].apply(identify_category)

# Convert custom categories to dummy variables
custom_dummies = pd.get_dummies(data['custom_categories'].apply(pd.Series).stack()).sum(level=0)

# Reshape sentiment features to be 2-dimensional
sentiment_features = np.array(data['sentiment'].tolist())

# Ensure custom_dummies has the same number of samples as X_tfidf and align their indices
custom_dummies = custom_dummies.reindex(index=range(X_tfidf.shape[0]), columns=range(custom_dummies.shape[1]), fill_value=0)

# Combine TF-IDF features with sentiment features and custom pattern dummy variables
X = np.hstack((X_tfidf.toarray(), sentiment_features, custom_dummies.to_numpy()))
y = data['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Gradient Boosting classifier
gradient_boosting = GradientBoostingClassifier(random_state=42)
gradient_boosting.fit(X_train, y_train)

# Sample text to test
sample_text = """
A group of hackers launched a cyber attack on the government's website yesterday. The attack caused significant disruption to online services, leading to concerns about the security of sensitive information.
"""

# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text using TF-IDF
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([sample_sentiment])

# # Apply custom pattern matching to identify categories for the sample text
# sample_custom_categories = identify_category(sample_text_processed)

# # Ensure sample_custom_dummies has the same number of rows as sample_text_vectorized and sample_sentiment_features
# sample_custom_dummies = sample_custom_dummies.reindex(index=range(sample_text_vectorized.shape[0]), columns=custom_dummies.columns, fill_value=0)

# # Combine TF-IDF features with sentiment features and custom pattern dummy variables for the sample text
# sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features, sample_custom_dummies.to_numpy()))

# # Predict the label of the sample text
# predicted_label = gradient_boosting.predict(sample_features)[0]
# print("Predicted Label:", predicted_label)

# Identify categories for the sample text
sample_custom_categories = identify_category(sample_text_processed)

# Convert sample_custom_categories to dummy variables
sample_custom_dummies = pd.get_dummies(pd.Series(sample_custom_categories)).sum(level=0)

# Ensure sample_custom_dummies has the same number of columns as custom_dummies and fill with zeros if necessary
sample_custom_dummies = sample_custom_dummies.reindex(columns=custom_dummies.columns, fill_value=0)

# Combine TF-IDF features with sentiment features and custom pattern dummy variables for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features, sample_custom_dummies.to_numpy()))

# Predict the label of the sample text
predicted_label = gradient_boosting.predict(sample_features)[0]
print("Predicted Label:", predicted_label)



[nltk_data] Downloading package stopwords to C:\Users\HEEMA
[nltk_data]     SAMEERA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\HEEMA
[nltk_data]     SAMEERA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to C:\Users\HEEMA
[nltk_data]     SAMEERA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  custom_dummies = pd.get_dummies(data['custom_categories'].apply(pd.Series).stack()).sum(level=0)
  custom_dummies = pd.get_dummies(data['custom_categories'].apply(pd.Series).stack()).sum(level=0)


Predicted Label: CYBER CRIME


  sample_custom_dummies = pd.get_dummies(pd.Series(sample_custom_categories)).sum(level=0)


In [3]:
from sklearn.metrics import classification_report

# Predict the labels for the test set
y_pred = gradient_boosting.predict(X_test)

# Generate classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
                     precision    recall  f1-score   support

ANTI GOVERNMENT ACT       0.67      0.86      0.75         7
              ARSON       1.00      0.86      0.92         7
 COMMUNAL/RELIGIOUS       0.40      0.29      0.33         7
        CYBER CRIME       0.50      0.67      0.57         3
              JAILS       1.00      0.33      0.50         6
              MAFIA       0.23      0.75      0.35         4
             MURDER       1.00      0.75      0.86         4
               NDPS       0.75      0.60      0.67         5
               RAPE       1.00      1.00      1.00         2
          TERRORISM       1.00      0.62      0.77         8
      THEFT/ROBBERY       1.00      1.00      1.00         2

           accuracy                           0.65        55
          macro avg       0.78      0.70      0.70        55
       weighted avg       0.78      0.65      0.67        55



In [4]:
# Sample text to test
sample_text = "18 year old gang raped in hyd "

# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text using TF-IDF
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([sample_sentiment])

# Apply custom pattern matching to identify categories for the sample text
sample_custom_categories = identify_category(sample_text_processed)

# Ensure sample_custom_dummies has the same number of rows as sample_text_vectorized and sample_sentiment_features
sample_custom_dummies = sample_custom_dummies.reindex(index=range(sample_text_vectorized.shape[0]), columns=custom_dummies.columns, fill_value=0)

# Combine TF-IDF features with sentiment features and custom pattern dummy variables for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features, sample_custom_dummies.to_numpy()))

# Predict the label of the sample text
predicted_label = gradient_boosting.predict(sample_features)[0]
print("Predicted Label:", predicted_label)


Predicted Label: RAPE


In [5]:
# Sample text to test
sample_text = "cyber attack everywhere "

# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text using TF-IDF
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([sample_sentiment])

# Apply custom pattern matching to identify categories for the sample text
sample_custom_categories = identify_category(sample_text_processed)

# Ensure sample_custom_dummies has the same number of rows as sample_text_vectorized and sample_sentiment_features
sample_custom_dummies = sample_custom_dummies.reindex(index=range(sample_text_vectorized.shape[0]), columns=custom_dummies.columns, fill_value=0)

# Combine TF-IDF features with sentiment features and custom pattern dummy variables for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features, sample_custom_dummies.to_numpy()))

# Predict the label of the sample text
predicted_label = gradient_boosting.predict(sample_features)[0]
print("Predicted Label:", predicted_label)


Predicted Label: MAFIA


In [6]:
# Sample text to test
sample_text = "knife kills uncle"

# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text using TF-IDF
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([sample_sentiment])

# Apply custom pattern matching to identify categories for the sample text
sample_custom_categories = identify_category(sample_text_processed)

# Ensure sample_custom_dummies has the same number of rows as sample_text_vectorized and sample_sentiment_features
sample_custom_dummies = sample_custom_dummies.reindex(index=range(sample_text_vectorized.shape[0]), columns=custom_dummies.columns, fill_value=0)

# Combine TF-IDF features with sentiment features and custom pattern dummy variables for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features, sample_custom_dummies.to_numpy()))

# Predict the label of the sample text
predicted_label = gradient_boosting.predict(sample_features)[0]
print("Predicted Label:", predicted_label)


Predicted Label: MAFIA


In [7]:
# Sample text to test
sample_text = "drugs found in the hitech city tunnel along with 12 other weapons "
# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text using TF-IDF
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([sample_sentiment])

# Apply custom pattern matching to identify categories for the sample text
sample_custom_categories = identify_category(sample_text_processed)

# Ensure sample_custom_dummies has the same number of rows as sample_text_vectorized and sample_sentiment_features
sample_custom_dummies = sample_custom_dummies.reindex(index=range(sample_text_vectorized.shape[0]), columns=custom_dummies.columns, fill_value=0)

# Combine TF-IDF features with sentiment features and custom pattern dummy variables for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features, sample_custom_dummies.to_numpy()))

# Predict the label of the sample text
predicted_label = gradient_boosting.predict(sample_features)[0]
print("Predicted Label:", predicted_label)


Predicted Label: NDPS


In [8]:
# Sample text to test
sample_text = "terrorism breaks out in mumbai "
# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text using TF-IDF
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([sample_sentiment])

# Apply custom pattern matching to identify categories for the sample text
sample_custom_categories = identify_category(sample_text_processed)

# Ensure sample_custom_dummies has the same number of rows as sample_text_vectorized and sample_sentiment_features
sample_custom_dummies = sample_custom_dummies.reindex(index=range(sample_text_vectorized.shape[0]), columns=custom_dummies.columns, fill_value=0)

# Combine TF-IDF features with sentiment features and custom pattern dummy variables for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features, sample_custom_dummies.to_numpy()))

# Predict the label of the sample text
predicted_label = gradient_boosting.predict(sample_features)[0]
print("Predicted Label:", predicted_label)


Predicted Label: TERRORISM


In [9]:
# Sample text to test
sample_text = ''' An Israeli strike on a school sheltering displaced Palestinians in central Gaza killed at least 33 people on Thursday. Following the attack, the Israel Defense Forces said Hamas terrorists were operating from within the school. Meanwhile, Qatar Foreign Ministry has said that Hamas has not yet handed mediators its response to the latest ceasefire proposal and is still studying it. Amid the ongoing war, Israeli Prime Minister Benjamin Netanyahu is set to address a joint meeting of Congress on July 24, The Associated Press reported, citing two people familiar with the matter. The war began after Hamas attacked Israel on October 7, killing around 1,200 people and capturing more than 250 hostages.At least 33 people were killed after Israel hit a Gaza school on Thursday. Video footage showed Palestinians hauling away bodies and scores of injured in a hospital after the attack, which took place at a sensitive moment in mediated talks on a ceasefire that would involve releasing hostages held by Hamas and some of the Palestinians held in Israeli jails. Following the strike, the Israeli military said the central Gaza compound was being used by terrorists. Addressing a press conference, an IDF spokesperson said those targeted were members of Hamas's elite Nukhba force and of the Palestinian Islamic Jihad terror group who "directed terror attacks from the area of the school while exploiting it as a civilian location and as a shelter. The terrorists inside this school were planning more attacks against Israelis, some of them imminent." Meanwhile, the US State Department has said the country has been in contact with Israel about the strike at Gaza school. State Department spokesperson Matthew Miller also said that Washington expects Israel to be fully transparent in making information about the strike public. On Thursday, Qatari foreign ministry spokesperson Majed Al-Ansari said Hamas has not yet handed mediators its response to the latest ceasefire proposal and is still studying it, adding that Qatari, Egyptian and US mediators were still making efforts. Talks began on Wednesday when Central Intelligence Agency (CIA) director William Burns met senior officials from Qatar and Egypt in Doha to discuss a proposal that US President Joe Biden publicly endorsed last week. Russia and China, which hold veto powers in the United Nations Security Council, raised concerns on Thursday with a US draft resolution that would back a proposal - outlined by President Joe Biden - for a ceasefire between Israel and Hamas. The council's only Arab member, Algeria, also signalled it was not ready to back the text, diplomats said. A resolution needs at least nine votes in favour and no vetoes by the US, France, Britain, China or Russia to pass. Benjamin Netanyahu is set to address a joint meeting of Congress on July 24, The Associated Press quoted two people familiar with the matter as saying. Last week, Congressional leaders formally invited Netanyahu to come speak. However, the date of the speech had been in flux. According to news agency Reuters, Netanyahu, over his upcoming speech, said he was "very moved to have the privilege of representing Israel before both Houses of Congress and to present the truth about our just war." On Thursday, Israeli forces killed three Palestinians and injured at least 13 others in a raid on the occupied West Bank city of Jenin, Reuters quoted the Palestinian Health Ministry and medics as saying. The Palestine Red Crescent Society said it was treating at least six people who were shot, four who sustained shrapnel wounds and one person who was run over by a military jeep. It said its teams were fired at while recovering some of the dead.'''
# Preprocess the sample text
sample_text_processed = preprocess(sample_text)

# Vectorize the preprocessed sample text using TF-IDF
sample_text_vectorized = vectorizer.transform([sample_text_processed])

# Perform sentiment analysis on the sample text
sample_sentiment = sentiment_analysis(sample_text_processed)
sample_sentiment_features = np.array([sample_sentiment])

# Apply custom pattern matching to identify categories for the sample text
sample_custom_categories = identify_category(sample_text_processed)

# Ensure sample_custom_dummies has the same number of rows as sample_text_vectorized and sample_sentiment_features
sample_custom_dummies = sample_custom_dummies.reindex(index=range(sample_text_vectorized.shape[0]), columns=custom_dummies.columns, fill_value=0)

# Combine TF-IDF features with sentiment features and custom pattern dummy variables for the sample text
sample_features = np.hstack((sample_text_vectorized.toarray(), sample_sentiment_features, sample_custom_dummies.to_numpy()))

# Predict the label of the sample text
predicted_label = gradient_boosting.predict(sample_features)[0]
print("Predicted Label:", predicted_label)

Predicted Label: TERRORISM


In [10]:
from sklearn.metrics import accuracy_score

# Predict labels for the test set
y_pred = gradient_boosting.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)


Accuracy: 0.6545454545454545
