In [1]:
pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [6]:
import os

import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Single seed shared by the train/test split and the random forest so that the
# grid-search result and the reported metrics are reproducible across runs.
SEED = 42

# Location of the labelled articles workbook. Set the ARTICLES_XLSX environment
# variable to run the notebook on another machine; the default is the original
# author's local path.
DATA_PATH = os.environ.get(
    "ARTICLES_XLSX",
    r"C:\Users\HEEMA SAMEERA\OneDrive\Desktop\articlescateg.xlsx",
)

# ---------------------------------------------------------------------------
# NLTK resources
# ---------------------------------------------------------------------------
nltk.download('vader_lexicon')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of
# the pickled 'punkt' package; fetch both so word_tokenize works either way.
nltk.download('punkt_tab')
nltk.download('wordnet')

# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
# The workbook must provide a 'text' column (article body) and a 'label'
# column (category). Rows missing either value cannot be tokenized or used as
# a training target, so they are dropped up front.
data = pd.read_excel(DATA_PATH).dropna(subset=['text', 'label'])

# Split data into training and testing sets (80/20).
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=SEED
)

# ---------------------------------------------------------------------------
# Feature preparation
# ---------------------------------------------------------------------------
# Sentiment analysis using VADER (compound score per document).
# NOTE(review): these scores are computed but are not fed into the pipeline
# below; they are kept in case other cells use them.
sid = SentimentIntensityAnalyzer()
X_train_sentiment = X_train.apply(lambda x: sid.polarity_scores(x)['compound'])
X_test_sentiment = X_test.apply(lambda x: sid.polarity_scores(x)['compound'])

# Lemmatization
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Tokenize ``text``, lemmatize every token and re-join them with spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))


X_train_lemmatized = X_train.apply(lemmatize_text)
X_test_lemmatized = X_test.apply(lemmatize_text)

# ---------------------------------------------------------------------------
# Model: TF-IDF features -> random forest, tuned with grid search
# ---------------------------------------------------------------------------
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Fixed random_state: without it every fit builds a different forest, so
    # the best parameters and scores change from run to run.
    ('clf', RandomForestClassifier(random_state=SEED))
])

# Define hyperparameters grid for GridSearchCV
param_grid = {
    'tfidf__max_features': [1000, 2000, 3000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20]
}

# Perform grid search with 5-fold cross-validation on the training split only.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train_lemmatized, y_train)

# Print best parameters and best (mean cross-validated) score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# ---------------------------------------------------------------------------
# Evaluation on the held-out test set
# ---------------------------------------------------------------------------
y_pred = grid_search.predict(X_test_lemmatized)
print(classification_report(y_test, y_pred))

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# ---------------------------------------------------------------------------
# Spot check on a hand-written sample
# ---------------------------------------------------------------------------
sample_text = "Religion is a basic good for all human beings everywhere, therefore religious freedom is a universal human right. It is neither unfair nor parochial, but a requirement of justice. "

# Apply the same preprocessing that the training text went through.
sample_text_lemmatized = lemmatize_text(sample_text)

# predict() expects an iterable of documents, hence the one-element list.
predicted_label = grid_search.predict([sample_text_lemmatized])[0]

print("Predicted Label:", predicted_label)


[nltk_data] Downloading package vader_lexicon to C:\Users\HEEMA
[nltk_data]     SAMEERA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\HEEMA
[nltk_data]     SAMEERA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\HEEMA
[nltk_data]     SAMEERA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'clf__max_depth': None, 'clf__n_estimators': 100, 'tfidf__max_features': 2000, 'tfidf__ngram_range': (1, 1)}
Best Score: 0.9127731092436975
                     precision    recall  f1-score   support

ANTI GOVERNMENT ACT       0.75      1.00      0.86         3
 COMMUNAL/RELIGIOUS       1.00      1.00      1.00         5
        CYBER CRIME       1.00      1.00      1.00         6
         FOREIGNERS       1.00      1.00      1.00         4
     FUNDAMENTALISM       0.75      1.00      0.86         3
              JAILS       1.00      0.33      0.50         3
              MAFIA       1.00      1.00      1.00         4
             MURDER       1.00      1.00      1.00         6
               NDPS       1.00      1.00      1.00         3
         SEPARATISM       1.00      0.80      0.89         5
          TERRORISM       0.67      1.00      0.80         2

           accuracy                           

In [14]:
# Ad-hoc spot check: classify a short phrase with the tuned pipeline.
sample_text = "cyber attack "

# Same preprocessing as the training text: tokenize, lemmatize each token,
# then re-join the tokens with single spaces.
sample_text_lemmatized = ' '.join(
    map(lemmatizer.lemmatize, word_tokenize(sample_text))
)

# predict() takes an iterable of documents, so wrap the single sample in a
# list and unpack the one resulting label.
predicted_label = grid_search.predict([sample_text_lemmatized])[0]

print("Predicted Label:", predicted_label)

Predicted Label: CYBER CRIME
