In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')


[nltk_data] Downloading package stopwords to C:\Users\HEEMA
[nltk_data]     SAMEERA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\HEEMA
[nltk_data]     SAMEERA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to C:\Users\HEEMA
[nltk_data]     SAMEERA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [14]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load data
data = pd.read_excel(r"C:\Users\HEEMA SAMEERA\OneDrive\Desktop\articlescateg.xlsx")

# Preprocessing function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    tokens = text.split()  # Tokenization
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]  # Remove stop words and lemmatize
    return ' '.join(tokens)

data['text'] = data['text'].apply(preprocess)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_features = vectorizer.fit_transform(data['text'])

# Sentiment analysis using NLTK VADER
analyzer = SentimentIntensityAnalyzer()

def sentiment_analysis(text):
    sentiment_scores = analyzer.polarity_scores(text)
    return sentiment_scores

# Apply sentiment analysis
data['sentiment'] = data['text'].apply(sentiment_analysis)

# Convert sentiment scores to separate columns
data = data.join(pd.json_normalize(data['sentiment']))

# Drop the original 'sentiment' column
data = data.drop(columns=['sentiment'])

# Combine TF-IDF features with sentiment features
sentiment_features = data[['neg', 'neu', 'pos', 'compound']].values
X = np.hstack((tfidf_features.toarray(), sentiment_features))

y = data['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'k-NN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),  # Using Gaussian Naive Bayes
    'Gradient Boosting': GradientBoostingClassifier(),
}

# Define parameter grids for each model
param_grids = {
    'Logistic Regression': {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']},
    'Decision Tree': {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]},
    'Random Forest': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'k-NN': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    'Naive Bayes': {},  # Naive Bayes often doesn't require hyperparameter tuning
    'Gradient Boosting': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.5]},
}

# Set up cross-validation
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid Search for each model
best_models = {}
best_scores = {}
for name, model in models.items():
    grid_search = GridSearchCV(model, param_grids[name], cv=stratified_kfold, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    best_scores[name] = grid_search.best_score_
    print(f"{name}: Best Params - {grid_search.best_params_}, Best Score - {grid_search.best_score_:.4f}")

print("FINAL BEST MODEL DETAILS\n")
    
# Select the best model based on cross-validation score
best_model_name = max(best_scores, key=best_scores.get)
best_model = best_models[best_model_name]

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Best model:{best_model_name} Test Accuracy - {accuracy:.4f}")

# Sample text to test
sample_text = "Religion is a basic good for all human beings everywhere, therefore religious freedom is a universal human right. It is neither unfair nor parochial, but a requirement of justice."
# Preprocess the text
preprocessed_text = preprocess(sample_text)

# Vectorize the preprocessed text using the same vectorizer used for training
text_vector = vectorizer.transform([preprocessed_text])

# Make sure the number of features matches the training data
if text_vector.shape[1] != X_train.shape[1]:
    missing_features = X_train.shape[1] - text_vector.shape[1]
    text_vector = np.pad(text_vector.toarray(), ((0, 0), (0, missing_features)), mode='constant')

# Predict the label of the sample text using the best model
predicted_label = best_model.predict(text_vector)[0]
print("Predicted Label:", predicted_label)


Logistic Regression: Best Params - {'C': 10, 'solver': 'liblinear'}, Best Score - 0.7849
Decision Tree: Best Params - {'max_depth': None, 'min_samples_split': 5}, Best Score - 0.7316
Random Forest: Best Params - {'max_depth': None, 'n_estimators': 200}, Best Score - 0.8600
SVM: Best Params - {'C': 10, 'kernel': 'linear'}, Best Score - 0.7560
k-NN: Best Params - {'n_neighbors': 5, 'weights': 'distance'}, Best Score - 0.7091
Naive Bayes: Best Params - {}, Best Score - 0.6457
Gradient Boosting: Best Params - {'learning_rate': 0.5, 'n_estimators': 200}, Best Score - 0.8832
FINAL BEST MODEL DETAILS

Best model:Gradient Boosting Test Accuracy - 0.8864
Predicted Label: COMMUNAL/RELIGIOUS


In [13]:
sample_text = "cyber crimes all around"
# Preprocess the text
preprocessed_text = preprocess(sample_text)

# Vectorize the preprocessed text using the same vectorizer used for training
text_vector = vectorizer.transform([preprocessed_text])

# Make sure the number of features matches the training data
if text_vector.shape[1] != X_train.shape[1]:
    # Pad the text_vector with zeros for missing features
    missing_features = X_train.shape[1] - text_vector.shape[1]
    text_vector = np.pad(text_vector.toarray(), ((0, 0), (0, missing_features)), mode='constant')

# Predict the label of the sample text using the best model
predicted_label = best_model.predict(text_vector)[0]
print("Predicted Label:", predicted_label)

Predicted Label: CYBER CRIME
