In [None]:
pip install pandas scikit-learn



In [None]:
from google.colab import drive
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pickle
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

# Download NLTK resources
# Fetches the 'stopwords' corpus that stopwords.words('english') reads later on.
nltk.download('stopwords')

# Mount Google Drive
# Makes the Drive contents reachable under /content/drive in the Colab runtime.
drive.mount('/content/drive')
# IPython magic (not plain Python): change the working directory so that the
# relative file names used for the CSV and the pickle files resolve inside
# the task folder.
%cd /content/drive/My Drive/Semeval2025/task_9

# Load the Dataset
# Fail fast when the CSV is missing. Merely printing a message and carrying on
# leaves `df` undefined, so the run would die a few lines later with an
# unrelated NameError that hides the real cause.
try:
    df = pd.read_csv('incidents_train (1)1.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "File not found. Please check the path: 'incidents_train (1)1.csv' "
        "is resolved relative to the current working directory."
    ) from err

# Feature Engineering: Text Preprocessing
def preprocess_text(text):
    """Normalise one raw text cell before feature extraction.

    Every character that is neither an ASCII letter nor whitespace is deleted
    (digits and punctuation included), the result is lower-cased, and leading
    and trailing whitespace is stripped. Interior whitespace is left as is.

    Args:
        text: The cell value. Normally a ``str``; any other value (``None``,
            the float ``NaN`` pandas uses for empty CSV cells, numbers) is
            treated as missing text.

    Returns:
        The cleaned string, or ``""`` when the value is not a string.
    """
    # Empty CSV cells arrive as NaN floats, on which re.sub raises TypeError.
    # Map them to an empty document instead of aborting the whole run.
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    return text.lower().strip()

# Clean both free-text columns in place with the same normaliser.
for column_name in ('text', 'title'):
    df[column_name] = df[column_name].apply(preprocess_text)

# Build the single model input: body text first, then the title, separated
# by one space.
df['input_text'] = df['text'] + " " + df['title']

# Shared resources for stemming and stop-word filtering: a Porter stemmer and
# the English stop-word list, held as a set for fast membership tests.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def stem_and_remove_stopwords(text):
    """Drop stop words from *text* and stem the remaining tokens.

    The text is split on whitespace. Tokens found in the module-level
    ``stop_words`` set are discarded, every surviving token is passed through
    the module-level ``stemmer``, and the stems are returned joined by single
    spaces.
    """
    kept_stems = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return ' '.join(kept_stems)

# Replace the combined text with its stemmed, stop-word-free form.
df['input_text'] = df['input_text'].apply(stem_and_remove_stopwords)

# Two-stage features: raw term counts (bag of words) followed by TF-IDF
# re-weighting. The two stages stay separate objects; each is fitted on the
# full training frame.
vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_bow = vectorizer.fit_transform(df['input_text'])
X_tfidf = tfidf_transformer.fit_transform(X_bow)

# Category-level targets, used by the classifiers trained in this script.
y_hazard, y_product = df['hazard-category'], df['product-category']

# Targets taken from the 'hazard' and 'product' columns; not consumed by the
# code visible here.
y2_hazard, y2_product = df['hazard'], df['product']

# Hyperparameter tuning with GridSearchCV. Only the SVM classifier is tuned
# here, once per target.
# NOTE(review): X_tfidf was fitted on the whole dataset before these
# cross-validated searches, so every validation fold has already influenced
# the vocabulary and IDF weights. Wrapping the feature extraction and the
# classifier in one Pipeline would avoid that; confirm whether it matters.
svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf']  # You can also try 'poly' or 'sigmoid'
}


def _fit_svm_grid_search(features, labels):
    """Fit and return a 5-fold, weighted-F1 grid search of SVC over ``svm_param_grid``."""
    search = GridSearchCV(SVC(), svm_param_grid, scoring='f1_weighted', cv=5)
    search.fit(features, labels)
    return search


# One independent search per target, both on the same TF-IDF matrix.
grid_svm_hazard = _fit_svm_grid_search(X_tfidf, y_hazard)
grid_svm_product = _fit_svm_grid_search(X_tfidf, y_product)

# Keep the best estimator found by each search.
best_svm_hazard = grid_svm_hazard.best_estimator_
best_svm_product = grid_svm_product.best_estimator_

# Persist the tuned classifiers together with the fitted feature extractors,
# one pickle file per object, written to the current working directory.
artifacts = (
    ('SVM_hazard_category_model.pkl', best_svm_hazard),
    ('SVM_product_category_model.pkl', best_svm_product),
    ('vectorizer.pkl', vectorizer),
    ('tfidf_transformer.pkl', tfidf_transformer),
)
for file_name, artifact in artifacts:
    with open(file_name, 'wb') as f:
        pickle.dump(artifact, f)

print("Trained models and vectorizer saved successfully.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive
/content/drive/My Drive/Semeval2025/task_9




Trained models and vectorizer saved successfully.
