# 1. Import the necessary libraries

In [1]:
#import libraries
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/classification-of-math-problems-by-kasut-academy/sample_submission.csv
/kaggle/input/classification-of-math-problems-by-kasut-academy/train.csv
/kaggle/input/classification-of-math-problems-by-kasut-academy/test.csv


# 2. Download data

In [2]:
train_df = pd.read_csv('/kaggle/input/classification-of-math-problems-by-kasut-academy/train.csv')
test_df = pd.read_csv('/kaggle/input/classification-of-math-problems-by-kasut-academy/test.csv')

# 3. Transform text data into numerical features using TF-IDF

In [3]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(train_df['Question'])
y = train_df['label']


# 4. Data partitioning for training and evaluation

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Training the logistic regression model

In [5]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 6. Model evaluation

In [6]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.82      0.76       522
           1       0.89      0.91      0.90       500
           2       0.76      0.60      0.67       221
           3       1.00      0.43      0.60        63
           4       0.74      0.75      0.75       331
           5       0.70      0.73      0.71       362
           6       1.00      0.05      0.10        20
           7       0.86      0.32      0.46        19

    accuracy                           0.76      2038
   macro avg       0.83      0.57      0.62      2038
weighted avg       0.77      0.76      0.76      2038



# 7. Applying the model to test data

In [7]:
X_test = vectorizer.transform(test_df['Question'])
test_predictions = model.predict(X_test)

# Create a submission file

In [8]:
submission = pd.DataFrame({
    'id': test_df['id'],
    'label': test_predictions
})
submission.to_csv('submission1.csv', index=False)