In [8]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Load the train and test data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Merge `test_data` with `train_data` if needed to match columns
test_data_merged = test_data.merge(train_data.drop(columns='Score'), on='Id', how='left')

# Preprocess text by filling NaN values with empty strings and combining Summary and Text
train_data['Summary'] = train_data['Summary'].fillna("")
train_data['Text'] = train_data['Text'].fillna("")
train_data['Combined_Text'] = train_data['Summary'] + " " + train_data['Text']

test_data_merged['Summary'] = test_data_merged['Summary'].fillna("")
test_data_merged['Text'] = test_data_merged['Text'].fillna("")
test_data_merged['Combined_Text'] = test_data_merged['Summary'] + " " + test_data_merged['Text']
train_data = train_data.dropna(subset=['Score'])

# Define features and target
X = train_data['Combined_Text']
y = train_data['Score'].astype(int)

# Split train data for evaluation
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model pipeline with TF-IDF and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Parameter grid for GridSearchCV
param_grid = {
    'tfidf__max_features': [500, 1000, 1500],
    'classifier__C': [0.1, 1, 10]
}

# Run GridSearchCV to tune parameters
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, Y_train)

# Best model evaluation on test set
best_model = grid_search.best_estimator_
Y_test_predictions = best_model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_test_predictions)
classification_report_text = classification_report(Y_test, Y_test_predictions)

# Print model evaluation
print("Best Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report_text)



Best Model Accuracy: 0.6327856491252873
Classification Report:
               precision    recall  f1-score   support

           1       0.57      0.53      0.55     18074
           2       0.40      0.22      0.29     17604
           3       0.44      0.31      0.36     35179
           4       0.47      0.31      0.37     67127
           5       0.71      0.90      0.79    159085

    accuracy                           0.63    297069
   macro avg       0.52      0.45      0.47    297069
weighted avg       0.60      0.63      0.60    297069



In [9]:
# Apply the best model to test data for final prediction
test_bow_features = test_data_merged['Combined_Text']
test_predictions = best_model.predict(test_bow_features)

# Create a submission file
submission = pd.DataFrame({'Id': test_data_merged['Id'], 'Score': test_predictions})
submission.to_csv('submission.csv', index=False)

In [10]:
print(best_model)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1500, stop_words='english')),
                ('classifier',
                 LogisticRegression(C=10, max_iter=1000, random_state=42))])
