In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pandas as pd

# End-to-end genre prediction: load the " ::: "-delimited train/test files,
# fit a TF-IDF + Multinomial Naive Bayes model on the training descriptions,
# and write one "<ID> ::: <Title> ::: <predicted genre>" line per test row.

# Tunable settings, collected here so they are easy to find and change.
TRAIN_PATH = "train_data.txt"
TEST_PATH = "test_data.txt"
PREDICTIONS_PATH = "predictions.txt"
FIELD_SEP = " ::: "  # multi-character separator, hence engine="python" below
MAX_FEATURES = 1000  # cap on the TF-IDF vocabulary size

# Load the training data (no header row in the file; columns are named here)
train_data = pd.read_csv(TRAIN_PATH, sep=FIELD_SEP, engine="python", header=None)
train_data.columns = ["ID", "Title", "Genre", "Description"]

# Load the test data (same layout as the training file, minus the Genre column)
test_data = pd.read_csv(TEST_PATH, sep=FIELD_SEP, engine="python", header=None)
test_data.columns = ["ID", "Title", "Description"]

# Split data into features and labels.
# TfidfVectorizer rejects NaN documents, so a missing description is replaced
# with an empty string instead of aborting the whole run.
X_train = train_data["Description"].fillna("")
y_train = train_data["Genre"]
X_test = test_data["Description"].fillna("")

# Vectorize the text data: the vocabulary is learned from the training set only
# and then reused, unchanged, to transform the test set.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

# Predict genres for the test data
test_predictions = model.predict(X_test_vectorized)

# Create output file.
# - The ID written is the one read from the test file, not a regenerated
#   1..n counter, so rows stay aligned with the input even if its IDs are
#   not contiguous or do not start at 1.
# - The encoding is explicit so non-ASCII titles are written the same way on
#   every platform instead of depending on the locale default.
with open(PREDICTIONS_PATH, "w", encoding="utf-8") as f:
    for movie_id, title, pred_genre in zip(
        test_data["ID"], test_data["Title"], test_predictions
    ):
        f.write(f"{movie_id} ::: {title} ::: {pred_genre}\n")

print("Predictions saved to predictions.txt")


Predictions saved to predictions.txt
