In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the datasets.
# Each file holds one record per line with fields separated by ':::'.
def _load_genre_file(path, columns):
    """Read a ':::'-delimited text file into a DataFrame.

    Args:
        path: Path of the text file to read.
        columns: Column names in file order; the first one is the record id.

    Returns:
        A DataFrame with one row per record and whitespace-stripped text
        fields.
    """
    # Separators longer than one character are only supported by the python
    # engine; naming it explicitly avoids the ParserWarning pandas emits when
    # it has to fall back from the C engine.
    frame = pd.read_csv(path, sep=':::', names=columns, engine='python')
    # The fields appear to be padded with spaces around the separator, which
    # would otherwise end up inside the genre labels (e.g. ' drama ').
    for column in columns[1:]:
        frame[column] = frame[column].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    return frame


train_df = _load_genre_file('train_data.txt', ['index', 'movie_name', 'genre', 'plot'])
test_df = _load_genre_file('test_data.txt', ['index', 'movie_name', 'plot'])
test_solution_df = _load_genre_file('test_data_solution.txt', ['index', 'movie_name', 'genre', 'plot'])

# Train on the training file only. The test solutions stay untouched until the
# final evaluation: fitting on them would mean predicting plots whose labels
# the model has already seen.
X = train_df['plot']
y = train_df['genre']

# Split the raw text first so the vectorizer never sees validation documents.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature extraction using TF-IDF: vocabulary and IDF weights are learned from
# the training split only and then applied unchanged to every other split.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = tfidf.fit_transform(X_train_text)
X_val = tfidf.transform(X_val_text)

# Train the models
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # NOTE(review): kernel SVC training time grows quickly with the number of
    # samples; consider LinearSVC if this step is too slow - confirm first
    # that the change in results is acceptable.
    'SVM': SVC(kernel='linear'),
}

val_accuracy = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    val_accuracy[name] = accuracy_score(y_val, y_pred)
    print(f"{name}:\n")
    print("Accuracy:", val_accuracy[name])
    # zero_division=0 keeps the 0.00 scores for genres that are never
    # predicted but silences the UndefinedMetricWarning for each of them.
    print("Classification Report:\n", classification_report(y_val, y_pred, zero_division=0))
    print("\n")

# Choose the best model based on validation performance
best_name = max(val_accuracy, key=val_accuracy.get)
best_model = models[best_name]
print(f"Best model on validation data: {best_name}")

# Make predictions on the test set
X_test = tfidf.transform(test_df['plot'])
y_test_pred = best_model.predict(X_test)

# Add predictions to the test dataframe
test_df['predicted_genre'] = y_test_pred

# Display the predictions
print(test_df[['movie_name', 'predicted_genre']])

# Final, unbiased estimate on the held-out test solutions. The solution file's
# own plots are used so no row alignment with test_df has to be assumed.
X_solution = tfidf.transform(test_solution_df['plot'])
y_solution_pred = best_model.predict(X_solution)
print("Test accuracy:", accuracy_score(test_solution_df['genre'], y_solution_pred))


  train_df = pd.read_csv('train_data.txt', delimiter=':::', names=['index', 'movie_name', 'genre', 'plot'])
  test_df = pd.read_csv('test_data.txt', delimiter=':::', names=['index', 'movie_name', 'plot'])
  test_solution_df = pd.read_csv('test_data_solution.txt', delimiter=':::', names=['index', 'movie_name', 'genre', 'plot'])


Naive Bayes:

Accuracy: 0.537241156666513


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
                precision    recall  f1-score   support

      action        0.61      0.17      0.26       526
       adult        0.70      0.10      0.18       228
   adventure        0.61      0.08      0.14       304
   animation        0.00      0.00      0.00       211
   biography        0.00      0.00      0.00       103
      comedy        0.51      0.43      0.47      2988
       crime        0.00      0.00      0.00       223
 documentary        0.58      0.86      0.69      5185
       drama        0.48      0.82      0.60      5550
      family        1.00      0.00      0.01       288
     fantasy        0.00      0.00      0.00       147
   game-show        0.93      0.37      0.53        73
     history        0.00      0.00      0.00       116
      horror        0.73      0.44      0.55       883
       music        0.68      0.27      0.39       300
     musical        0.00      0.00      0.00       114
     mystery        0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
                precision    recall  f1-score   support

      action        0.49      0.34      0.40       526
       adult        0.60      0.27      0.37       228
   adventure        0.58      0.20      0.30       304
   animation        0.57      0.11      0.19       211
   biography        0.00      0.00      0.00       103
      comedy        0.55      0.60      0.57      2988
       crime        0.34      0.05      0.09       223
 documentary        0.69      0.83      0.75      5185
       drama        0.57      0.77      0.65      5550
      family        0.45      0.14      0.22       288
     fantasy        0.60      0.08      0.14       147
   game-show        0.73      0.45      0.56        73
     history        0.00      0.00      0.00       116
      horror        0.67      0.63      0.65       883
       music        0.63      0.48      0.55       300
     musical        0.50      0.04      0.08       114
     mystery        0.29      0.04      