# Movie Genre Classification 

### Import necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

### Load the dataset

In [2]:
# Load the training data
train_data = pd.read_csv('train_data.txt', delimiter=':::', engine='python', names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])

# Load the test data
test_data = pd.read_csv('test_data.txt', delimiter=':::', engine='python', names=['ID', 'TITLE', 'DESCRIPTION'])

### Preprocess the data

In [3]:
# Extract features and labels
X = train_data['DESCRIPTION']
y = train_data['GENRE']

### Split the data into training and validation sets

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### Vectorize the text data using TF-IDF

In [5]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the training data
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the validation data
X_val_tfidf = tfidf.transform(X_val)

### Train the model using Logistic Regression

In [6]:
# Initialize Logistic Regression model
model = LogisticRegression(max_iter=2000)

# Train the model
model.fit(X_train_tfidf, y_train)

### Save the model and vectorizer for future use

In [7]:
# Save the trained model
joblib.dump(model, 'movie_genre_predictor.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

### Load the model and vectorizer

In [12]:
# Load the trained model
loaded_model = joblib.load('movie_genre_predictor.pkl')

# Load the TF-IDF vectorizer
loaded_tfidf = joblib.load('tfidf_vectorizer.pkl')

### Evaluate the model

In [13]:
# Transform the validation data using the loaded TF-IDF vectorizer
X_val_tfidf = loaded_tfidf.transform(X_val)

# Predict on validation data using the loaded model
y_pred = loaded_model.predict(X_val_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:")
print(classification_report(y_val, y_pred))

Accuracy: 0.5795444065295582
Classification Report:
               precision    recall  f1-score   support

      action        0.53      0.27      0.35       263
       adult        0.71      0.21      0.33       112
   adventure        0.41      0.14      0.21       139
   animation        0.65      0.11      0.18       104
   biography        0.00      0.00      0.00        61
      comedy        0.51      0.58      0.55      1443
       crime        0.43      0.03      0.05       107
 documentary        0.66      0.85      0.74      2659
       drama        0.55      0.78      0.64      2697
      family        0.41      0.07      0.12       150
     fantasy        0.00      0.00      0.00        74
   game-show        0.94      0.42      0.59        40
     history        0.00      0.00      0.00        45
      horror        0.63      0.56      0.59       431
       music        0.63      0.47      0.54       144
     musical        1.00      0.02      0.04        50
     mystery

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Predict genres for the test data

In [15]:
# Transform the test data using the loaded TF-IDF vectorizer
X_test_tfidf = loaded_tfidf.transform(test_data['DESCRIPTION'])

# Predict the genres for test data using the loaded model
test_data['PREDICTED_GENRE'] = loaded_model.predict(X_test_tfidf)

# Save the classified descriptions to a new .txt file with a single character delimiter
test_data.to_csv('classified_test_data_temp.txt', sep='|', index=False, header=False)

# Optional: Replace the single character delimiter with ':::'
with open('classified_test_data_temp.txt', 'r') as file:
    data = file.read().replace('|', ':::')

with open('classified_test_data.txt', 'w') as file:
    file.write(data)

# Clean up the temporary file
import os
os.remove('classified_test_data_temp.txt')