In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/description.txt
/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data_solution.txt
/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt
/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt


# Import necessary libraries

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer #For TF-IDF
from sklearn.naive_bayes import MultinomialNB # A good choice for text classification as Niave bayes
from sklearn.linear_model import LogisticRegression # for LogisticRegression as classification
from sklearn.svm import LinearSVC # Support Vector Machine for text classification
from sklearn.metrics import accuracy_score, classification_report
import re # For regular expressions, useful for text cleaning
import nltk # Natural Language Toolkit
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [6]:
nltk.download('popular', quiet=True)
# quiet=True suppresses the downloader's progress messages ("Downloading...", "Unzipping...", etc.).
# The True shown as this cell's output is the value returned by the call.

True

# 1. Data Loading 

In [7]:
# Load the datasets with pandas, using ':::' as the column delimiter.
# We read the files with 'latin-1' encoding because the default 'utf-8' might cause decoding errors.
# `sep=':::'` defines the column separator, and `engine='python'` is needed because multi-character separators are only supported by the Python parsing engine.

In [8]:
# Directory that holds the three ':::'-delimited dataset files.
DATA_DIR = '/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset'


def _strip_if_str(value):
    """Return `value` without surrounding whitespace if it is a string, else unchanged."""
    return value.strip() if isinstance(value, str) else value


def load_genre_file(filename, columns):
    """Read one ':::'-separated dataset file into a DataFrame.

    Parameters
    ----------
    filename : str
        File name inside DATA_DIR (e.g. 'train_data.txt').
    columns : list of str
        Column names to assign; the files have no header row.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with surrounding whitespace removed from every string value.

    Raises
    ------
    FileNotFoundError
        If the file does not exist under DATA_DIR.
    """
    frame = pd.read_csv(
        f'{DATA_DIR}/{filename}',
        sep=':::',
        header=None,
        names=columns,
        engine='python',
        encoding='latin-1',
    )
    # Fields are written as "a ::: b ::: c", so every value keeps the padding
    # around the delimiter (labels come out as ' drama '). Strip it once here so
    # labels compare and print cleanly everywhere downstream.
    for column in columns:
        frame[column] = frame[column].map(_strip_if_str)
    return frame


print("Loading datasets...")
try:
    train_df = load_genre_file('train_data.txt', ['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])
    test_df = load_genre_file('test_data.txt', ['ID', 'TITLE', 'DESCRIPTION'])
    test_solution_df = load_genre_file('test_data_solution.txt', ['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])
    print("Datasets loaded successfully.")
    print(f"Train data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")
    print(f"Test solution data shape: {test_solution_df.shape}")
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Make sure the file paths are correct.")
    print("Please verify the dataset paths in your Kaggle environment.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, whereas an exception stops "Run All" at this cell with a traceback.
    raise

Loading datasets...
Datasets loaded successfully.
Train data shape: (54214, 4)
Test data shape: (54200, 3)
Test solution data shape: (54200, 4)


# 2. Data Preprocessing Function 

In [9]:
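# The preprocess_text function cleans the text data (movie descriptions).
# It lowercases the text, removes every character that is not a letter or whitespace,
# splits the text into words, drops English stop words, and stems the remaining words.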

In [10]:
# Shared NLTK resources used by preprocess_text.
stemmer = PorterStemmer()  # Porter stemming algorithm instance
stop_words = {*stopwords.words('english')}  # set of English stop words for fast membership tests

def preprocess_text(text):
    """Clean one raw description into a space-joined string of stemmed tokens.

    Steps: lowercase, delete every character that is not a letter or
    whitespace, split on whitespace, drop words found in `stop_words`, and
    stem what remains with `stemmer`.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (e.g. the float NaN pandas uses
        for a missing field, or None) is treated as an empty description.

    Returns
    -------
    str
        The processed tokens joined by single spaces; '' when nothing remains.
    """
    # Missing fields arrive as float NaN / None, which have no .lower();
    # return an empty document instead of raising AttributeError mid-apply.
    if not isinstance(text, str):
        return ''
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenise the text (split into words) and remove stop words, then stem
    tokens = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    # Join the processed tokens back into a single string
    return ' '.join(tokens)

print("\nStarting text preprocessing...")
# Add a CLEANED_DESCRIPTION column to each of the three frames by running
# preprocess_text over its DESCRIPTION column.
for frame in (train_df, test_df, test_solution_df):
    frame['CLEANED_DESCRIPTION'] = frame['DESCRIPTION'].apply(preprocess_text)
print("Text preprocessing complete.")

# Show the raw and cleaned text side by side for the first training rows.
print("\nExample of preprocessed text (train_df):")
preview = train_df[['DESCRIPTION', 'CLEANED_DESCRIPTION']].head()
print(preview)


Starting text preprocessing...
Text preprocessing complete.

Example of preprocessed text (train_df):
                                         DESCRIPTION  \
0   Listening in to a conversation between his do...   
1   A brother and sister with a past incestuous r...   
2   As the bus empties the students for their fie...   
3   To help their unemployed father make ends mee...   
4   The film's title refers not only to the un-re...   

                                 CLEANED_DESCRIPTION  
0  listen convers doctor parent yearold oscar lea...  
1  brother sister past incestu relationship curre...  
2  bu empti student field trip museum natur histo...  
3  help unemploy father make end meet edith twin ...  
4  film titl refer unrecov bodi ground zero also ...  


# 3. Text Vectorization (TF-IDF)

In [11]:
# Convert the cleaned text descriptions into numerical features using TF-IDF.
# TF-IDF creates a matrix where each row is a movie and each column is a word.
# The values represent the importance of that word in that movie's description.

In [12]:
print("\nStarting TF-IDF vectorization...")

# Test features are built from test_df but the true labels are read from
# test_solution_df, so every metric computed later is only meaningful if the
# two frames list the same movies in the same order. Fail loudly otherwise.
if not test_df['ID'].equals(test_solution_df['ID']):
    raise ValueError(
        "test_df and test_solution_df are not row-aligned on ID; "
        "predictions cannot be compared with the solution labels."
    )

# Upper bound on the vocabulary size kept by the vectorizer.
MAX_TFIDF_FEATURES = 5000

# Initialize TfidfVectorizer. Limiting the number of features (words) keeps
# the matrices small and focuses on the most frequent terms.
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

# Fit the vectorizer on the training data and transform it.
# 'fit_transform' learns the vocabulary from training data and then converts it.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['CLEANED_DESCRIPTION'])

# Transform the test data using the *same* vectorizer fitted on training data.
# Only 'transform' is used so the test data cannot influence the vocabulary.
X_test_tfidf = tfidf_vectorizer.transform(test_df['CLEANED_DESCRIPTION'])

# Target variable (genres) for training
y_train = train_df['GENRE']
# True genres of the test rows, used for evaluation
y_test_true = test_solution_df['GENRE']

print("TF-IDF vectorization complete.")
print(f"Shape of TF-IDF matrix for training data: {X_train_tfidf.shape}")
print(f"Shape of TF-IDF matrix for test data: {X_test_tfidf.shape}")


Starting TF-IDF vectorization...
TF-IDF vectorization complete.
Shape of TF-IDF matrix for training data: (54214, 5000)
Shape of TF-IDF matrix for test data: (54200, 5000)


# 4. Model Training

In [13]:
# Baseline classifier: multiclass Logistic Regression on the TF-IDF features.

print("\nStarting model training (Logistic Regression)...")
# Build the estimator and fit it in one expression (fit returns the estimator itself).
# max_iter is raised to 1000 to give the solver room to converge; random_state pins the seed.
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_tfidf, y_train)
print("Model training complete.")


Starting model training (Logistic Regression)...
Model training complete.


# 5. Prediction

In [14]:
# Use the trained model to predict genres for the test data.
# X_test_tfidf was produced by the vectorizer fitted on the training text,
# so its columns match the features the model was trained on.

print("\nMaking predictions on test data...")
y_pred = model.predict(X_test_tfidf)  # one predicted genre label per row of X_test_tfidf
print("Predictions complete.")


Making predictions on test data...
Predictions complete.


# 6. Model Evaluation 

In [15]:
# Evaluate the performance of the trained model by comparing its predictions
# with the actual genres from the test_data_solution.txt file.

In [16]:
print("\n--- Model Evaluation ---")
# Fraction of test rows whose predicted genre equals the solution genre.
accuracy = accuracy_score(y_true=y_test_true, y_pred=y_pred)
print("Accuracy: {:.4f}".format(accuracy))


--- Model Evaluation ---
Accuracy: 0.5865


In [17]:
# Per-genre precision, recall and f1-score give a more detailed picture than
# overall accuracy. zero_division=0 reports 0 for a metric whose denominator is zero.
print("\nClassification Report:")
report_text = classification_report(y_true=y_test_true, y_pred=y_pred, zero_division=0)
print(report_text)



Classification Report:
               precision    recall  f1-score   support

      action        0.48      0.31      0.37      1314
       adult        0.62      0.26      0.37       590
   adventure        0.58      0.18      0.28       775
   animation        0.51      0.09      0.15       498
   biography        0.00      0.00      0.00       264
      comedy        0.54      0.60      0.57      7446
       crime        0.33      0.04      0.07       505
 documentary        0.67      0.84      0.75     13096
       drama        0.55      0.77      0.64     13612
      family        0.47      0.09      0.15       783
     fantasy        0.46      0.05      0.09       322
   game-show        0.88      0.52      0.66       193
     history        1.00      0.00      0.01       243
      horror        0.66      0.59      0.62      2204
       music        0.64      0.46      0.53       731
     musical        0.27      0.01      0.03       276
     mystery        0.38      0.02      

In [19]:
# --- Test the model on a new/unseen movie description ---
print("\n--- Testing on New Movie Descriptions ---")

# Step 1: Write some new descriptions (list of strings)
sample_descriptions = [
    "A young boy discovers he has magical powers and attends a school for wizards.",
    "Two detectives hunt a serial killer who uses the seven deadly sins as motives.",
    "A hilarious story of three friends who go on a crazy road trip.",
    "A heartbreaking tale of love and sacrifice set during World War II."
]

# Step 2: Clean the descriptions exactly like the training text.
# The vectorizer's vocabulary was learned from CLEANED_DESCRIPTION (lowercased,
# stop words removed, stemmed), so raw words such as "discovers" would not match
# their stemmed vocabulary entries and would be silently dropped.
sample_cleaned = [preprocess_text(desc) for desc in sample_descriptions]

# Step 3: Transform the cleaned descriptions using the SAME tfidf_vectorizer
sample_tfidf = tfidf_vectorizer.transform(sample_cleaned)

# Step 4: Predict genres
predicted_genres = model.predict(sample_tfidf)

# Step 5: Print results (show the original sentence; strip any whitespace
# padding carried by the genre labels so the output reads cleanly)
for desc, genre in zip(sample_descriptions, predicted_genres):
    print(f"\nDescription: {desc}\nPredicted Genre: {genre.strip()}")



--- Testing on New Movie Descriptions ---

Description: A young boy discovers he has magical powers and attends a school for wizards.
Predicted Genre:  drama 

Description: Two detectives hunt a serial killer who uses the seven deadly sins as motives.
Predicted Genre:  thriller 

Description: A hilarious story of three friends who go on a crazy road trip.
Predicted Genre:  comedy 

Description: A heartbreaking tale of love and sacrifice set during World War II.
Predicted Genre:  drama 
