In [6]:
# Install Kaggle and download dataset
!pip install kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d hijest/genre-classification-dataset-imdb
!unzip -o genre-classification-dataset-imdb.zip


cp: cannot stat 'kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb
License(s): other
genre-classification-dataset-imdb.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  genre-classification-dataset-imdb.zip
  inflating: Genre Classification Dataset/description.txt  
  inflating: Genre Classification Dataset/test_data.txt  
  inflating: Genre Classification Dataset/test_data_solution.txt  
  inflating: Genre Classification Dataset/train_data.txt  


In [7]:
import pandas as pd

# Check contents of the extracted folder
!ls "Genre Classification Dataset"

# Load training data
train_file = "Genre Classification Dataset/train_data.txt"
df = pd.read_csv(train_file, delimiter='\t', names=["Plot", "Genre"])

# Inspect the data
print(df.head())
print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")


description.txt  test_data_solution.txt  test_data.txt	train_data.txt
                                                Plot  Genre
0  1 ::: Oscar et la dame rose (2009) ::: drama :...    NaN
1  2 ::: Cupid (1997) ::: thriller ::: A brother ...    NaN
2  3 ::: Young, Wild and Wonderful (1980) ::: adu...    NaN
3  4 ::: The Secret Sin (1915) ::: drama ::: To h...    NaN
4  5 ::: The Unrecovered (2007) ::: drama ::: The...    NaN
Dataset contains 54214 rows and 2 columns.


In [8]:
# Drop missing values
df.dropna(inplace=True)

# Convert text to lowercase
df['Plot'] = df['Plot'].str.lower()

# Display class distribution
print(df['Genre'].value_counts())


Series([], Name: count, dtype: int64)


In [12]:
print(df.empty)  # Should return False if the DataFrame is not empty


True


In [13]:
print(df.shape)  # Should return (rows, columns)


(0, 2)


In [14]:
print(df['Plot'].isna().sum(), df['Genre'].isna().sum())  # Count NaN values
print(df['Plot'].head(), df['Genre'].head())  # Check the content


0 0
Series([], Name: Plot, dtype: object) Series([], Name: Genre, dtype: float64)


In [15]:
print(df.head())  # Ensure it contains data


Empty DataFrame
Columns: [Plot, Genre]
Index: []


In [22]:
# Replace missing values for 'Plot' and 'Genre'
df.loc[:, 'Plot'] = df['Plot'].fillna('')
df.loc[:, 'Genre'] = df['Genre'].fillna('unknown')


In [23]:
df.loc[:, 'Genre'] = df['Genre'].fillna('unknown').astype(str)


In [24]:
# Check for required columns
if 'Plot' not in df.columns or 'Genre' not in df.columns:
    raise ValueError("The DataFrame must contain 'Plot' and 'Genre' columns.")

# Replace NaN values and enforce data types
df.loc[:, 'Plot'] = df['Plot'].fillna('')
df.loc[:, 'Genre'] = df['Genre'].fillna('unknown').astype(str)

# Remove rows with empty 'Plot'
df = df[df['Plot'].str.strip() != '']

# Handle cases where DataFrame might be empty
if df.empty:
    raise ValueError("The DataFrame is empty after preprocessing.")

# Convert 'Plot' to lowercase
df.loc[:, 'Plot'] = df['Plot'].str.lower()

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df['Plot'], df['Genre'], test_size=0.1, random_state=42
)

# Summary
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


Training samples: 48792, Testing samples: 5422


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Check for required columns
if 'Plot' not in df.columns or 'Genre' not in df.columns:
    raise ValueError("The DataFrame must contain 'Plot' and 'Genre' columns.")

# Replace NaN values and enforce data types
df.loc[:, 'Plot'] = df['Plot'].fillna('')
df.loc[:, 'Genre'] = df['Genre'].fillna('unknown').astype(str)

# Remove rows with empty 'Plot'
df = df[df['Plot'].str.strip() != '']

# Handle cases where DataFrame might be empty
if df.empty:
    raise ValueError("The DataFrame is empty after preprocessing.")

# Convert 'Plot' to lowercase
df.loc[:, 'Plot'] = df['Plot'].str.lower()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['Plot'], df['Genre'], test_size=0.1, random_state=42
)

# TF-IDF Vectorization: Convert text data to numerical features
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Predict and evaluate
nb_predictions = nb_classifier.predict(X_test_tfidf)
print("Naive Bayes Classifier:")
print(f"Accuracy: {accuracy_score(y_test, nb_predictions)}")
print(classification_report(y_test, nb_predictions))

# Summary
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")



Naive Bayes Classifier:
Accuracy: 1.0
              precision    recall  f1-score   support

     unknown       1.00      1.00      1.00      5422

    accuracy                           1.00      5422
   macro avg       1.00      1.00      1.00      5422
weighted avg       1.00      1.00      1.00      5422

Training samples: 48792, Testing samples: 5422
