<a href="https://colab.research.google.com/github/kalyani-m-g/CODSOFT/blob/main/codsoft_task1_kalyani_mg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q kagglehub transformers datasets accelerate torch

In [None]:
import kagglehub
import os
import pandas as pd
import numpy as np
import torch


In [None]:
# Fetch the IMDb genre-classification dataset via kagglehub and show the
# directory path that the download call returned.
path = kagglehub.dataset_download("hijest/genre-classification-dataset-imdb")
print(f"Dataset path: {path}")

Using Colab cache for faster access to the 'genre-classification-dataset-imdb' dataset.
Dataset path: /kaggle/input/genre-classification-dataset-imdb


In [None]:
# Inspect the top-level entries of the downloaded dataset directory.
os.listdir(path)

['Genre Classification Dataset']

In [None]:
# Point at the "Genre Classification Dataset" subfolder and list the files it contains.
data_dir = os.path.join(path, "Genre Classification Dataset")
os.listdir(data_dir)


['description.txt',
 'test_data_solution.txt',
 'test_data.txt',
 'train_data.txt']

In [None]:
# Location of the training split inside the dataset folder.
train_file = os.path.join(data_dir, "train_data.txt")

# The file has no header row and separates fields with " ::: "; a multi-character
# separator requires the python parsing engine.
df = pd.read_csv(train_file, names=["id", "title", "genre", "plot"], sep=" ::: ", engine="python")

# Preview the first rows.
df.head()


Unnamed: 0,id,title,genre,plot
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [None]:
# Sanity check: frame dimensions, then the count of missing values per column.
print(df.shape)
print(df.isna().sum())

(54214, 4)
id       0
title    0
genre    0
plot     0
dtype: int64


In [None]:
# Show the ten most frequent genres and how many rows each has.
df["genre"].value_counts().head(10)

Unnamed: 0_level_0,count
genre,Unnamed: 1_level_1
drama,13613
documentary,13096
comedy,7447
short,5073
horror,2204
thriller,1591
action,1315
western,1032
reality-tv,884
family,784


In [None]:
# Keep only genres that have at least 100 examples.
genre_counts = df["genre"].value_counts()
valid_genres = genre_counts[genre_counts >= 100].index

# .copy() makes the filtered frame independent of the frame it was selected
# from, so adding columns to it afterwards cannot trigger pandas'
# SettingWithCopyWarning when the filter actually drops rows.
df = df[df["genre"].isin(valid_genres)].copy()

print("Remaining samples:", len(df))
print("Remaining genres:", df["genre"].nunique())

Remaining samples: 54214
Remaining genres: 27


In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each genre string to an integer class id stored in a new "label" column.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["genre"])

# Number of distinct class ids; used later to size the classification head.
num_labels = df["label"].nunique()
print(f"Number of labels: {num_labels}")

Number of labels: 27


In [None]:
from sklearn.model_selection import train_test_split



In [None]:
# Work on a fixed-seed random subset of 10,000 rows.
df_small = df.sample(random_state=42, n=10000)

# 80/20 train/validation split, stratified on the label column.
train_df, val_df = train_test_split(df_small, stratify=df_small["label"], test_size=0.2, random_state=42)


In [None]:
from datasets import Dataset

# Wrap only the text and label columns of each split in a Hugging Face Dataset.
train_dataset = Dataset.from_pandas(train_df.loc[:, ["plot", "label"]])
val_dataset = Dataset.from_pandas(val_df.loc[:, ["plot", "label"]])

In [None]:
from transformers import AutoTokenizer

# Checkpoint name, used here for the tokenizer and later for the model.
model_name = "distilbert-base-uncased"
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def tokenize(batch):
    """Tokenize the ``"plot"`` entries of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with a ``"plot"`` key holding the text(s) to encode.

    Returns:
        Whatever the tokenizer returns for those texts when asked to truncate
        and pad every sequence to a length of 256.
    """
    plots = batch["plot"]
    encoded = tokenizer(plots, truncation=True, max_length=256, padding="max_length")
    return encoded


In [None]:
# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# Return torch tensors, restricted to the model inputs and the label.
train_dataset.set_format(columns=["input_ids", "attention_mask", "label"], type="torch")
val_dataset.set_format(columns=["input_ids", "attention_mask", "label"], type="torch")

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a sequence-classification head that has one output per label.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration: evaluate after every epoch, write no checkpoints,
# and disable external experiment reporting.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # on CPU-only runtimes instead of failing.
    fp16=torch.cuda.is_available(),
    logging_steps=200,
    report_to="none"
)


In [None]:
from transformers import Trainer

# Bundle the model, training configuration and both dataset splits.
# The tokenizer is passed as `processing_class`; the older `tokenizer=`
# keyword is deprecated and emits a warning when the Trainer is built.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer
)


  trainer = Trainer(


In [None]:
# Run fine-tuning with the configured arguments.
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.0022,1.591658
2,1.4929,1.498376


TrainOutput(global_step=500, training_loss=1.6794453125, metrics={'train_runtime': 113.7068, 'train_samples_per_second': 140.713, 'train_steps_per_second': 4.397, 'total_flos': 1060211662848000.0, 'train_loss': 1.6794453125, 'epoch': 2.0})

In [None]:
# Compute evaluation metrics on the validation split given to the Trainer.
trainer.evaluate()

{'eval_loss': 1.4983760118484497,
 'eval_runtime': 3.7936,
 'eval_samples_per_second': 527.2,
 'eval_steps_per_second': 16.607,
 'epoch': 2.0}

In [None]:
from sklearn.metrics import classification_report

# Run the model over the validation split.
preds = trainer.predict(val_dataset)

# True class ids, and the predicted class id (highest logit) for each example.
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=1)

# Pass every encoded class id explicitly so the rows line up with
# `target_names` even if some class never appears in y_true or y_pred;
# without `labels`, a missing class makes the report raise on the length
# mismatch. zero_division=0 reports 0 instead of warning for empty classes.
print(classification_report(
    y_true,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0
))

              precision    recall  f1-score   support

      action       0.44      0.14      0.22        49
       adult       0.00      0.00      0.00        21
   adventure       0.00      0.00      0.00        26
   animation       0.00      0.00      0.00        19
   biography       0.00      0.00      0.00        12
      comedy       0.53      0.57      0.55       266
       crime       0.00      0.00      0.00        20
 documentary       0.74      0.86      0.80       490
       drama       0.56      0.83      0.67       492
      family       0.00      0.00      0.00        27
     fantasy       0.00      0.00      0.00        14
   game-show       0.00      0.00      0.00         7
     history       0.00      0.00      0.00         8
      horror       0.32      0.72      0.45        78
       music       1.00      0.11      0.20        27
     musical       0.00      0.00      0.00         9
     mystery       0.00      0.00      0.00        10
        news       0.00    

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
model.save_pretrained(save_directory="bert_genre_model")
tokenizer.save_pretrained(save_directory="bert_genre_model")

('bert_genre_model/tokenizer_config.json',
 'bert_genre_model/special_tokens_map.json',
 'bert_genre_model/vocab.txt',
 'bert_genre_model/added_tokens.json',
 'bert_genre_model/tokenizer.json')

In [None]:
def predict(plot):
    """Predict the genre for a single plot description.

    Uses the module-level ``model``, ``tokenizer`` and ``label_encoder``.

    Args:
        plot: Plot text to classify.

    Returns:
        The genre obtained by mapping the highest-scoring class id back
        through ``label_encoder``.
    """
    # Switch to inference mode so dropout is disabled regardless of whether
    # this is called right after training or after an evaluation pass.
    model.eval()
    device = model.device

    inputs = tokenizer(
        plot,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    )

    # Move every input tensor to the device the model lives on.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**inputs)

    pred_id = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([pred_id])[0]

In [None]:
# Show which device the model currently lives on.
print(model.device)

cuda:0


In [None]:
from sklearn.metrics import confusion_matrix

# Fix the label order to every encoded class id so that row/column i always
# corresponds to label_encoder.classes_[i], even when a class is absent from
# both the true and the predicted labels.
cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(label_encoder.classes_)))
print(cm)


[[  7   0   0   0   0   0   0   2  19   0   0   0   0  14   0   0   0   0
    0   0   0   1   0   0   0   0   6]
 [  0   0   0   0   0   9   0   0   1   0   0   0   0   3   0   0   0   0
    0   0   0   8   0   0   0   0   0]
 [  2   0   0   0   0   1   0   4   9   0   0   0   0   8   0   0   0   0
    0   0   0   1   0   0   0   0   1]
 [  0   0   0   0   0   5   0   2   3   0   0   0   0   5   0   0   0   0
    0   0   0   4   0   0   0   0   0]
 [  0   0   0   0   0   1   0   6   5   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  1   0   0   0   0 152   0   8  89   0   0   0   0   5   0   0   0   0
    1   0   0   9   0   0   0   0   1]
 [  0   0   0   0   0   2   0   2  12   0   0   0   0   2   0   0   0   0
    0   0   0   2   0   0   0   0   0]
 [  2   0   0   0   0  12   0 423  29   0   0   0   0   5   0   0   0   0
    1   0   0  17   0   0   0   0   1]
 [  1   0   0   0   0  36   0  12 406   0   0   0   0  14   0   0   0   0
    0   0   0  19   0 

In [None]:
# Example: classify one hand-written plot summary with the fine-tuned model.
test_plot = "A detective investigates a series of mysterious murders in a dark city."
predicted_genre = predict(test_plot)

print(f"Predicted Genre: {predicted_genre}")


Predicted Genre: horror
