<a href="https://colab.research.google.com/github/kalyani-m-g/CODSOFT/blob/main/codsoft_task1_kalyani_mg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
!pip install -q kagglehub transformers datasets accelerate torch scikit-learn


In [27]:
import os
import numpy as np
import pandas as pd
import torch

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)


In [28]:
import kagglehub

# Resolve the Kaggle dataset to a local directory.
path = kagglehub.dataset_download("hijest/genre-classification-dataset-imdb")

# The training file lives in a sub-folder of that directory.
data_dir = os.path.join(path, "Genre Classification Dataset")
train_file = os.path.join(path, "Genre Classification Dataset", "train_data.txt")


Using Colab cache for faster access to the 'genre-classification-dataset-imdb' dataset.


In [29]:
# The file has no header row; fields are separated by the literal " ::: ".
# A multi-character separator requires the python parsing engine.
column_names = ["id", "title", "genre", "plot"]
df = pd.read_csv(train_file, names=column_names, sep=" ::: ", engine="python")

print(df.shape)
df.head()


(54214, 4)


Unnamed: 0,id,title,genre,plot
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [30]:
# Genres with fewer rows than this are dropped before label encoding.
MIN_GENRE_COUNT = 100

genre_counts = df["genre"].value_counts()
valid_genres = genre_counts.loc[genre_counts.ge(MIN_GENRE_COUNT)].index

# Keep only rows whose genre survived the frequency cut, with a fresh 0..n-1 index.
keep_mask = df["genre"].isin(valid_genres)
df = df.loc[keep_mask].reset_index(drop=True)

# Map each genre string to an integer class id.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["genre"])

num_labels = df["label"].nunique()
print("Number of labels:", num_labels)


Number of labels: 27


In [31]:
# Work on a fixed-seed subsample to keep fine-tuning fast; never ask for more
# rows than exist, since DataFrame.sample raises when n > len(df).
SAMPLE_SIZE = 10000
df_small = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

# Stratified 80/20 split so both splits keep the same genre proportions.
train_df, val_df = train_test_split(
    df_small,
    test_size=0.2,
    stratify=df_small["label"],
    random_state=42
)

# preserve_index=False: the sampled/split frames carry a shuffled index, which
# from_pandas would otherwise store as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df[["plot", "label"]], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[["plot", "label"]], preserve_index=False)


In [32]:
# Checkpoint name; reused below when the classification model is loaded.
model_name = "distilbert-base-uncased"
# Tokenizer matching that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the "plot" entries of a batch with the notebook-level tokenizer.

    Every plot is truncated and padded to exactly 256 tokens.

    Args:
        batch: mapping with a "plot" key holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    return tokenizer(batch["plot"], **encode_options)

# Columns exposed as torch tensors; other columns are hidden from indexing.
torch_columns = ["input_ids", "attention_mask", "label"]

# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# set_format works in place, so the same call is applied to each split in turn.
for split in (train_dataset, val_dataset):
    split.set_format(type="torch", columns=list(torch_columns))


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [33]:
# Load the pretrained checkpoint with a classification head sized to the number of genres.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Mixed precision is only requested when a CUDA device is present.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    # Where the run writes its files; no checkpoints are saved and no external logger is used.
    output_dir="./results",
    save_strategy="no",
    report_to="none",
    logging_steps=200,
    # Evaluate on the validation split once per epoch.
    eval_strategy="epoch",
    # Optimisation hyperparameters.
    num_train_epochs=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=use_fp16,
)


In [35]:
# Sanity check: show which concrete model class the Auto* factory picked.
print(model.__class__)


<class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'>


In [36]:
# `tokenizer=` is deprecated for Trainer.__init__; the tokenizer is now passed
# as `processing_class`, which also silences the FutureWarning this call raised.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.6315,1.504602
2,1.2674,1.410303


TrainOutput(global_step=1000, training_loss=1.5593038024902344, metrics={'train_runtime': 119.8719, 'train_samples_per_second': 133.476, 'train_steps_per_second': 8.342, 'total_flos': 1060211662848000.0, 'train_loss': 1.5593038024902344, 'epoch': 2.0})

In [37]:
# Run the fine-tuned model over the validation split.
preds = trainer.predict(val_dataset)

y_true = preds.label_ids
# Highest-scoring class per row of the prediction matrix.
y_pred = np.argmax(preds.predictions, axis=1)

# Pass the full set of class ids explicitly so each row of the report is tied
# to the right genre name. Without `labels`, the report infers the classes from
# y_true/y_pred and fails (or misaligns names) if any genre is absent from both.
class_ids = np.arange(len(label_encoder.classes_))

print(
    classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0
    )
)


              precision    recall  f1-score   support

      action       0.54      0.43      0.48        49
       adult       0.00      0.00      0.00        21
   adventure       0.00      0.00      0.00        26
   animation       0.00      0.00      0.00        19
   biography       0.00      0.00      0.00        12
      comedy       0.57      0.58      0.58       266
       crime       0.00      0.00      0.00        20
 documentary       0.76      0.85      0.80       490
       drama       0.59      0.80      0.68       492
      family       0.00      0.00      0.00        27
     fantasy       0.00      0.00      0.00        14
   game-show       0.00      0.00      0.00         7
     history       0.00      0.00      0.00         8
      horror       0.44      0.76      0.56        78
       music       0.73      0.41      0.52        27
     musical       0.00      0.00      0.00         9
     mystery       0.00      0.00      0.00        10
        news       0.00    

In [38]:
# Write the fine-tuned weights and the tokenizer files into the same folder.
export_dir = "bert_genre_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


('bert_genre_model/tokenizer_config.json',
 'bert_genre_model/special_tokens_map.json',
 'bert_genre_model/vocab.txt',
 'bert_genre_model/added_tokens.json',
 'bert_genre_model/tokenizer.json')

In [39]:
def predict_genre(plot_text):
    """Predict the genre of a single plot description.

    Uses the notebook-level `model`, `tokenizer` and `label_encoder`.

    Args:
        plot_text: the plot summary to classify.

    Returns:
        The genre name that `label_encoder` maps the top-scoring class id to.
    """
    # Put the model in evaluation mode before inference.
    model.eval()

    encoded = tokenizer(
        plot_text,
        max_length=256,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    # Inputs must live on the same device as the model weights.
    encoded = encoded.to(model.device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        logits = model(**encoded).logits

    best_class = logits.argmax(dim=1).item()
    return label_encoder.inverse_transform([best_class])[0]


In [42]:
# Read a plot from the user and classify it with the fine-tuned model.
plot = input("Enter movie plot:\n")
predicted = predict_genre(plot)
print("Predicted Genre:", predicted)


Enter movie plot:
After losing his family to a violent crime, a former hitman comes out of retirement to seek vengeance. He faces relentless enemies, corrupt officials, and old rivals, relying on his skills, wit, and unbreakable determination. His journey blurs the line between justice and revenge, leaving him questioning what he has become
Predicted Genre: drama
