<a href="https://colab.research.google.com/github/invarrow/dk-tech/blob/main/movie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
# Tell the Kaggle CLI to use /content as its configuration directory
os.environ.update(KAGGLE_CONFIG_DIR='/content')

In [1]:
# Download the IMDb genre-classification dataset archive with the Kaggle CLI.
# NOTE(review): the recorded output shows a .zip being saved and no extraction step is
# visible in this notebook, yet later cells open 'train_data.txt' directly. Confirm
# where the archive gets unpacked before relying on Restart & Run All.
!kaggle datasets download -d hijest/genre-classification-dataset-imdb

Dataset URL: https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb
License(s): other
Downloading genre-classification-dataset-imdb.zip to /content
 89% 37.0M/41.7M [00:02<00:00, 18.1MB/s]
100% 41.7M/41.7M [00:02<00:00, 18.8MB/s]


In [3]:
# Install the notebook's dependencies into the environment of the running kernel.
# `%pip` is used instead of `!pip` because the shell's `pip` is not guaranteed to
# belong to the interpreter this kernel runs on; `-q` keeps installer logs out of the
# saved output.
# TODO: pin exact versions (pkg==X.Y.Z) once the known-good set has been recorded.
%pip install -q pandas numpy scikit-learn nltk





In [4]:
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK resources: the stop-word corpus and the punkt tokenizer models
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Locations of the raw train/test files, relative to the working directory
train_data_path, test_data_path = 'train_data.txt', 'test_data.txt'

# Preprocessing function
# Cache for the stemmer and stop-word set; filled on the first preprocess_text() call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(stemmer, stop_words)`` pair, building it on first use.

    Loading the stop-word list and constructing the stemmer are independent of the
    text being processed, so they are done once and reused instead of being repeated
    for every description.
    """
    if not _PREPROCESS_RESOURCES:
        # Build both values before touching the cache so a failed stop-word lookup
        # cannot leave it half-populated.
        stop_words = frozenset(nltk.corpus.stopwords.words('english'))
        _PREPROCESS_RESOURCES.update(
            stemmer=nltk.PorterStemmer(),
            stop_words=stop_words,
        )
    return _PREPROCESS_RESOURCES['stemmer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize a raw description into a space-joined string of stemmed tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop English stop words, Porter-stem what remains.

    Args:
        text: The raw description string.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.
    """
    stemmer, stop_words = _get_preprocess_resources()
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase and split into words
    words = text.lower().split()
    # Stop words are filtered before stemming, i.e. matched on the unstemmed form
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

# Column layouts of the ' ::: '-separated data files
TRAIN_COLUMNS = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
TEST_COLUMNS = ['ID', 'TITLE', 'DESCRIPTION']


def load_records(path, columns):
    """Parse a ' ::: '-separated data file into a list of row dicts.

    Each line is stripped and split on ' ::: '. A line is kept only when it yields
    exactly ``len(columns)`` fields; its 'DESCRIPTION' field is passed through
    ``preprocess_text`` before the row is stored.

    Args:
        path: Path of the text file to read.
        columns: Field names, in file order. Must include 'DESCRIPTION'.

    Returns:
        A list of dicts mapping each column name to its string value.

    Warns:
        UserWarning: when one or more non-blank lines were skipped because they
        did not have the expected number of fields.
    """
    import warnings  # local import: only needed for the malformed-line report

    records = []
    skipped = 0
    # Decode explicitly as UTF-8: titles contain accented characters, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            parts = line.split(' ::: ')
            if len(parts) != len(columns):
                skipped += 1
                continue
            record = dict(zip(columns, parts))
            record['DESCRIPTION'] = preprocess_text(record['DESCRIPTION'])
            records.append(record)
    if skipped:
        warnings.warn(
            f"{path}: skipped {skipped} line(s) that did not have {len(columns)} fields"
        )
    return records


# Load and preprocess the train data
train_data = load_records(train_data_path, TRAIN_COLUMNS)
train_df = pd.DataFrame(train_data, columns=TRAIN_COLUMNS)

# Load and preprocess the test data
test_data = load_records(test_data_path, TEST_COLUMNS)
test_df = pd.DataFrame(test_data, columns=TEST_COLUMNS)
test_df.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,ID,TITLE,DESCRIPTION
0,1,Edgar's Lunch (1998),l r brane love life car apart job especi girlf...
1,2,La guerra de papá (1977),spain march quico naughti child three belong w...
2,3,Off the Beaten Track (2010),one year life albin famili shepherd north tran...
3,4,Meu Amigo Hindu (2015),father die spoken brother year seriou cancer d...
4,5,Er nu zhai (1955),known intern martial art superstar bruce lee a...


In [6]:
# Feature extraction: TF-IDF over the cleaned descriptions, with a capped vocabulary
MAX_TFIDF_FEATURES = 5000
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

# Genre labels are the training targets
y_train = train_df['GENRE']

# Fit the vectorizer on the training descriptions and encode them in one pass
train_descriptions = train_df['DESCRIPTION']
X_train = vectorizer.fit_transform(train_descriptions)

# Encode the test descriptions with the vectorizer fitted above (no refit)
test_descriptions = test_df['DESCRIPTION']
X_test = vectorizer.transform(test_descriptions)


In [7]:
# Build the logistic-regression genre classifier and fit it on the TF-IDF features
classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict a genre for every test description and attach it as a new column
predicted_genres = classifier.predict(X_test)
test_df['PREDICTED_GENRE'] = predicted_genres
test_df.head()


Unnamed: 0,ID,TITLE,DESCRIPTION,PREDICTED_GENRE
0,1,Edgar's Lunch (1998),l r brane love life car apart job especi girlf...,drama
1,2,La guerra de papá (1977),spain march quico naughti child three belong w...,drama
2,3,Off the Beaten Track (2010),one year life albin famili shepherd north tran...,documentary
3,4,Meu Amigo Hindu (2015),father die spoken brother year seriou cancer d...,drama
4,5,Er nu zhai (1955),known intern martial art superstar bruce lee a...,drama


In [8]:
# Sample plot summary used to try the trained model on a single input
single_description = (
    "\n"
    "Emily Burns is being held captive in a room with no idea as to why or how she got there.\n"
    "Determined to escape and return to her daughter and husband, Emily discovers clues within\n"
    "the room that help explain what she's doing there. They even provide clues about who she is...\n"
    "but will they help her escape?\n"
)

# Preprocess the description
# (same cleaning function that was applied to the training descriptions, so the
# sample's tokens are in the form the fitted vectorizer was trained on)
preprocessed_description = preprocess_text(single_description)


In [15]:

# Encode the cleaned sample with the already-fitted TF-IDF vectorizer; the sample is
# wrapped in a one-element list because transform() is called on a collection of documents.
single_description_tfidf = vectorizer.transform([preprocessed_description])


In [14]:

# Classify the encoded sample; predict() returns one label per input row, so the
# single prediction is at index 0.
# NOTE(review): the saved execution counts show this cell (In [14]) ran before the
# cell that builds single_description_tfidf (In [15]); re-run the notebook top to
# bottom to confirm the printed genre.
predicted_genre = classifier.predict(single_description_tfidf)

print(f"Predicted Genre: {predicted_genre[0]}")


Predicted Genre: short
