#  Movie Genre Classification using Machine Learning
### Internship Project - MyDailyWork
### Name: Jyothish Reddy

In [35]:
import os
# Show the contents of the current working directory (the data files are
# opened by relative path in the cells below).
os.listdir(".")

['.config',
 'test_data_solution.txt',
 'tfidf_vectorizer.pkl',
 'test_data.txt',
 '.ipynb_checkpoints',
 'description.txt',
 'movie_genre_model.pkl',
 'train_data.txt',
 'sample_data']

## 1. Import Libraries and Load Training Data

In [36]:
import pandas as pd

# Column names are supplied explicitly, so the first line of the file is read
# as data rather than as a header. The multi-character ":::" separator is
# parsed by the Python engine.
# NOTE(review): downstream outputs show values padded with spaces, which
# suggests the fields are written as " ::: " — confirm before stripping, and
# strip the test files the same way so the labels keep matching.
train_data = pd.read_csv(
    "train_data.txt",
    names=["ID", "Title", "Genre", "Description"],
    sep=":::",
    engine="python",
)

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


## 2. Inspect Dataset Shape

In [37]:
# Report the training frame size as (rows, columns).
n_rows, n_cols = train_data.shape
print((n_rows, n_cols))

(54214, 4)


## 3. Feature Selection

In [38]:
# Model input is the description text; the label to predict is the genre.
X, y = train_data["Description"], train_data["Genre"]

# Print the first rows of the features, then of the labels.
for preview in (X, y):
    print(preview.head())

0     Listening in to a conversation between his do...
1     A brother and sister with a past incestuous r...
2     As the bus empties the students for their fie...
3     To help their unemployed father make ends mee...
4     The film's title refers not only to the un-re...
Name: Description, dtype: object
0        drama 
1     thriller 
2        adult 
3        drama 
4        drama 
Name: Genre, dtype: object


## 4. TF-IDF Vectorization

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the descriptions: English stop words are dropped and
# the vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")

# Learn the vocabulary/IDF weights and build the document-term matrix in one pass.
X_tfidf = vectorizer.fit_transform(X)

# (number of documents, number of features)
print(X_tfidf.shape)

(54214, 5000)


## 5. Train Test Split

In [40]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
# NOTE(review): the vectorizer was fit on all of X before this split, so the
# held-out rows contributed to the vocabulary and IDF weights. The validation
# score is therefore slightly optimistic; consider splitting the raw text
# first and fitting the vectorizer on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    random_state=42,
    test_size=0.2,
)

# Print the shape of the training part, then of the validation part.
for split in (X_train, X_test):
    print(split.shape)

(43371, 5000)
(10843, 5000)


## 6. Model Training

In [41]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier; the iteration cap is raised to 1000 to give
# the solver more room to converge.
model = LogisticRegression(max_iter=1000)

# Fit on the training split (the fitted estimator is the cell's output).
model.fit(X=X_train, y=y_train)

In [42]:
# Predict genres for the held-out validation split.
y_pred = model.predict(X=X_test)

## 7. Model Evaluation

In [43]:
from sklearn.metrics import accuracy_score, classification_report

# Overall fraction of validation rows whose predicted genre matches the label.
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n")
# Some genres are never predicted, so their precision is 0/0. Setting
# zero_division=0 reports the same 0.00 the default would, without emitting
# an UndefinedMetricWarning for every affected average.
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.5799133081250576

Classification Report:

               precision    recall  f1-score   support

      action        0.51      0.25      0.34       263
       adult        0.75      0.21      0.33       112
   adventure        0.42      0.14      0.21       139
   animation        0.60      0.09      0.15       104
   biography        0.00      0.00      0.00        61
      comedy        0.51      0.59      0.55      1443
       crime        0.29      0.02      0.04       107
 documentary        0.66      0.85      0.74      2659
       drama        0.54      0.78      0.64      2697
      family        0.39      0.07      0.12       150
     fantasy        0.00      0.00      0.00        74
   game-show        0.94      0.42      0.59        40
     history        0.00      0.00      0.00        45
      horror        0.64      0.56      0.60       431
       music        0.62      0.47      0.54       144
     musical        1.00      0.02      0.04        50
     myste

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 8. Custom Prediction

In [44]:
# A single hand-written plot summary, wrapped in a list because the
# vectorizer expects an iterable of documents.
sample_plot = ["A group of astronauts travel to space and encounter alien life on a distant planet"]

# Reuse the already-fitted vectorizer so the features line up with training.
sample_vector = vectorizer.transform(sample_plot)

prediction = model.predict(sample_vector)

# The genre labels carry surrounding spaces from the raw file; strip them for
# display only (the stored prediction is left untouched).
print("Predicted Genre:", prediction[0].strip())

Predicted Genre:  sci-fi 


In [45]:
# Both test files use the same ":::" separator as the training file, which
# needs the Python parsing engine.
read_options = {"sep": ":::", "engine": "python"}

# Unlabelled test descriptions (three columns, no genre).
test_data = pd.read_csv(
    "test_data.txt",
    names=["ID", "Title", "Description"],
    **read_options,
)

# Same rows with the genre column included, used as ground truth later.
test_solution = pd.read_csv(
    "test_data_solution.txt",
    names=["ID", "Title", "Genre", "Description"],
    **read_options,
)

# Preview the unlabelled test frame.
test_data.head()

Unnamed: 0,ID,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


In [46]:
# Raw description text of the separate test file.
X_test_real = test_data["Description"]

# Transform only (no refit) so the features use the training vocabulary.
X_test_real_tfidf = vectorizer.transform(raw_documents=X_test_real)

In [47]:
# Predict genres for the separate test file.
y_test_real_pred = model.predict(X=X_test_real_tfidf)

In [48]:
# Ground-truth genres for the test file, taken from the solution file.
y_test_real = test_solution.loc[:, "Genre"]

## 9. Final Test Evaluation

In [49]:
from sklearn.metrics import accuracy_score

# Accuracy on the separate test file: predictions vs. the solution labels.
real_accuracy = accuracy_score(y_true=y_test_real, y_pred=y_test_real_pred)

print("Final Test Accuracy:", real_accuracy)

Final Test Accuracy: 0.5781918819188192


In [50]:
import pickle

# Persist the fitted classifier and vectorizer together: the model can only
# score text transformed by this exact vectorizer. Context managers ensure
# each file is flushed and closed even if pickling raises.
with open("movie_genre_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)