In [51]:
# Import all the libraries we'll use

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')  # keeps your output clean and less scary


In [73]:
#   Loading the train_data.txt (uploaded into same folder as notebook)

train_df = pd.read_csv("train_data.txt", sep=" ::: ", engine="python", names=["ID", "Title", "Genre", "Description"])

# Drop unused columns
train_df = train_df.drop(columns=["ID", "Title"])

# Check what it looks like
train_df.head()



Unnamed: 0,Genre,Description
0,drama,Listening in to a conversation between his doc...
1,thriller,A brother and sister with a past incestuous re...
2,adult,As the bus empties the students for their fiel...
3,drama,To help their unemployed father make ends meet...
4,drama,The film's title refers not only to the un-rec...


In [75]:
# Turning the text into numbers with TF-IDF

tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

X = tfidf.fit_transform(train_df["Description"]).toarray()
y = train_df["Genre"]



In [77]:
#   Spliting the data for validation

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [79]:
# Training the Logistic Regression model

model = LogisticRegression()
model.fit(X_train, y_train)


In [95]:
#Evaluate the model on validation set

y_pred = model.predict(X_val)

print("Accuracy:", accuracy_score(y_val, y_pred))
print("\n Classification Report:\n", classification_report(y_val, y_pred))


Accuracy: 0.5793599557318085

 Classification Report:
               precision    recall  f1-score   support

      action       0.52      0.25      0.34       263
       adult       0.72      0.21      0.32       112
   adventure       0.45      0.17      0.24       139
   animation       0.67      0.10      0.17       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.59      0.55      1443
       crime       0.29      0.02      0.04       107
 documentary       0.67      0.84      0.74      2659
       drama       0.54      0.78      0.64      2697
      family       0.39      0.07      0.12       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.94      0.42      0.59        40
     history       0.00      0.00      0.00        45
      horror       0.64      0.56      0.60       431
       music       0.63      0.47      0.54       144
     musical       0.50      0.02      0.04        50
     mystery       0.00   

In [81]:
# Save the model and vectorizer

joblib.dump(model, "genre_model.pkl")
joblib.dump(tfidf, "genre_vectorizer.pkl")


['genre_vectorizer.pkl']

In [82]:
#Load test_data.txt

test_df = pd.read_csv("test_data.txt", sep=" ::: ", engine="python", names=["ID", "Title", "Description"])

# Just the descriptions to predict on
test_descriptions = test_df["Description"]


In [83]:
# Predictions

X_test = tfidf.transform(test_descriptions).toarray()
test_predictions = model.predict(X_test)

# Add to dataframe
test_df["Predicted Genre"] = test_predictions

# Show sample predictions
test_df[["Title", "Predicted Genre"]].head(10)


Unnamed: 0,Title,Predicted Genre
0,Edgar's Lunch (1998),short
1,La guerra de papá (1977),drama
2,Off the Beaten Track (2010),documentary
3,Meu Amigo Hindu (2015),drama
4,Er nu zhai (1955),drama
5,Riddle Room (2016),drama
6,L'amica (1969),drama
7,Ina Mina Dika (1989),comedy
8,Equinox Special: Britain's Tornados (2005),documentary
9,Press (2011),drama


In [91]:
# Save to CSV for your report/video/GitHub

test_df.to_csv("test_predictions.csv", index=False)
print(" Predictions saved to test_predictions.csv")


 Predictions saved to test_predictions.csv
