In [1]:
import time
start_time = time.time()


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import re

In [3]:
#Read the file and set the appropriate headers
df = pd.read_csv("test_data_solution.txt",sep=" ::: ",engine="python",header=None)
df.columns = ["ID", "Title_Year", "Genre", "Description"]
df.head()

Unnamed: 0,ID,Title_Year,Genre,Description
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a marti...


In [4]:
#Removed Title_year because we can't predict genre by a movie name and so it's not relevant
df.drop(columns='Title_Year',axis=1,inplace=True)
df

Unnamed: 0,ID,Genre,Description
0,1,thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,documentary,One year in the life of Albin and his family o...
3,4,drama,"His father has died, he hasn't spoken with his..."
4,5,drama,Before he was known internationally as a marti...
...,...,...,...
54195,54196,horror,"Covering multiple genres, Tales of Light & Dar..."
54196,54197,western,As Alice and Cora Munro attempt to find their ...
54197,54198,adult,"A movie 169 years in the making. Oliver Twist,..."
54198,54199,drama,"Popular, but mysterious rock D.J Mike Mallard ..."


In [5]:
#cleaning text before vectorizing
def clean_text(text):
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)      # Remove numbers
    return text.lower()

df["Description_clean"] = df["Description"].apply(lambda x: clean_text(str(x)))

In [6]:
def custom_tokenizer(text):
    text = re.sub(r"[^\w\s\d+]", "", text)  # Remove punctuation
    return text.lower().split()
#To get important word features based on tf-idf score
from sklearn.feature_extraction.text import TfidfVectorizer
descriptions = df["Description"].fillna("")
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer,max_features=750,       
    stop_words='english',    
    ngram_range=(1, 2))
X_tfidf  = vectorizer.fit_transform(descriptions)
feature_names=vectorizer.get_feature_names_out()
print(feature_names)
print(X_tfidf.toarray())



['2' '3' 'abandoned' 'able' 'accident' 'act' 'action' 'actor' 'actors'
 'actress' 'actually' 'adventure' 'affair' 'africa' 'african' 'age'
 'agent' 'ago' 'air' 'alive' 'america' 'american' 'ancient' 'angeles'
 'answer' 'answers' 'apart' 'apartment' 'appears' 'area' 'army' 'arrives'
 'art' 'artist' 'artists' 'asks' 'attack' 'attempt' 'attempts' 'attention'
 'audience' 'away' 'baby' 'bad' 'band' 'bank' 'based' 'battle' 'beautiful'
 'beauty' 'begin' 'beginning' 'begins' 'believe' 'believes' 'best'
 'best friend' 'better' 'big' 'biggest' 'birth' 'black' 'blood' 'bob'
 'body' 'book' 'born' 'boss' 'boy' 'boyfriend' 'boys' 'break' 'breaks'
 'bring' 'brings' 'british' 'broken' 'brother' 'brothers' 'brought'
 'building' 'business' 'california' 'called' 'came' 'camera' 'camp' 'car'
 'care' 'career' 'case' 'cast' 'caught' 'center' 'century' 'challenge'
 'challenges' 'chance' 'change' 'changed' 'changes' 'character'
 'characters' 'child' 'childhood' 'children' 'choice' 'christmas' 'church'
 'city'

In [7]:
# Convert sparse matrix to DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)
tfidf_df["Genre"] = df["Genre"].values

tfidf_df

Unnamed: 0,2,3,abandoned,able,accident,act,action,actor,actors,actress,...,years,years ago,years later,york,young,young man,young woman,younger,youth,Genre
0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,thriller
1,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.233346,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,comedy
2,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,documentary
3,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.115811,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,drama
4,0.000000,0.000000,0.0,0.0,0.184279,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,drama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54195,0.129836,0.130829,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.074581,0.0,0.0,0.0,0.143175,0.0,0.125709,0.0,0.0,horror
54196,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,western
54197,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.203199,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,adult
54198,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.159184,0.0,0.000000,0.0,0.0,drama


In [8]:
#using top k method to retrieve k most important words locally across a genre
top_k_per_genre = {}

for genre in tfidf_df["Genre"].unique():
    genre_subset = tfidf_df[tfidf_df["Genre"] == genre].drop(columns=["Genre"])
    mean_scores = genre_subset.mean(axis=0)
    top_k = mean_scores.sort_values(ascending=False).head(50)
    top_k_per_genre[genre] = top_k.index.tolist()

In [9]:
top_k_per_genre.keys()

dict_keys(['thriller', 'comedy', 'documentary', 'drama', 'horror', 'short', 'western', 'family', 'sport', 'romance', 'war', 'game-show', 'biography', 'adult', 'talk-show', 'action', 'music', 'crime', 'animation', 'sci-fi', 'adventure', 'reality-tv', 'fantasy', 'mystery', 'history', 'news', 'musical'])

In [10]:
# Flatten all top-K terms into a single set
custom_vocab = set()
for terms in top_k_per_genre.values():
    custom_vocab.update(terms)

# Re-vectorize using only these terms
vectorizer = TfidfVectorizer(vocabulary=list(custom_vocab))
X_filtered = vectorizer.fit_transform(df["Description"])


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_filtered, df["Genre"], test_size=0.2, random_state=42)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score

model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))


0.27518450184501847


In [13]:
end_time = time.time()
total_time = end_time - start_time
print(f"Notebook executed in {total_time:.2f} seconds")


Notebook executed in 27.90 seconds
