# Step 1: Import Dataset

In [21]:
import pandas as pd

In [22]:
with open('train_data.txt','r',encoding='utf-8') as file:
    lines=file.readlines()
    
data=[]
for line in lines:
    parts=line.strip().split(' ::: ')
    data.append(parts)
    
train=pd.DataFrame(data,columns=['ID','Title','Genre','Description'])

train.to_csv('train.csv',index=False)

In [23]:
with open('test_data_solution.txt','r',encoding='utf-8') as file:
    lines=file.readlines()
    
data=[]
for line in lines:
    parts=line.strip().split(' ::: ')
    data.append(parts)
    
test=pd.DataFrame(data,columns=['ID','Title','Genre','Description'])

test.to_csv('test.csv',index=False)

In [24]:
train.shape

(54214, 4)

In [25]:
test.shape

(54200, 4)

In [26]:
train.head()

Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [27]:
test.head()

Unnamed: 0,ID,Title,Genre,Description
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a marti...


In [28]:
train.dtypes

ID             object
Title          object
Genre          object
Description    object
dtype: object

In [29]:
test.dtypes

ID             object
Title          object
Genre          object
Description    object
dtype: object

# Step 2: EDA

In [30]:
train.isnull().sum()

ID             0
Title          0
Genre          0
Description    0
dtype: int64

In [31]:
test.isnull().sum()

ID             0
Title          0
Genre          0
Description    0
dtype: int64

In [34]:
train.drop(columns=['ID','Title'],axis=1,inplace=True)
test.drop(columns=['ID','Title'],axis=1,inplace=True)

In [35]:
train.head()

Unnamed: 0,Genre,Description
0,drama,Listening in to a conversation between his doc...
1,thriller,A brother and sister with a past incestuous re...
2,adult,As the bus empties the students for their fiel...
3,drama,To help their unemployed father make ends meet...
4,drama,The film's title refers not only to the un-rec...


In [36]:
test.head()

Unnamed: 0,Genre,Description
0,thriller,"L.R. Brane loves his life - his car, his apart..."
1,comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,documentary,One year in the life of Albin and his family o...
3,drama,"His father has died, he hasn't spoken with his..."
4,drama,Before he was known internationally as a marti...


In [37]:
df=pd.concat((train,test))
df.head()

Unnamed: 0,Genre,Description
0,drama,Listening in to a conversation between his doc...
1,thriller,A brother and sister with a past incestuous re...
2,adult,As the bus empties the students for their fiel...
3,drama,To help their unemployed father make ends meet...
4,drama,The film's title refers not only to the un-rec...


In [38]:
import nltk
nltk.download('stopwords')
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords

stop_words = set(stopwords.words('english'))

def preprocess(s):
    # 1. Convert to lowercase
    s = s.lower()
    
    # 2. Remove punctuation
    s = s.translate(str.maketrans('', '', string.punctuation))
    
    # 3. Tokenize using gensim's simple_preprocess
    tokens = simple_preprocess(s, deacc=True)  # deacc=True removes punctuation as well
    
    # 4. Remove stopwords using gensim
    tokens = [word for word in tokens if word not in stop_words]
    
    # 5. Join tokens back into a single string
    processed_text = ' '.join(tokens)
    
    return processed_text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tanee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
df['Description_new']=df['Description'].apply(preprocess)

In [40]:
df.head()

Unnamed: 0,Genre,Description,Description_new
0,drama,Listening in to a conversation between his doc...,listening conversation doctor parents yearold ...
1,thriller,A brother and sister with a past incestuous re...,brother sister past incestuous relationship cu...
2,adult,As the bus empties the students for their fiel...,bus empties students field trip museum natural...
3,drama,To help their unemployed father make ends meet...,help unemployed father make ends meet edith tw...
4,drama,The film's title refers not only to the un-rec...,films title refers unrecovered bodies ground z...


In [41]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['Genre_encoded'] = label_encoder.fit_transform(df['Genre'])


class_names= list(label_encoder.classes_)

In [42]:
df.head()

Unnamed: 0,Genre,Description,Description_new,Genre_encoded
0,drama,Listening in to a conversation between his doc...,listening conversation doctor parents yearold ...,8
1,thriller,A brother and sister with a past incestuous re...,brother sister past incestuous relationship cu...,24
2,adult,As the bus empties the students for their fiel...,bus empties students field trip museum natural...,1
3,drama,To help their unemployed father make ends meet...,help unemployed father make ends meet edith tw...,8
4,drama,The film's title refers not only to the un-rec...,films title refers unrecovered bodies ground z...,8


# Step 3: Train models

In [43]:
from sklearn.model_selection import train_test_split
x = df["Description_new"]
y = df["Genre"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.5)

In [44]:
df.head()

Unnamed: 0,Genre,Description,Description_new,Genre_encoded
0,drama,Listening in to a conversation between his doc...,listening conversation doctor parents yearold ...,8
1,thriller,A brother and sister with a past incestuous re...,brother sister past incestuous relationship cu...,24
2,adult,As the bus empties the students for their fiel...,bus empties students field trip museum natural...,1
3,drama,To help their unemployed father make ends meet...,help unemployed father make ends meet edith tw...,8
4,drama,The film's title refers not only to the un-rec...,films title refers unrecovered bodies ground z...,8


In [45]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorize = vectorizer.fit_transform(x_train)
X_test_vectorize =  vectorizer.transform(x_test)

# 1.MultinomialNB()

In [46]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
# mnb = MultinomialNB()
# mnb.fit(x_train_vectorize ,y_train)
# print("Model Score on Training data",mnb.score(x_train_vectorize ,y_train))
# print("Model Score on Training data",mnb.score(x_test_vectorize ,y_test))
# y_pred = mnb.predict(x_test_vectorize)

# print(classification_report(y_pred ,y_test))



nb_model = MultinomialNB()
nb_model.fit(X_train_vectorize, y_train)
y_pred_nb = nb_model.predict(X_test_vectorize)
nb_report = classification_report(y_test, y_pred_nb)
print("Naive Bayes Report:")
print(nb_report)
     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Naive Bayes Report:
              precision    recall  f1-score   support

      action       0.59      0.04      0.07      1303
       adult       0.72      0.02      0.04       615
   adventure       0.53      0.07      0.12       741
   animation       0.00      0.00      0.00       503
   biography       0.00      0.00      0.00       266
      comedy       0.53      0.46      0.49      7393
       crime       0.00      0.00      0.00       504
 documentary       0.57      0.90      0.70     13205
       drama       0.47      0.83      0.60     13680
      family       0.25      0.00      0.00       828
     fantasy       0.00      0.00      0.00       321
   game-show       1.00      0.17      0.30       196
     history       0.00      0.00      0.00       231
      horror       0.76      0.31      0.44      2176
       music       0.88      0.07      0.13       724
     musical       1.00      0.00      0.01       263
     mystery       0.00      0.00      0.00       301
       

  _warn_prf(average, modifier, msg_start, len(result))


# 2.Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression
# lr = LogisticRegression()
# lr.fit(x_train_vectorize ,y_train)
# print("Model Score on Training data",lr.score(x_train_vectorize ,y_train))
# print("Model Score on Training data",lr.score(x_test_vectorize,y_test))
# y_pred = lr.predict(x_test_vectorize)
# print(classification_report(y_pred ,y_test))


lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_vectorize, y_train)
y_pred_lr = lr_model.predict(X_test_vectorize)
lr_report = classification_report(y_test, y_pred_lr)
print("Logistic Regression Report:")
print(lr_report)

Logistic Regression Report:
              precision    recall  f1-score   support

      action       0.42      0.33      0.37      1303
       adult       0.64      0.40      0.49       615
   adventure       0.40      0.23      0.29       741
   animation       0.36      0.15      0.21       503
   biography       0.03      0.00      0.01       266
      comedy       0.52      0.58      0.55      7393
       crime       0.25      0.09      0.13       504
 documentary       0.71      0.78      0.74     13205
       drama       0.57      0.69      0.62     13680
      family       0.36      0.18      0.24       828
     fantasy       0.28      0.10      0.14       321
   game-show       0.87      0.60      0.71       196
     history       0.12      0.03      0.04       231
      horror       0.62      0.61      0.62      2176
       music       0.59      0.51      0.55       724
     musical       0.22      0.08      0.12       263
     mystery       0.22      0.07      0.11       301

# 3.LinearSVC

In [49]:
from sklearn.svm import LinearSVC


svm = LinearSVC()
svm.fit(X_train_vectorize ,y_train)
print("Model Score on Training data",svm.score(X_train_vectorize ,y_train))
print("Model Score on Training data",svm.score(X_test_vectorize ,y_test))
y_pred = svm.predict(X_test_vectorize)
print(classification_report(y_pred ,y_test))



Model Score on Training data 0.999354326931946
Model Score on Training data 0.5182356522220377
              precision    recall  f1-score   support

      action       0.28      0.32      0.30      1142
       adult       0.39      0.53      0.45       457
   adventure       0.22      0.28      0.25       587
   animation       0.13      0.21      0.16       311
   biography       0.02      0.04      0.03       143
      comedy       0.52      0.49      0.50      7962
       crime       0.11      0.16      0.13       323
 documentary       0.72      0.69      0.71     13688
       drama       0.60      0.55      0.57     14924
      family       0.18      0.27      0.21       554
     fantasy       0.09      0.13      0.11       239
   game-show       0.62      0.71      0.66       171
     history       0.03      0.05      0.03       126
      horror       0.57      0.56      0.57      2239
       music       0.48      0.52      0.50       677
     musical       0.07      0.11      0