In [50]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm


In [None]:
generes=['action','adult','comedy','adventure','animation','romance','biography','family','crime','documentary']

In [47]:
unknown_genre='unknown'

In [None]:
df_train=pd.read_csv('train_data.txt',sep=':::',header=None,names=['SNO','MOVIE','GENER','REVIEW'])
df_train.head()

  df_train=pd.read_csv('train_data.txt',sep=':::',header=None,names=['SNO','MOVIE','GENER','REVIEW'])


Unnamed: 0,SNO,MOVIE,GENER,REVIEW
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [None]:
x_train=df_train['REVIEW'].astype(str).apply(lambda doc: doc.lower())
genre_labels = [genre.split(', ') for genre in df_train['GENER']]
x_train.head()

0     listening in to a conversation between his do...
1     a brother and sister with a past incestuous r...
2     as the bus empties the students for their fie...
3     to help their unemployed father make ends mee...
4     the film's title refers not only to the un-re...
Name: REVIEW, dtype: object

In [None]:
mlb=MultiLabelBinarizer()
y_train=mlb.fit_transform(genre_labels)


In [None]:
tfidf_vectorizer=TfidfVectorizer(max_features=7000)

In [None]:
x_train_t=tfidf_vectorizer.fit_transform(x_train)

In [None]:
nb=MultinomialNB()
multi_output_classifier=MultiOutputClassifier(nb)
multi_output_classifier.fit(x_train_t,y_train)

In [None]:
df_test=pd.read_csv('test_data.txt',sep=':::',header=None,names=['SNO.', 'MOVIE', 'REVIEW'], engine='python')
df_test.head()

Unnamed: 0,SNO.,MOVIE,REVIEW
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


In [None]:
x_test=df_test['REVIEW'].astype(str).apply(lambda doc: doc.lower())
x_test.head()

0     l.r. brane loves his life - his car, his apar...
1     spain, march 1964: quico is a very naughty ch...
2     one year in the life of albin and his family ...
3     his father has died, he hasn't spoken with hi...
4     before he was known internationally as a mart...
Name: REVIEW, dtype: object

In [None]:
X_test_t=tfidf_vectorizer.transform(x_test)
y_pred=multi_output_classifier.predict(X_test_t)

In [None]:
test_movie_names=df_test['MOVIE']
predicted_genres=mlb.inverse_transform(y_pred)
pred_result = pd.DataFrame({'MOVIE': test_movie_names,'PREDICTED_GENRES': predicted_genres})

In [None]:
pred_result.head()

Unnamed: 0,MOVIE,PREDICTED_GENRES
0,Edgar's Lunch (1998),()
1,La guerra de papá (1977),"( drama ,)"
2,Off the Beaten Track (2010),"( documentary ,)"
3,Meu Amigo Hindu (2015),"( drama ,)"
4,Er nu zhai (1955),()


In [None]:
pred_result['PREDICTED_GENRES'] = pred_result['PREDICTED_GENRES'].apply(lambda genres: [unknown_genre] if len(genres) == 0 else genres)

In [None]:
with open("model_built.txt","w",encoding="utf-8") as myfile:
  for _, row in pred_result.iterrows():
    movie=row['MOVIE']
    genre=','.join(row['PREDICTED_GENRES'])
    myfile.write(f"{movie} :::{genre}\n")

In [None]:
y_train_pred = multi_output_classifier.predict(x_train_t)

In [None]:
accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred, average='micro')
recall = recall_score(y_train, y_train_pred, average='micro')
f1 = f1_score(y_train, y_train_pred, average='micro')

In [None]:
with open("model_built.txt","a",encoding="utf-8") as myfile:
  myfile.write(f"Accuracy:{accuracy*100:}%\n")
  myfile.write(f"Precision:{precision:.2f}\n")
  myfile.write(f"F1-score:{f1:.2f}\n")
