# Developing a Machine Learning Model to Predict Movie Genres
###  [Datasets](https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb)

In [10]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,precision_score

In [119]:
train_df = pd.read_csv("data/train_data.txt",sep=":::",header=None,engine="python")
test_df = pd.read_csv("data/test_data.txt",sep=":::",header=None,engine="python")

In [120]:
train_df = train_df.drop(columns=[0],axis=1)
train_df.head()

Unnamed: 0,1,2,3
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [121]:
test_df = test_df.drop(columns=[0],axis=1)
test_df

Unnamed: 0,1,2
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,Er nu zhai (1955),Before he was known internationally as a mart...
...,...,...
54195,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54196,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54197,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54198,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


In [122]:
train_df.rename(columns={1:"movie",2:"genre",3:"overview"},inplace=True)
test_df.rename(columns={1:"movie",2:"overview"},inplace=True)

In [123]:
train_df.describe()

Unnamed: 0,movie,genre,overview
count,54214,54214,54214
unique,54214,27,54086
top,Oscar et la dame rose (2009),drama,Grammy - music award of the American academy ...
freq,1,13613,12


In [124]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie     54214 non-null  object
 1   genre     54214 non-null  object
 2   overview  54214 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [125]:
train_df.isna().sum()

movie       0
genre       0
overview    0
dtype: int64

In [126]:
train_df.duplicated().sum()

0

In [127]:
train_df.genre[1]

' thriller '

### Remove WhiteSpace

In [129]:
def remove_whitespace(col):
    return col.strip()

In [132]:
train_df.genre = train_df.genre.apply(remove_whitespace)
train_df.movie = train_df.movie.apply(remove_whitespace)
train_df.overview = train_df.overview.apply(remove_whitespace)

In [136]:
test_df.movie = test_df.movie.apply(remove_whitespace)
test_df.overview = test_df.overview.apply(remove_whitespace)

In [143]:
train_df.head()

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


## Remove Punctuation

In [151]:
def remove_punctuation(col):

    text = "".join([word for word in col if word not in string.punctuation])
    return text
    col = col.lower()
    for i in string.punctuation:
        col = col.replace(i,"")
    return col

In [153]:
test_df.overview = test_df.overview.apply(remove_punctuation)

In [155]:
train_df.overview = train_df.overview.apply(remove_punctuation)

# word tokenize

In [157]:
from nltk.tokenize import word_tokenize

In [160]:
def word_token(col):
    return word_tokenize(col)

In [162]:
train_df.overview = train_df.overview.apply(word_token)

In [163]:
test_df.overview = test_df.overview.apply(word_token)

In [164]:
train_df.head(5)

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),drama,"[Listening, in, to, a, conversation, between, ..."
1,Cupid (1997),thriller,"[A, brother, and, sister, with, a, past, inces..."
2,"Young, Wild and Wonderful (1980)",adult,"[As, the, bus, empties, the, students, for, th..."
3,The Secret Sin (1915),drama,"[To, help, their, unemployed, father, make, en..."
4,The Unrecovered (2007),drama,"[The, films, title, refers, not, only, to, the..."


## StopWords

In [166]:
def remove_stopwords(col):
    l = []
    for i in col:
        if i not in stopwords.words("english"):
          l.append(i)
    return l

In [167]:
train_df.overview = train_df.overview.apply(remove_stopwords)

In [168]:
test_df.overview = test_df.overview.apply(remove_stopwords)

##  Stemming

In [169]:
from nltk.stem import PorterStemmer
ps= PorterStemmer()

In [170]:
def stemming(col):
    l =[]
    for i in col:
        l.append(ps.stem(i))
    return " ".join(l)

In [171]:
train_df.head()

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),drama,"[Listening, conversation, doctor, parents, 10y..."
1,Cupid (1997),thriller,"[A, brother, sister, past, incestuous, relatio..."
2,"Young, Wild and Wonderful (1980)",adult,"[As, bus, empties, students, field, trip, Muse..."
3,The Secret Sin (1915),drama,"[To, help, unemployed, father, make, ends, mee..."
4,The Unrecovered (2007),drama,"[The, films, title, refers, unrecovered, bodie..."


In [172]:
train_df.overview = train_df.overview.apply(stemming)
test_df.overview = test_df.overview.apply(stemming)

##  Export the Preprocessing Data

In [173]:
train_df

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),drama,listen convers doctor parent 10yearold oscar l...
1,Cupid (1997),thriller,a brother sister past incestu relationship cur...
2,"Young, Wild and Wonderful (1980)",adult,as bu empti student field trip museum natur hi...
3,The Secret Sin (1915),drama,to help unemploy father make end meet edith tw...
4,The Unrecovered (2007),drama,the film titl refer unrecov bodi ground zero a...
...,...,...,...
54209,"""Bonino"" (1953)",comedy,thi shortliv nbc live sitcom center bonino wor...
54210,Dead Girls Don't Cry (????),horror,the next gener exploit the sister kapa bay sor...
54211,Ronald Goedemondt: Ze bestaan echt (2008),documentary,ze bestaan echt standup comedi grow face fear ...
54212,Make Your Own Bed (1944),comedy,walter vivian live countri difficult time keep...


In [185]:
test_df.to_csv("data/clean_test_df.csv")

In [187]:
train_df.to_csv("data/clean_train_df.csv")

# Import Preprocessing Data

In [592]:
train_df = pd.read_csv("data/clean_train_df.csv")
test_df= pd.read_csv("data/clean_test_df.csv")

In [593]:
drama = train_df[train_df.genre == "drama"][:200]
documentary =  train_df[train_df.genre =="documentary"][:300]
comedy = train_df[train_df.genre =="comedy"][:300]
short = train_df[train_df.genre =="short"][:300]
thriller = train_df[train_df.genre =="thriller"][:700]
action = train_df[train_df.genre =="action"][:700]
horror = train_df[train_df.genre =="horror"][:500]
western = train_df[train_df.genre =="western"][:200]
family = train_df[train_df.genre =="family"][:300]

In [594]:
train_df =train_df[~train_df.genre.isin(["horror","family","drama", "documentary", "comedy", "short", "thriller", "action", "western","reality-tv","music","romance"])]
train_df = pd.concat([drama,documentary,comedy,short,thriller,action,horror,western,train_df,family])

In [595]:
del train_df["Unnamed: 0"]

In [596]:
del test_df["Unnamed: 0"]

In [597]:
train_df.head(5)

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),drama,listen convers doctor parent 10yearold oscar l...
3,The Secret Sin (1915),drama,to help unemploy father make end meet edith tw...
4,The Unrecovered (2007),drama,the film titl refer unrecov bodi ground zero a...
11,In the Gloaming (1997),drama,danni die aid return home last month alway clo...
14,The Glass Menagerie (1973),drama,amanda wingfield domin children fade gentil ex...


In [602]:
tfidf = TfidfVectorizer(max_features=1000)

In [605]:
x = tfidf.fit_transform(train_df.overview).toarray()
x

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.17464228, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [606]:
y = train_df.genre.values
y

array(['drama', 'drama', 'drama', ..., 'family', 'family', 'family'],
      dtype=object)

In [608]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [609]:
mnb = MultinomialNB()
gnb = GaussianNB()

In [610]:
mnb.fit(x_train,y_train)
gnb.fit(x_train,y_train)

In [611]:
y_pred1 = mnb.predict(x_test)
y_pred2 = gnb.predict(x_test)

In [612]:
accuracy1 = accuracy_score(y_test,y_pred1)
accuracy2 = accuracy_score(y_test,y_pred2)
print(accuracy1)
print(accuracy2)

0.3816711590296496
0.1628032345013477


In [613]:
p1 = precision_score(y_test,y_pred1,average="macro",zero_division=False)
p2 = precision_score(y_test,y_pred2,average="macro")
print(p1)
print(p2)

0.45077715429126447
0.18146287203305436


## Death Note

In [619]:
death_note="An intelligent high school student goes on a secret crusade to eliminate criminals from the world after discovering a notebook capable of killing anyone whose name is written into it."

## Iron Man

In [643]:
iron_man= "Tony Stark. Genius, billionaire, playboy, philanthropist. Son of legendary inventor and weapons contractor Howard Stark. When Tony Stark is assigned to give a weapons presentation to an Iraqi unit led by Lt. Col. James Rhodes, he's given a ride on enemy lines. That ride ends badly when Stark's Humvee that he's riding in is attacked by enemy combatants. He survives - barely - with a chest full of shrapnel and a car battery attached to his heart. In order to survive he comes up with a way to miniaturize the battery and figures out that the battery can power something else. Thus Iron Man is born. He uses the primitive device to escape from the cave in Iraq. Once back home, he then begins work on perfecting the Iron Man suit. But the man who was put in charge of Stark Industries has plans of his own to take over Tonys technology for other matters."

In [655]:
mnb.predict(tfidf.transform([iron_man]))[0]

'action'