In [1]:
import numpy as np
import pandas as pd
import requests

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
base_url = "https://api.themoviedb.org/3/movie/top_rated?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US&page="

In [3]:
df = pd.DataFrame(columns=['name','description','genre'])

In [4]:
resp = requests.get(url='https://api.themoviedb.org/3/genre/movie/list?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US')
data = resp.json()

genre_dict = {}
for i in data['genres']:
    genre_dict[i['id']] = i['name']

genre_dict

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

In [5]:
for i in range(1,15):
    resp = requests.get(url=base_url+str(i))
    data = resp.json()
    for item in data['results']:
        genre_str = ""
        for genre_id in item['genre_ids']:
            genre_str+= genre_dict[genre_id] + ", "
            
        if item['original_language'] == 'en':
            temp = pd.DataFrame([[item['original_title'],item['overview'],genre_str]],
                               columns=['name','description','genre'])
            df = pd.concat([df,temp],axis=0)

In [6]:
df

Unnamed: 0,name,description,genre
0,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime,"
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama, Crime,"
0,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime,"
0,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War,"
0,12 Angry Men,The defense and the prosecution have rested an...,"Drama,"
...,...,...,...
0,The Wolf of Wall Street,A New York stockbroker refuses to cooperate in...,"Crime, Drama, Comedy,"
0,Judgment at Nuremberg,"In 1947, four German judges who served on the ...","Drama, History, War,"
0,Room,"Held captive for 7 years in an enclosed space,...","Drama, Thriller,"
0,Requiem for a Dream,The hopes and dreams of four ambitious people ...,"Crime, Drama,"


In [7]:
df.reset_index(drop=True,inplace=True)

In [8]:
df

Unnamed: 0,name,description,genre
0,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime,"
1,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama, Crime,"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime,"
3,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War,"
4,12 Angry Men,The defense and the prosecution have rested an...,"Drama,"
...,...,...,...
145,The Wolf of Wall Street,A New York stockbroker refuses to cooperate in...,"Crime, Drama, Comedy,"
146,Judgment at Nuremberg,"In 1947, four German judges who served on the ...","Drama, History, War,"
147,Room,"Held captive for 7 years in an enclosed space,...","Drama, Thriller,"
148,Requiem for a Dream,The hopes and dreams of four ambitious people ...,"Crime, Drama,"


## Text preprocessing

### Lowercasing

In [9]:
for col in df.columns:
    df[col] = df[col].str.lower()

In [10]:
df

Unnamed: 0,name,description,genre
0,the godfather,"spanning the years 1945 to 1955, a chronicle o...","drama, crime,"
1,the shawshank redemption,framed in the 1940s for the double murder of h...,"drama, crime,"
2,the godfather part ii,in the continuing saga of the corleone crime f...,"drama, crime,"
3,schindler's list,the true story of how businessman oskar schind...,"drama, history, war,"
4,12 angry men,the defense and the prosecution have rested an...,"drama,"
...,...,...,...
145,the wolf of wall street,a new york stockbroker refuses to cooperate in...,"crime, drama, comedy,"
146,judgment at nuremberg,"in 1947, four german judges who served on the ...","drama, history, war,"
147,room,"held captive for 7 years in an enclosed space,...","drama, thriller,"
148,requiem for a dream,the hopes and dreams of four ambitious people ...,"crime, drama,"


### Remove urls 

In [11]:
import re

In [12]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+www\.\S+')
    return pattern.sub(r'',text)

In [13]:
df['description'] = df['description'].apply(remove_url) 

In [14]:
df

Unnamed: 0,name,description,genre
0,the godfather,"spanning the years 1945 to 1955, a chronicle o...","drama, crime,"
1,the shawshank redemption,framed in the 1940s for the double murder of h...,"drama, crime,"
2,the godfather part ii,in the continuing saga of the corleone crime f...,"drama, crime,"
3,schindler's list,the true story of how businessman oskar schind...,"drama, history, war,"
4,12 angry men,the defense and the prosecution have rested an...,"drama,"
...,...,...,...
145,the wolf of wall street,a new york stockbroker refuses to cooperate in...,"crime, drama, comedy,"
146,judgment at nuremberg,"in 1947, four german judges who served on the ...","drama, history, war,"
147,room,"held captive for 7 years in an enclosed space,...","drama, thriller,"
148,requiem for a dream,the hopes and dreams of four ambitious people ...,"crime, drama,"


### Remove punctuation 

In [15]:
import string

In [16]:
exclude = string.punctuation
def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

In [17]:
df['description'] = df['description'].apply(remove_punc) 

In [18]:
df

Unnamed: 0,name,description,genre
0,the godfather,spanning the years 1945 to 1955 a chronicle of...,"drama, crime,"
1,the shawshank redemption,framed in the 1940s for the double murder of h...,"drama, crime,"
2,the godfather part ii,in the continuing saga of the corleone crime f...,"drama, crime,"
3,schindler's list,the true story of how businessman oskar schind...,"drama, history, war,"
4,12 angry men,the defense and the prosecution have rested an...,"drama,"
...,...,...,...
145,the wolf of wall street,a new york stockbroker refuses to cooperate in...,"crime, drama, comedy,"
146,judgment at nuremberg,in 1947 four german judges who served on the b...,"drama, history, war,"
147,room,held captive for 7 years in an enclosed space ...,"drama, thriller,"
148,requiem for a dream,the hopes and dreams of four ambitious people ...,"crime, drama,"


### Spelling Correction

In [19]:
from textblob import TextBlob

In [20]:
def spell_correct(text):
    textBlb = TextBlob(text)
    return textBlb.correct().string

In [21]:
spell_correct(df.description[0])

'spanning the years 1945 to 1955 a chronicle of the sectional italianamerican corleone crime family when organized crime family patriarch veto corleone barely survives an attempt on his life his youngest son michael steps in to take care of the would tillers launching a campaign of bloody revenge'

In [22]:
df['description'] = df['description'].apply(spell_correct)

### Removing stopwords 

In [23]:
from nltk.corpus import stopwords
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [24]:
remove_stopwords(df.description[0])

'spanning  years 1945  1955  chronicle   sectional italianamerican corleone crime family  organized crime family patriarch veto corleone barely survives  attempt   life  youngest son michael steps   take care   would tillers launching  campaign  bloody revenge'

In [25]:
df.description = df['description'].apply(remove_stopwords)

In [26]:
df

Unnamed: 0,name,description,genre
0,the godfather,spanning years 1945 1955 chronicle sectio...,"drama, crime,"
1,the shawshank redemption,framed 1940s double murder wife lover ...,"drama, crime,"
2,the godfather part ii,continuing sage corleone crime family you...,"drama, crime,"
3,schindler's list,true story businessman oscar schindler save...,"drama, history, war,"
4,12 angry men,defense prosecution rested jury filing ...,"drama,"
...,...,...,...
145,the wolf of wall street,new york stockbroker refuses cooperate lar...,"crime, drama, comedy,"
146,judgment at nuremberg,1947 four german judges served bench nas...,"drama, history, war,"
147,room,held captive 7 years enclosed space woman ...,"drama, thriller,"
148,requiem for a dream,hopes dreams four ambitious people shatter...,"crime, drama,"


### Tokenization

In [27]:
import spacy 
nlp = spacy.load('en_core_web_sm')

In [28]:
df.description.apply(nlp)

0      (spanning,  , years, 1945,  , 1955,  , chronic...
1      (framed,   , 1940s,   , double, murder,   , wi...
2      (  , continuing, sage,   , corleone, crime, fa...
3      ( , true, story,   , businessman, oscar, schin...
4      ( , defense,   , prosecution,  , rested,   , j...
                             ...                        
145    ( , new, york, stockbroker, refuses,  , cooper...
146    ( , 1947, four, german, judges,  , served,   ,...
147    (held, captive,  , 7, years,   , enclosed, spa...
148    ( , hopes,  , dreams,  , four, ambitious, peop...
149    (dutch, maxwell,  , suburban, dad, overlooked,...
Name: description, Length: 150, dtype: object

### Lemmatization 

In [43]:
import nltk 
from nltk.stem import WordNetLemmatizer 
wordnet_lemmatizer = WordNetLemmatizer()
punctuations = "?:!.,;"
def lemmatize(text):
    sentence_words = nltk.word_tokenize(text)
    for word in sentence_words:
        if word in punctuations:
            sentence_words.remove(word)
    lemmatized_text = ""
    for word in sentence_words:
        lemmatized_text += wordnet_lemmatizer.lemmatize(word,pos='v') + " "
    return lemmatized_text


In [44]:
lemmatize('''He was running and eating at same time. He has bad habit of swimming after playing 
long hours in Sun.''')

'He be run and eat at same time He have bad habit of swim after play long hours in Sun '

In [45]:
df['description'] = df['description'].apply(lemmatize)

In [46]:
df

Unnamed: 0,name,description,genre
0,the godfather,span years 1945 1955 chronicle sectional itali...,"drama, crime,"
1,the shawshank redemption,frame 1940s double murder wife lover stand ban...,"drama, crime,"
2,the godfather part ii,continue sage corleone crime family young veto...,"drama, crime,"
3,schindler's list,true story businessman oscar schindler save th...,"drama, history, war,"
4,12 angry men,defense prosecution rest jury file jury room d...,"drama,"
...,...,...,...
145,the wolf of wall street,new york stockbroker refuse cooperate large se...,"crime, drama, comedy,"
146,judgment at nuremberg,1947 four german judge serve bench nasi regime...,"drama, history, war,"
147,room,hold captive 7 years enclose space woman young...,"drama, thriller,"
148,requiem for a dream,hop dream four ambitious people shatter drug a...,"crime, drama,"
