## Text Preprocessing for NLP

In [180]:
# Importing Required Libraries
import pandas as pd
import requests
import nltk
import re

In [185]:
# Create an empty DataFrame that will hold the movie data
df = pd.DataFrame()

In [186]:
# Fetch the genre list from the TMDB API
# NOTE(review): the API key is hardcoded in the URL; consider loading it from an
# environment variable or a secrets store before sharing this notebook.
response = requests.get(
    'https://api.themoviedb.org/3/genre/movie/list?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US',
    timeout=30,  # requests has no default timeout; bound the wait so the cell cannot hang forever
)
# Fail here on an HTTP error rather than later when the JSON body is indexed
response.raise_for_status()

In [184]:
# Parse the JSON body and keep its 'genres' entry; later cells read each item's 'id' and 'name'
genres = response.json()['genres']

### Fetching Movie Data from the TMDB (The Movie Database) API

In [189]:
# Download every page of the top-rated list and keep only the columns used later.
# NOTE(review): the API key is hardcoded in the URL; consider loading it from an
# environment variable or a secrets store before sharing this notebook.
TOP_RATED_URL = 'https://api.themoviedb.org/3/movie/top_rated?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US&page={}'
LAST_PAGE = 457  # last page requested by the original loop, range(1, 458)

page_frames = []
for page in range(1, LAST_PAGE + 1):
    # requests has no default timeout; bound the wait so one stalled page cannot hang the cell
    dt = requests.get(TOP_RATED_URL.format(page), timeout=30)
    # Fail on an HTTP error instead of a KeyError on 'results'
    dt.raise_for_status()
    page_frames.append(pd.DataFrame(dt.json()['results'])[['title', 'overview', 'genre_ids']])

# One concat at the end avoids re-copying the growing frame on every page and makes
# the cell idempotent: re-running it rebuilds df instead of appending duplicates.
df = pd.concat(page_frames, ignore_index=True)

In [192]:
df.head()

Unnamed: 0,title,overview,genre_ids
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[18, 80]"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]"
4,12 Angry Men,The defense and the prosecution have rested an...,[18]


In [193]:
# Replace each genre id with the matching genre name
# Build the id -> name lookup once instead of scanning `genres` for every row
genre_name_by_id = {genre.get('id'): genre.get('name') for genre in genres}

# Assign new lists rather than mutating cells through chained indexing with a manual
# row counter. This also avoids binding a notebook-level variable called `id`, which
# would shadow the builtin. Ids with no matching genre are left unchanged.
df['genre_ids'] = df['genre_ids'].apply(
    lambda ids: [genre_name_by_id.get(genre_id, genre_id) for genre_id in ids]
)

In [194]:
df.head(4)

Unnamed: 0,title,overview,genre_ids
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[Drama, Crime]"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[Drama, Crime]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[Drama, Crime]"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[Drama, History, War]"


In [195]:
# Convert each list of genre names into a single space-separated string
df['genre_ids'] = df['genre_ids'].apply(" ".join)
df.head()

Unnamed: 0,title,overview,genre_ids
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,Drama Crime
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",Drama Crime
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,Drama Crime
3,Schindler's List,The true story of how businessman Oskar Schind...,Drama History War
4,12 Angry Men,The defense and the prosecution have rested an...,Drama


In [196]:
# Lowercase every text column
for column in ('title', 'overview', 'genre_ids'):
    df[column] = df[column].apply(str.lower)

In [197]:
df.head()

Unnamed: 0,title,overview,genre_ids
0,the shawshank redemption,framed in the 1940s for the double murder of h...,drama crime
1,the godfather,"spanning the years 1945 to 1955, a chronicle o...",drama crime
2,the godfather part ii,in the continuing saga of the corleone crime f...,drama crime
3,schindler's list,the true story of how businessman oskar schind...,drama history war
4,12 angry men,the defense and the prosecution have rested an...,drama


In [198]:
# Rename the columns to more descriptive labels
column_labels = {'title': 'Name', 'overview': 'Description', 'genre_ids': 'Genre'}
df = df.rename(columns=column_labels)

In [199]:
df.head(4)

Unnamed: 0,Name,Description,Genre
0,the shawshank redemption,framed in the 1940s for the double murder of h...,drama crime
1,the godfather,"spanning the years 1945 to 1955, a chronicle o...",drama crime
2,the godfather part ii,in the continuing saga of the corleone crime f...,drama crime
3,schindler's list,the true story of how businessman oskar schind...,drama history war


In [200]:
# Remove HTML tags with the help of the regex library
_HTML_TAG_PATTERN = re.compile('<.*?>')


def remove_html_tags(text):
    """Return `text` with every `<...>` span removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept.
    """
    return _HTML_TAG_PATTERN.sub('', text)

In [201]:
# Strip HTML tags from both free-text columns
for column in ('Name', 'Description'):
    df[column] = df[column].apply(remove_html_tags)

In [202]:
# Remove URLs with the help of the regex library
def remove_urls(text):
    """Return `text` with http(s):// links and www. links removed.

    Each link is matched up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [203]:
# Strip URLs from both free-text columns
for column in ('Name', 'Description'):
    df[column] = df[column].apply(remove_urls)

In [204]:
# Remove punctuation with the help of the regex library
_NOT_WORD_OR_SPACE = re.compile(r'[^\w\s]')


def remove_punctuation(text):
    """Return `text` keeping only word characters and whitespace.

    Underscores are kept because `\\w` matches them.
    """
    return _NOT_WORD_OR_SPACE.sub('', text)

In [205]:
# Strip punctuation from both free-text columns
for column in ('Name', 'Description'):
    df[column] = df[column].apply(remove_punctuation)

In [206]:
df.head(3)

Unnamed: 0,Name,Description,Genre
0,the shawshank redemption,framed in the 1940s for the double murder of h...,drama crime
1,the godfather,spanning the years 1945 to 1955 a chronicle of...,drama crime
2,the godfather part ii,in the continuing saga of the corleone crime f...,drama crime


In [207]:
# Load NLTK's English stopword list, downloading the corpus first if it is missing
from nltk.corpus import stopwords as stopwords_corpus

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # Fresh environments do not ship the corpus; fetch it once
    nltk.download('stopwords')

# Keep the corpus reader under its own name instead of overwriting it with the list;
# `stopwords` is the name remove_stopwords reads.
stopwords = stopwords_corpus.words('english')

In [208]:
# Function for removing stopwords
def remove_stopwords(text, stop_words=None):
    """Return `text` with stopwords dropped and the remaining words joined by single spaces.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level `stopwords` list.

    Returns
    -------
    str
        The remaining words in their original order. Matching is case-sensitive.
    """
    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership checks instead of scanning the list for every word
    stop_words = set(stop_words)
    return " ".join(word for word in text.split() if word not in stop_words)

In [209]:
# Drop stopwords from every text column
for column in ('Name', 'Description', 'Genre'):
    df[column] = df[column].apply(remove_stopwords)

In [210]:
# Remove double quotation marks
# Series.apply returns a new Series, so the result must be assigned back;
# without the assignment this cell leaves df unchanged.
df['Name'] = df['Name'].apply(lambda x: x.replace('"', ""))
df['Description'] = df['Description'].apply(lambda x: x.replace('"', ""))
df['Description']

0       framed 1940s double murder wife lover upstandi...
1       spanning years 1945 1955 chronicle fictional i...
2       continuing saga corleone crime family young vi...
3       true story businessman oskar schindler saved t...
4       defense prosecution rested jury filing jury ro...
                              ...                        
9125    filmmaking team behind hits scary movie date m...
9126    year 3000 man match psychlos greedy manipulati...
9127    set island coast techno rave party attracts di...
9128    18th birthday goku receives mystical dragonbal...
9129    platoon eagles vultures attacks residents smal...
Name: Description, Length: 9130, dtype: object

In [211]:
# Combine all columns into a single text column: "<name> <description> <genre>"
name_and_description = df['Name'] + " " + df['Description']
concatenated = name_and_description + " " + df['Genre']
concatenated

0       shawshank redemption framed 1940s double murde...
1       godfather spanning years 1945 1955 chronicle f...
2       godfather part ii continuing saga corleone cri...
3       schindlers list true story businessman oskar s...
4       12 angry men defense prosecution rested jury f...
                              ...                        
9125    disaster movie filmmaking team behind hits sca...
9126    battlefield earth year 3000 man match psychlos...
9127    house dead set island coast techno rave party ...
9128    dragonball evolution 18th birthday goku receiv...
9129    birdemic shock terror platoon eagles vultures ...
Length: 9130, dtype: object

In [221]:
# Wrap the combined text in a one-column DataFrame
new_df = concatenated.to_frame(name='Movies_Detail')
new_df.head(4)

Unnamed: 0,Movies_Detail
0,shawshank redemption framed 1940s double murde...
1,godfather spanning years 1945 1955 chronicle f...
2,godfather part ii continuing saga corleone cri...
3,schindlers list true story businessman oskar s...


In [222]:
# Import spaCy and load the "en_core_web_sm" pipeline, used below to tokenize text
# NOTE(review): presumably the model package must be installed separately — confirm for your environment
import spacy
nlp = spacy.load("en_core_web_sm")

In [223]:
# Function to tokenize textual data
# NOTE: the name is misspelled ("tekenize") but is kept because a later cell calls it by this name
def tekenize_text(text):
    """Split `text` into tokens with spaCy and return the token strings as a list.

    Only the tokenizer is run (`nlp.make_doc`). The token texts do not depend on
    the tagger, parser or NER components, so running the full `nlp` pipeline on
    every row would only add cost.
    """
    return [token.text for token in nlp.make_doc(text)]

In [225]:
# Tokenize every movie's combined text
new_df['Movies_Detail'] = new_df['Movies_Detail'].apply(tekenize_text)

In [226]:
new_df.head(4)

Unnamed: 0,Movies_Detail
0,"[shawshank, redemption, framed, 1940s, double,..."
1,"[godfather, spanning, years, 1945, 1955, chron..."
2,"[godfather, part, ii, continuing, saga, corleo..."
3,"[schindlers, list, true, story, businessman, o..."


In [227]:
# Import the PorterStemmer class for stemming and create one stemmer instance
from nltk.stem import PorterStemmer
porter = PorterStemmer()

In [228]:
# Function for stemming
def stemming(text):
    """Stem every item of `text` with the notebook-level Porter stemmer.

    `text` is iterated item by item, so it is expected to be a sequence of
    tokens rather than a single string. A list of stemmed words is returned.
    """
    return list(map(porter.stem, text))

In [229]:
# Stem every token list
new_df['Movies_Detail'] = new_df['Movies_Detail'].apply(stemming)

In [232]:
new_df.head(4)

Unnamed: 0,Movies_Detail
0,"[shawshank, redempt, frame, 1940, doubl, murde..."
1,"[godfath, span, year, 1945, 1955, chronicl, fi..."
2,"[godfath, part, ii, continu, saga, corleon, cr..."
3,"[schindler, list, true, stori, businessman, os..."


### Text Cleaning for NLP

# Final Assignment