## Building my own dataset from the TMDB API, then performing text preprocessing

In [1]:
import os
import time

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# TMDB credentials and endpoint.
# The key is taken from the TMDB_API_KEY environment variable when it is set.
# FIXME: the literal fallback only keeps the notebook runnable as before; a key
# stored in a notebook should be treated as leaked -- rotate it and remove the
# fallback.
API_KEY = os.environ.get("TMDB_API_KEY", "8265bd1679663a7ea12ac168da84d2e8")
BASE_URL = "https://api.themoviedb.org/3/movie/top_rated"

# Highest page number worth requesting: TMDB list endpoints reject pages above
# 500, so asking for more only produces failed requests and wasted sleeps.
MAX_PAGES = 500

# Create session with retry & backoff
session = requests.Session()
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"]
)
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

all_movies = []
failed_pages = []      # pages that still failed after the session's own retries
last_page = MAX_PAGES  # tightened as soon as the API reports total_pages

for page in range(1, MAX_PAGES + 1):
    if page > last_page:
        # The API reported fewer pages than the cap; nothing left to fetch.
        break

    try:
        response = session.get(
            BASE_URL,
            params={
                "api_key": API_KEY,
                "language": "en-US",
                "page": page
            },
            timeout=10
        )
        response.raise_for_status()

        data = response.json()
        all_movies.extend(data.get("results", []))

        # Stop at the real end of the list instead of a hard-coded page count.
        reported_pages = data.get("total_pages")
        if isinstance(reported_pages, int) and reported_pages > 0:
            last_page = min(MAX_PAGES, reported_pages)

        print(f"Fetched page {page}, total movies: {len(all_movies)}")
        time.sleep(0.5)  # prevents connection reset

    except requests.exceptions.RequestException as e:
        # Best effort: keep going, but remember the gap so it can be re-fetched.
        print(f"Failed on page {page}: {e}")
        failed_pages.append(page)
        time.sleep(5)

if failed_pages:
    print(f"Pages that could not be fetched: {failed_pages}")




Fetched page 1, total movies: 20
Fetched page 2, total movies: 40
Fetched page 3, total movies: 60
Fetched page 4, total movies: 80
Fetched page 5, total movies: 100
Fetched page 6, total movies: 120
Fetched page 7, total movies: 140
Fetched page 8, total movies: 160
Fetched page 9, total movies: 180
Fetched page 10, total movies: 200
Fetched page 11, total movies: 220
Fetched page 12, total movies: 240
Fetched page 13, total movies: 260
Fetched page 14, total movies: 280
Fetched page 15, total movies: 300
Fetched page 16, total movies: 320
Fetched page 17, total movies: 340
Fetched page 18, total movies: 360
Fetched page 19, total movies: 380
Fetched page 20, total movies: 400
Fetched page 21, total movies: 420
Fetched page 22, total movies: 440
Fetched page 23, total movies: 460
Fetched page 24, total movies: 480
Fetched page 25, total movies: 500
Fetched page 26, total movies: 520
Fetched page 27, total movies: 540
Fetched page 28, total movies: 560
Fetched page 29, total movies: 58

In [2]:
# Load into pandas
# Turn the list of fetched movie records into a DataFrame (one row per record)
df = pd.DataFrame(data=all_movies)

print(f"Final shape: {df.shape}")
df.head(5)

Final shape: (10000, 14)


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg,"[18, 80]",278,en,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,26.1332,/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg,1994-09-23,The Shawshank Redemption,False,8.7,29336
1,False,/39rtAdg3yx6IDxRg0ZLXefv0aPY.jpg,"[18, 80]",238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",25.2389,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,1972-03-14,The Godfather,False,8.685,22142
2,False,/14icRpwdgpHEHxt7Cn5hr2RQEsF.jpg,"[18, 80]",240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,19.8566,/hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg,1974-12-20,The Godfather Part II,False,8.571,13386
3,False,/zb6fM1CX41D9rF9hdgclu0peUmy.jpg,"[18, 36, 10752]",424,en,Schindler's List,The true story of how businessman Oskar Schind...,12.9693,/sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg,1993-12-15,Schindler's List,False,8.566,16896
4,False,/w4bTBXcqXc2TUyS5Fc4h67uWbPn.jpg,[18],389,en,12 Angry Men,The defense and the prosecution have rested an...,11.4362,/ow3wq89wM8qd5X7hWKxiRfsFf9C.jpg,1957-04-10,12 Angry Men,False,8.5,9579


### Loading genres

In [3]:
# Genre list endpoint. The key and language are sent as query parameters so the
# credential is not duplicated inside a URL literal.
GENRE_URL = "https://api.themoviedb.org/3/genre/movie/list"

# Go through the retrying session with a timeout, and fail on an HTTP error
# here rather than with a KeyError on an error payload further down.
genre_response = session.get(
    GENRE_URL,
    params={"api_key": API_KEY, "language": "en-US"},
    timeout=10,
)
genre_response.raise_for_status()
genre_data = genre_response.json()

# Build id -> name mapping
genre_map = {
    genre["id"]: genre["name"]
    for genre in genre_data["genres"]
}


In [4]:
def map_genres(genre_ids):
    """Translate genre ids into genre names.

    Looks every id up in the module-level ``genre_map``; ids that are not
    present there are reported as "Unknown".

    Args:
        genre_ids: iterable of genre ids.

    Returns:
        list of names, in the same order as ``genre_ids``.
    """
    names = []
    for genre_id in genre_ids:
        names.append(genre_map.get(genre_id, "Unknown"))
    return names


In [5]:
# Attach the list of genre names that corresponds to each row's genre ids
df["genre_names"] = df["genre_ids"].map(map_genres)
df

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,genre_names
0,False,/zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg,"[18, 80]",278,en,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,26.1332,/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg,1994-09-23,The Shawshank Redemption,False,8.700,29336,"[Drama, Crime]"
1,False,/39rtAdg3yx6IDxRg0ZLXefv0aPY.jpg,"[18, 80]",238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",25.2389,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,1972-03-14,The Godfather,False,8.685,22142,"[Drama, Crime]"
2,False,/14icRpwdgpHEHxt7Cn5hr2RQEsF.jpg,"[18, 80]",240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,19.8566,/hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg,1974-12-20,The Godfather Part II,False,8.571,13386,"[Drama, Crime]"
3,False,/zb6fM1CX41D9rF9hdgclu0peUmy.jpg,"[18, 36, 10752]",424,en,Schindler's List,The true story of how businessman Oskar Schind...,12.9693,/sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg,1993-12-15,Schindler's List,False,8.566,16896,"[Drama, History, War]"
4,False,/w4bTBXcqXc2TUyS5Fc4h67uWbPn.jpg,[18],389,en,12 Angry Men,The defense and the prosecution have rested an...,11.4362,/ow3wq89wM8qd5X7hWKxiRfsFf9C.jpg,1957-04-10,12 Angry Men,False,8.500,9579,[Drama]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,/nKJz9OZNZYk8JcUx6oDEgxcDfos.jpg,"[28, 35]",9876,en,Stop! Or My Mom Will Shoot,A tough police sergeant's mother comes to visi...,3.1187,/yCw4U5FK7C9hVWF2qcLO4EVvkCc.jpg,1992-02-21,Stop! Or My Mom Will Shoot,False,5.320,1072,"[Action, Comedy]"
9996,False,/nGwlHqVQjPjRPPxEiJuaXYoOuIl.jpg,"[10749, 35, 10751]",11982,en,Look Who's Talking Now!,When high-powered executive Samantha LeBon hat...,5.7094,/73JahFiizkMVsrrslXInmNK54nC.jpg,1993-11-05,Look Who's Talking Now!,False,5.319,1063,"[Romance, Comedy, Family]"
9997,False,/t8K8TkiLrrFCBVCIHt8U6nYXwrg.jpg,"[53, 27]",234212,en,Demonic,A police officer and a psychologist investigat...,0.9624,/ne0wAvbmzk72svslxjuFySrXEKw.jpg,2015-02-12,Demonic,False,5.318,514,"[Thriller, Horror]"
9998,False,/9wmzKxkmnsoW5gdvW7Iq1mBcKNJ.jpg,[27],282813,en,The Pyramid,An archaeological team attempt to unlock the s...,1.5155,/joPb4PUPP7xClD6XPiQ7laBEtZf.jpg,2014-12-04,The Pyramid,False,5.317,1123,[Horror]


### Making my desired dataset with the columns name, description and genre

|name|description|genre|
|--|--|--|

In [31]:
# Keep only the columns needed for the final dataset, as an independent copy
selected_columns = ["original_title", "overview", "genre_names"]
df_final = df.loc[:, selected_columns].copy()

In [32]:
# Display df_final to check the three selected columns.
df_final

Unnamed: 0,original_title,overview,genre_names
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[Drama, Crime]"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[Drama, Crime]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[Drama, Crime]"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[Drama, History, War]"
4,12 Angry Men,The defense and the prosecution have rested an...,[Drama]
...,...,...,...
9995,Stop! Or My Mom Will Shoot,A tough police sergeant's mother comes to visi...,"[Action, Comedy]"
9996,Look Who's Talking Now!,When high-powered executive Samantha LeBon hat...,"[Romance, Comedy, Family]"
9997,Demonic,A police officer and a psychologist investigat...,"[Thriller, Horror]"
9998,The Pyramid,An archaeological team attempt to unlock the s...,[Horror]


In [33]:
# Rename the TMDB field names to the dataset's column names
column_renames = {
    'original_title': 'Name',
    'overview': 'description',
}
df_final = df_final.rename(columns=column_renames)

df_final

Unnamed: 0,Name,description,genre_names
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[Drama, Crime]"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[Drama, Crime]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[Drama, Crime]"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[Drama, History, War]"
4,12 Angry Men,The defense and the prosecution have rested an...,[Drama]
...,...,...,...
9995,Stop! Or My Mom Will Shoot,A tough police sergeant's mother comes to visi...,"[Action, Comedy]"
9996,Look Who's Talking Now!,When high-powered executive Samantha LeBon hat...,"[Romance, Comedy, Family]"
9997,Demonic,A police officer and a psychologist investigat...,"[Thriller, Horror]"
9998,The Pyramid,An archaeological team attempt to unlock the s...,[Horror]


In [36]:
# Collapse each list of genre names into one comma-separated string
df_final["genre"] = df_final["genre_names"].map(", ".join)


In [37]:
# Display df_final to check the new comma-separated "genre" column.
df_final

Unnamed: 0,Name,description,genre_names,genre
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[Drama, Crime]","Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[Drama, Crime]","Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[Drama, Crime]","Drama, Crime"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[Drama, History, War]","Drama, History, War"
4,12 Angry Men,The defense and the prosecution have rested an...,[Drama],Drama
...,...,...,...,...
9995,Stop! Or My Mom Will Shoot,A tough police sergeant's mother comes to visi...,"[Action, Comedy]","Action, Comedy"
9996,Look Who's Talking Now!,When high-powered executive Samantha LeBon hat...,"[Romance, Comedy, Family]","Romance, Comedy, Family"
9997,Demonic,A police officer and a psychologist investigat...,"[Thriller, Horror]","Thriller, Horror"
9998,The Pyramid,An archaeological team attempt to unlock the s...,[Horror],Horror


In [40]:
# The list column was only needed to build "genre", so remove it.
# errors="ignore" keeps this cell re-runnable: without it a second run raises
# KeyError because the column is already gone.
df_final = df_final.drop(columns="genre_names", errors="ignore")

In [41]:
# Display df_final to confirm that only Name, description and genre remain.
df_final

Unnamed: 0,Name,description,genre
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
4,12 Angry Men,The defense and the prosecution have rested an...,Drama
...,...,...,...
9995,Stop! Or My Mom Will Shoot,A tough police sergeant's mother comes to visi...,"Action, Comedy"
9996,Look Who's Talking Now!,When high-powered executive Samantha LeBon hat...,"Romance, Comedy, Family"
9997,Demonic,A police officer and a psychologist investigat...,"Thriller, Horror"
9998,The Pyramid,An archaeological team attempt to unlock the s...,Horror


In [42]:
# Save the dataset as it stands before the text-preprocessing steps below.
output_path = "dataset/tmdb_movies_final.csv"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

df_final.to_csv(
    output_path,
    index=False,
    encoding="utf-8"
)


## Performing text preprocessing on the dataset

### Lowercasing

In [44]:
# Lowercase every description
lowered_descriptions = df_final['description'].str.lower()
df_final['description'] = lowered_descriptions

In [46]:
# Display df_final to check the lowercased descriptions.
df_final

Unnamed: 0,Name,description,genre
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"Drama, Crime"
3,Schindler's List,the true story of how businessman oskar schind...,"Drama, History, War"
4,12 Angry Men,the defense and the prosecution have rested an...,Drama
...,...,...,...
9995,Stop! Or My Mom Will Shoot,a tough police sergeant's mother comes to visi...,"Action, Comedy"
9996,Look Who's Talking Now!,when high-powered executive samantha lebon hat...,"Romance, Comedy, Family"
9997,Demonic,a police officer and a psychologist investigat...,"Thriller, Horror"
9998,The Pyramid,an archaeological team attempt to unlock the s...,Horror


### Removing HTML tags

In [48]:
import re 

# Compiled once at definition time instead of on every call.
_TAG_PATTERN = re.compile('<.*?>')


def remove_tags(text):
    """Strip HTML-style tags from ``text``.

    Every shortest ``<...>`` span on a single line is deleted; the text between
    tags is kept.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Non-string values (e.g. NaN / None for a missing
        description) are returned unchanged instead of raising TypeError,
        matching the guards in the tokenization and lemmatization helpers.
    """
    if not isinstance(text, str):
        return text
    return _TAG_PATTERN.sub(r'', text)

In [49]:
# Run the tag stripper over every description
df_final['description'] = df_final['description'].map(remove_tags)

In [50]:
# Display df_final after the tag-removal pass.
df_final

Unnamed: 0,Name,description,genre
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"Drama, Crime"
3,Schindler's List,the true story of how businessman oskar schind...,"Drama, History, War"
4,12 Angry Men,the defense and the prosecution have rested an...,Drama
...,...,...,...
9995,Stop! Or My Mom Will Shoot,a tough police sergeant's mother comes to visi...,"Action, Comedy"
9996,Look Who's Talking Now!,when high-powered executive samantha lebon hat...,"Romance, Comedy, Family"
9997,Demonic,a police officer and a psychologist investigat...,"Thriller, Horror"
9998,The Pyramid,an archaeological team attempt to unlock the s...,Horror


### Removing URLs

In [52]:
# Compiled once at definition time instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def remove_url(text):
    """Delete URLs from ``text``.

    Removes every run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Surrounding whitespace is left in
    place.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Non-string values (e.g. NaN / None for a missing
        description) are returned unchanged instead of raising TypeError,
        matching the guards in the tokenization and lemmatization helpers.
    """
    if not isinstance(text, str):
        return text
    return _URL_PATTERN.sub(r'', text)

In [53]:
# Run the URL stripper over every description
df_final['description'] = df_final['description'].map(remove_url)

### Removing punctuation

In [55]:
import string
# Characters to strip: the ASCII punctuation set from the string module
exclude = string.punctuation

def remove_punc_optimized(text):
    """Delete every character listed in ``exclude`` from ``text``.

    Characters are removed, not replaced by a space, so "high-powered" becomes
    "highpowered".

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Non-string values (e.g. NaN / None for a missing
        description) are returned unchanged instead of raising AttributeError,
        matching the guards in the tokenization and lemmatization helpers.
    """
    if not isinstance(text, str):
        return text
    return text.translate(str.maketrans('', '', exclude))

# Run the punctuation stripper over every description
df_final['description'] = df_final['description'].map(remove_punc_optimized)

### Tokenization plus stop word removal


In [58]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK's English stop-word list, held in a set for membership tests
stop_words = set(stopwords.words("english"))


def tokenize_and_remove_stopwords(text):
    """Tokenize ``text`` with NLTK and drop English stop words.

    Args:
        text: the string to tokenize.

    Returns:
        list of the tokens from ``word_tokenize`` that are not in
        ``stop_words``; an empty list when ``text`` is not a string.
    """
    if isinstance(text, str):
        return [tok for tok in word_tokenize(text) if tok not in stop_words]
    return []


df_final["tokens"] = df_final["description"].map(tokenize_and_remove_stopwords)


In [59]:
# Display df_final with the new "tokens" column.
df_final

Unnamed: 0,Name,description,genre,tokens
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"Drama, Crime","[imprisoned, 1940s, double, murder, wife, love..."
1,The Godfather,spanning the years 1945 to 1955 a chronicle of...,"Drama, Crime","[spanning, years, 1945, 1955, chronicle, ficti..."
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"Drama, Crime","[continuing, saga, corleone, crime, family, yo..."
3,Schindler's List,the true story of how businessman oskar schind...,"Drama, History, War","[true, story, businessman, oskar, schindler, s..."
4,12 Angry Men,the defense and the prosecution have rested an...,Drama,"[defense, prosecution, rested, jury, filing, j..."
...,...,...,...,...
9995,Stop! Or My Mom Will Shoot,a tough police sergeants mother comes to visit...,"Action, Comedy","[tough, police, sergeants, mother, comes, visi..."
9996,Look Who's Talking Now!,when highpowered executive samantha lebon hatc...,"Romance, Comedy, Family","[highpowered, executive, samantha, lebon, hatc..."
9997,Demonic,a police officer and a psychologist investigat...,"Thriller, Horror","[police, officer, psychologist, investigate, d..."
9998,The Pyramid,an archaeological team attempt to unlock the s...,Horror,"[archaeological, team, attempt, unlock, secret..."


### Lemmatization

In [62]:
import spacy

# Small English spaCy pipeline with the NER and parser components disabled
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])


def lemmatize(text):
    """Lemmatize ``text`` with spaCy and drop spaCy stop words.

    Args:
        text: the string to process.

    Returns:
        The lemmas of all tokens that spaCy does not flag as stop words,
        joined by single spaces. Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        kept_lemmas = [tok.lemma_ for tok in nlp(text) if not tok.is_stop]
        return " ".join(kept_lemmas)
    return text


df_final["lemmatized_overview"] = df_final["description"].map(lemmatize)

df_final

Unnamed: 0,Name,description,genre,tokens,lemmatized_overview
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"Drama, Crime","[imprisoned, 1940s, double, murder, wife, love...",imprison 1940 double murder wife lover upstand...
1,The Godfather,spanning the years 1945 to 1955 a chronicle of...,"Drama, Crime","[spanning, years, 1945, 1955, chronicle, ficti...",span year 1945 1955 chronicle fictional italia...
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"Drama, Crime","[continuing, saga, corleone, crime, family, yo...",continue saga corleone crime family young vito...
3,Schindler's List,the true story of how businessman oskar schind...,"Drama, History, War","[true, story, businessman, oskar, schindler, s...",true story businessman oskar schindler save th...
4,12 Angry Men,the defense and the prosecution have rested an...,Drama,"[defense, prosecution, rested, jury, filing, j...",defense prosecution rest jury file jury room d...
...,...,...,...,...,...
9995,Stop! Or My Mom Will Shoot,a tough police sergeants mother comes to visit...,"Action, Comedy","[tough, police, sergeants, mother, comes, visi...",tough police sergeant mother come visit prompt...
9996,Look Who's Talking Now!,when highpowered executive samantha lebon hatc...,"Romance, Comedy, Family","[highpowered, executive, samantha, lebon, hatc...",highpowered executive samantha lebon hatch sch...
9997,Demonic,a police officer and a psychologist investigat...,"Thriller, Horror","[police, officer, psychologist, investigate, d...",police officer psychologist investigate death ...
9998,The Pyramid,an archaeological team attempt to unlock the s...,Horror,"[archaeological, team, attempt, unlock, secret...",archaeological team attempt unlock secret lost...
