## Imports

In [1]:
import pandas as pd
import re
import ast
from collections import Counter
import itertools

## Dataset Overview

In [2]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Plot Kyeword,Director,Top 5 Casts,Writer,year,path
0,Top Gun: Maverick,"$170,000,000 (estimated)",8.6,187K,"['Action', 'Drama']",After more than thirty years of service as one...,"['fighter jet', 'sequel', 'u.s. navy', 'fighte...",Joseph Kosinski,"['Jack Epps Jr.', 'Peter Craig', 'Tom Cruise',...",Jim Cash,-2022,/title/tt1745960/
1,Jurassic World Dominion,2 hours 27 minutes,6.0,56K,"['Action', 'Adventure', 'Sci-Fi']",Four years after the destruction of Isla Nubla...,"['dinosaur', 'jurassic park', 'tyrannosaurus r...",Colin Trevorrow,"['Colin Trevorrow', 'Derek Connolly', 'Chris P...",Emily Carmichael,-2022,/title/tt8041270/
2,Top Gun,"$15,000,000 (estimated)",6.9,380K,"['Action', 'Drama']",As students at the United States Navy's elite ...,"['pilot', 'male camaraderie', 'u.s. navy', 'gr...",Tony Scott,"['Jack Epps Jr.', 'Ehud Yonay', 'Tom Cruise', ...",Jim Cash,-1986,/title/tt0092099/
3,Lightyear,"$71,101,257",5.2,32K,"['Animation', 'Action', 'Adventure']",While spending years attempting to return home...,"['galaxy', 'spaceship', 'robot', 'rocket', 'sp...",Angus MacLane,"['Jason Headley', 'Matthew Aldrich', 'Chris Ev...",Angus MacLane,-2022,/title/tt10298810/
4,Spiderhead,not-released,5.4,23K,"['Action', 'Crime', 'Drama']","In the near future, convicts are offered the c...","['discover', 'medical', 'test', 'reality', 'fi...",Joseph Kosinski,"['Rhett Reese', 'Paul Wernick', 'Chris Hemswor...",George Saunders,-2022,/title/tt9783600/


In [3]:
df.describe().T

Unnamed: 0,count,unique,top,freq
movie title,24402,23922,Rage,4
Run Time,24402,1556,not-released,8475
Rating,24402,91,no-rating,1740
User Rating,24402,1684,0,1740
Generes,24402,746,['Drama'],943
Overview,24158,23957,none,142
Plot Kyeword,24402,21546,[],1696
Director,24402,11604,See company contact information,142
Top 5 Casts,24402,24211,"['See producer', 'See preliminary cast']",142
Writer,24402,15562,See writer,142


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24402 entries, 0 to 24401
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movie title   24402 non-null  object
 1   Run Time      24402 non-null  object
 2   Rating        24402 non-null  object
 3   User Rating   24402 non-null  object
 4   Generes       24402 non-null  object
 5   Overview      24158 non-null  object
 6   Plot Kyeword  24402 non-null  object
 7   Director      24402 non-null  object
 8   Top 5 Casts   24402 non-null  object
 9   Writer        24402 non-null  object
 10  year          23624 non-null  object
 11  path          24402 non-null  object
dtypes: object(12)
memory usage: 2.2+ MB


In [5]:
df.shape

(24402, 12)

In [6]:
df.isnull().sum()

movie title       0
Run Time          0
Rating            0
User Rating       0
Generes           0
Overview        244
Plot Kyeword      0
Director          0
Top 5 Casts       0
Writer            0
year            778
path              0
dtype: int64

In [7]:
# Fixing typo
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df = df.rename(columns={'plot_kyeword': 'plot_keyword', 'generes': 'genres'})

## Dropping Unnecessary columns

In [8]:
# We are not using run time, year, and path to recommend, we should drop the columns.
df = df.drop('run_time', axis=1)
df = df.drop('year', axis=1)
df = df.drop('path', axis=1)
df = df.drop('overview', axis=1)

## High quality Filtering

In [9]:
high_quality_df = df[(df['rating'] >= 6.0) & (df['votes'] >= 10000)]

TypeError: '>=' not supported between instances of 'str' and 'float'

## Rating and Votes Parsing

In [None]:
# Change rating to numbers
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

In [None]:
def parse_votes(value):
    if isinstance(value, str):
        value = value.strip().upper().replace('K', '000').replace('M', '000000')
    try:
        return int(float(value))
    except:
        return None

df['votes'] = df['user_rating'].apply(parse_votes)

## Strings to Lists

In [None]:
for col in ['genres', 'plot_keyword', 'top_5_casts']:
    df[col] = df[col].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

## Genre Cleaning

In [None]:
# Genres Observation
genre_counter = Counter(itertools.chain.from_iterable(high_quality_df['genres']))
print(genre_counter.most_common())

### One-Hot Encoding

In [None]:
# One-Hot Encode Genres
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(high_quality_df['genres'])

genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

high_quality_df = pd.concat([high_quality_df.reset_index(drop=True), genre_df], axis=1)

## Keywords Cleaning

In [None]:
# Keyword Observation
def normalize_keywords(keywords):
    keywords = [kw.lower().strip() for kw in keywords if isinstance(kw, str)]
    return list(set(keywords))

high_quality_df['plot_keyword'] = high_quality_df['plot_keyword'].apply(normalize_keywords)

keyword_counter = Counter(itertools.chain.from_iterable(high_quality_df['plot_keyword']))
print(keyword_counter.most_common())

### Filtering Out Low Frequency Keywords

In [None]:
threshold = 3
low_freq_keywords = {kw for kw, count in keyword_counter.items() if count <= threshold}
len(low_freq_keywords)

In [None]:
def remove_low_freq_keywords(keywords):
    return [kw for kw in keywords if kw not in low_freq_keywords]

high_quality_df['plot_keyword'] = high_quality_df['plot_keyword'].apply(remove_low_freq_keywords)

### Lemmatize keywords

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
lemm = WordNetLemmatizer()

def lemmatize_keywords(keywords):
    return [lemm.lemmatize(kw) for kw in keywords]

high_quality_df['plot_keyword'] = high_quality_df['plot_keyword'].apply(lemmatize_keywords)

In [None]:
high_quality_df.info()

In [None]:
high_quality_df.isnull().sum()

In [None]:
high_quality_df.to_csv('cleaned_high_quality_dataset.csv', index=False)