In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import json

In [2]:
df = pd.read_csv("movies.csv")

In [4]:
df.describe

<bound method NDFrame.describe of        movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
62418   209157                           We (2018)   
62419   209159           Window of the Soul (2001)   
62420   209163                    Bad Poems (2018)   
62421   209169                 A Girl Thing (2001)   
62422   209171      Women of Devil's Island (1962)   

                                            genres  \
0      Adventure|Animation|Children|Comedy|Fantasy   
1                       Adventure|Children|Fantasy   
2                                   Comedy|Romance   
3                             Comedy|Drama|Romance   
4                                           Com

In [None]:
movie_list=list(df['title'].unique())

In [3]:
# Remove year from title using regex
df['title_clean'] = df['title'].apply(lambda x: re.sub(r'\s*\(\d{4}\)\s*$', '', x))

# Create dictionary with actual movieId
movies_dict = dict(zip(df['movieId'], df['title_clean']))

# Export to JSON
with open('movies_1.json', 'w', encoding='utf-8') as f:
    json.dump(movies_dict, f, indent=2, ensure_ascii=False)

print(f"JSON file created successfully with {len(movies_dict)} movies!")

JSON file created successfully with 62423 movies!


In [None]:

movies = pd.read_csv("movies.csv")
# 2. Split title into name & year

movies["title_year"] = (
    movies["title"]
    .str.extract(r"\((\d{4})\)")
    .astype(float)
)

movies["title_name"] = (
    movies["title"]
    .str.replace(r"\s*\(\d{4}\)", "", regex=True)
)

# 3. Normalize title_year

min_year = movies["title_year"].min()
max_year = movies["title_year"].max()

movies["title_year_norm"] = (
    (movies["title_year"] - min_year) / (max_year - min_year)
)


# Multi-hot encode genres
genres_split = movies["genres"].str.get_dummies(sep="|")

#final encoded dataset
movies_encoded = pd.concat(
    [
        movies[["movieId", "title_year_norm"]],
        genres_split
    ],
    axis=1
)


# 6. Export dataset
movies_encoded.to_csv("movies_encoded.csv", index=False)

print("âœ… movies_encoded dataset exported successfully!")


In [19]:
print(movies_encoded.head(15))

    movieId  title_year_norm  (no genres listed)  Action  Adventure  \
0         1         0.834483                   0       0          1   
1         2         0.834483                   0       0          1   
2         3         0.834483                   0       0          0   
3         4         0.834483                   0       0          0   
4         5         0.834483                   0       0          0   
5         6         0.834483                   0       1          0   
6         7         0.834483                   0       0          0   
7         8         0.834483                   0       0          1   
8         9         0.834483                   0       1          0   
9        10         0.834483                   0       1          1   
10       11         0.834483                   0       0          0   
11       12         0.834483                   0       0          0   
12       13         0.834483                   0       0          1   
13    