In [1]:
import pandas as pd
import ast
import os
import boto3
import sagemaker
import numpy as np

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# Define file paths for the data files
movies_metadata_path = './movies_metadata.csv'
credits_path = './credits.csv'
keywords_path = './keywords.csv'
ratings_path = './ratings_small.csv'

In [3]:
# Load the data into DataFrames
movies_metadata_df = pd.read_csv(movies_metadata_path)
credits_df = pd.read_csv(credits_path)
keywords_df = pd.read_csv(keywords_path)
ratings_df = pd.read_csv(ratings_path)

  movies_metadata_df = pd.read_csv(movies_metadata_path)


In [4]:
# Check the loaded data
print("movies_metadata_df:")
print(movies_metadata_df.head())
print("\ncredits_df:")
print(credits_df.head())
print("\nkeywords_df:")
print(keywords_df.head())
print("\nratings_df:")
print(ratings_df.head())

movies_metadata_df:
   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                   NaN   8844

In [5]:
# Standardize the data type of the 'id' column for data merging
credits_df['id'] = pd.to_numeric(credits_df['id'], errors='coerce')
keywords_df['id'] = pd.to_numeric(keywords_df['id'], errors='coerce')
movies_metadata_df['id'] = pd.to_numeric(movies_metadata_df['id'], errors='coerce')

In [6]:
# Merge the data frames
merged_df = pd.merge(movies_metadata_df, credits_df, on='id', how='left')
merged_df = pd.merge(merged_df, keywords_df, on='id', how='left')
print("\nMerged DataFrame:")
print(merged_df.head())


Merged DataFrame:
   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                               homepage       id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862.0  tt0114709                en   
1                                   NaN   8

In [7]:
# Modify data types and handle missing values
merged_df['budget'] = pd.to_numeric(merged_df['budget'], errors='coerce')
merged_df['popularity'] = pd.to_numeric(merged_df['popularity'], errors='coerce')
merged_df['adult'] = merged_df['adult'].astype('bool')
merged_df['video'] = merged_df['video'].astype('bool')
merged_df = merged_df.dropna(subset=['id'])
merged_df['id'] = merged_df['id'].astype('int')

numeric_columns = merged_df.select_dtypes(include=['float64', 'int64']).columns
string_columns = merged_df.select_dtypes(include=['object']).columns

merged_df[numeric_columns] = merged_df[numeric_columns].fillna(merged_df[numeric_columns].median())
merged_df[string_columns] = merged_df[string_columns].fillna('')
print("\nModified DataFrame:")
print(merged_df.head())


Modified DataFrame:
   adult                              belongs_to_collection      budget  \
0   True  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000.0   
1   True                                                     65000000.0   
2   True  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0.0   
3   True                                                     16000000.0   
4   True  {'id': 96871, 'name': 'Father of the Bride Col...         0.0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                

In [8]:
# Parse JSON-formatted columns
def safe_parse_json_column(column, key):
    def parse(row):
        try:
            data = ast.literal_eval(row)
            if isinstance(data, list):
                return [i[key] for i in data]
            return []
        except (ValueError, SyntaxError):
            return []
    return column.apply(parse)

merged_df['genres'] = safe_parse_json_column(merged_df['genres'], 'name')
merged_df['cast'] = safe_parse_json_column(merged_df['cast'], 'name')
merged_df['crew'] = safe_parse_json_column(merged_df['crew'], 'name')
merged_df['keywords'] = safe_parse_json_column(merged_df['keywords'], 'name')
print("\nParsed Columns:")
print(merged_df[['genres', 'cast', 'crew', 'keywords']].head())


Parsed Columns:
                         genres  \
0   [Animation, Comedy, Family]   
1  [Adventure, Fantasy, Family]   
2             [Romance, Comedy]   
3      [Comedy, Drama, Romance]   
4                      [Comedy]   

                                                cast  \
0  [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...   
1  [Robin Williams, Jonathan Hyde, Kirsten Dunst,...   
2  [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...   
3  [Whitney Houston, Angela Bassett, Loretta Devi...   
4  [Steve Martin, Diane Keaton, Martin Short, Kim...   

                                                crew  \
0  [John Lasseter, Joss Whedon, Andrew Stanton, J...   
1  [Larry J. Franco, Jonathan Hensleigh, James Ho...   
2  [Howard Deutch, Mark Steven Johnson, Mark Stev...   
3  [Forest Whitaker, Ronald Bass, Ronald Bass, Ez...   
4  [Alan Silvestri, Elliot Davis, Nancy Meyers, N...   

                                            keywords  
0  [jealousy, toy, boy, friendship, friends

In [9]:
# Extract top 3 actors
merged_df['top_3_actors'] = merged_df['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)
print("\nExtracted Top 3 Actors:")
print(merged_df[['title', 'top_3_actors']].head())


Extracted Top 3 Actors:
                         title  \
0                    Toy Story   
1                      Jumanji   
2             Grumpier Old Men   
3            Waiting to Exhale   
4  Father of the Bride Part II   

                                        top_3_actors  
0                [Tom Hanks, Tim Allen, Don Rickles]  
1     [Robin Williams, Jonathan Hyde, Kirsten Dunst]  
2         [Walter Matthau, Jack Lemmon, Ann-Margret]  
3  [Whitney Houston, Angela Bassett, Loretta Devine]  
4         [Steve Martin, Diane Keaton, Martin Short]  


In [10]:
# Define a function to extract director information from the 'crew' column and apply it
def extract_director_from_crew(crew_json):
    crew_data = ast.literal_eval(crew_json)
    for crew_member in crew_data:
        if crew_member.get('job') == 'Director':
            return crew_member.get('name')
    return None

credits_df['director'] = credits_df['crew'].apply(extract_director_from_crew)

In [11]:
# Add director information to the merged DataFrame
merged_df = pd.merge(merged_df, credits_df[['id', 'director']], on='id', how='left')
print("Merged DataFrame (Director Info added):")
print(merged_df.head())

Merged DataFrame (Director Info added):
   adult                              belongs_to_collection      budget  \
0   True  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000.0   
1   True                                                     65000000.0   
2   True  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0.0   
3   True                                                     16000000.0   
4   True  {'id': 96871, 'name': 'Father of the Bride Col...         0.0   

                         genres                              homepage     id  \
0   [Animation, Comedy, Family]  http://toystory.disney.com/toy-story    862   
1  [Adventure, Fantasy, Family]                                         8844   
2             [Romance, Comedy]                                        15602   
3      [Comedy, Drama, Romance]                                        31357   
4                      [Comedy]                                        11862   

     imdb_id original_langua

In [12]:
# Print extracted director information
print("\nExtracted Director Information:")
print(merged_df[['title', 'director']].head())


Extracted Director Information:
                         title         director
0                    Toy Story    John Lasseter
1                      Jumanji     Joe Johnston
2             Grumpier Old Men    Howard Deutch
3            Waiting to Exhale  Forest Whitaker
4  Father of the Bride Part II    Charles Shyer


In [13]:
#sagemaker code
# get current session region
session = boto3.session.Session()
region = session.region_name
print(f'currently in {region}')

currently in us-east-2


In [14]:
# 처리된 데이터를 저장하기 위해서 기본 sagemaker s3 버킷을 사용합니다. 
# 여기서는 기본 버킷 이름이 다음의 코드가 나타냅니다. 
sagemaker_session = sagemaker.Session()
bucket_name = 'sagemaker-gacheon-003'
print(bucket_name)  
# bucket name format: "sagemaker-gacheon-{숫자}"
%store bucket_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker-gacheon-003
Stored 'bucket_name' (str)


In [16]:
# save data locally first
dest = 'ML_test'
movies_csv_path = os.path.join(dest, 'preprocessed_movies.csv')
credits_csv_path = os.path.join(dest, 'preprocessed_credits.csv')
small_ratings_csv_path = os.path.join('ratings_small.csv')
origin_ratings_csv_path = os.path.join('ratings.csv')
origin_movies_csv_path = os.path.join('movies.csv')

# Ensure the destination directory exists
if not os.path.exists(dest):
    os.makedirs(dest)

# Save the processed DataFrames to CSV files locally
merged_df.to_csv(movies_csv_path, index=False)
credits_df.to_csv(credits_csv_path, index=False)

# Upload the CSV files to your S3 bucket
sagemaker_session.upload_data(movies_csv_path, bucket=bucket_name, key_prefix='data')
sagemaker_session.upload_data(credits_csv_path, bucket=bucket_name, key_prefix='data')
sagemaker_session.upload_data(small_ratings_csv_path, bucket=bucket_name, key_prefix='data')
sagemaker_session.upload_data(origin_ratings_csv_path, bucket=bucket_name, key_prefix='data')
sagemaker_session.upload_data(origin_movies_csv_path, bucket=bucket_name, key_prefix='data')




's3://sagemaker-gacheon-003/data/movies.csv'