In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.min_rows', 100)


In [2]:
# 파일 읽어오기
personality_data = '../dataset/personality-isf2018/personality-data.csv'
movies_data = '../dataset/ml-32m/movies.csv'
links_data = '../dataset/ml-32m/links.csv'

personality_df = pd.read_csv(personality_data)
movies_df = pd.read_csv(movies_data)
links_df = pd.read_csv(links_data)

# 불순한 column명들 깔끔히 정리
personality_df.columns = personality_df.columns.str.strip()
movies_df.columns = movies_df.columns.str.strip()
links_df.columns = links_df.columns.str.strip()
# 결측치 제거
personality_df.dropna(inplace=True)
movies_df.dropna(inplace=True)
links_df.dropna(inplace=True)

print(personality_df.head())
print(movies_df.head())
print(links_df.head())


                             userid  openness  agreeableness  emotional_stability  conscientiousness  extraversion assigned metric assigned condition  movie_1  predicted_rating_1  movie_2  predicted_rating_2  movie_3  predicted_rating_3  movie_4  predicted_rating_4  movie_5  predicted_rating_5  movie_6  predicted_rating_6  movie_7  predicted_rating_7  movie_8  predicted_rating_8  movie_9  predicted_rating_9  movie_10  predicted_rating_10  movie_11  predicted_rating_11  movie_12  predicted_rating_12  is_personalized  enjoy_watching
0  8e7cebf9a234c064b75016249f2ac65e       5.0            2.0                  3.0                2.5           6.5     serendipity               high    77658            4.410466    95858            4.271995   115713            4.611922    26674            4.459407    93040            4.147292   117533            4.098206   108979            4.064843   112582            4.149100   120138            4.244817    121372             4.396004    127152            

In [3]:
## 데이터 타입 변환작업

# links.csv에 있던 tmbdId들 int처리
links_df['tmdbId'] = links_df['tmdbId'].astype(int)

# movies.csv에 있던 genres들 정리
movies_df['genres'] = movies_df['genres'].str.split('|')


# personality 특성들 정규화 (1-7 scale)
traits = ['openness', 'agreeableness', 'emotional_stability', 'conscientiousness', 'extraversion']

for trait in traits:
    min_val = 1
    max_val = 7
    personality_df[trait] = (personality_df[trait] - min_val) / (max_val - min_val)

print("\n정규화된 personality 데이터:")
print(personality_df[traits].head())


정규화된 personality 데이터:
   openness  agreeableness  emotional_stability  conscientiousness  extraversion
0  0.666667       0.166667             0.333333           0.250000      0.916667
1  1.000000       0.500000             0.833333           0.750000      0.500000
2  0.500000       0.333333             0.583333           0.166667      0.250000
3  0.750000       0.750000             0.500000           0.583333      0.500000
4  0.750000       0.750000             0.416667           0.583333      0.250000


In [4]:
## 영화 컬럼들(movie_1~12)과 예측평점을 긴 형태(long format)로 변환

movie_columns = [f'movie_{i}' for i in range(1, 13)] # 길게 늘릴 칼럼
rating_columns = [f'predicted_rating_{i}' for i in range(1, 13)] # 제거할 칼럼
# rating_columns 제거
personality_df.drop(columns=rating_columns, inplace=True)
# melt를 사용하여 영화 ID와 평점을 각각의 행으로 변환
melted_df = pd.melt(personality_df,
                    id_vars=personality_df.columns.difference(movie_columns),
                    var_name='movie_number',
                    value_name='movieId')
print('melted_df\n', melted_df.head(24), melted_df.tail(24))
display(melted_df.shape)    

# 예측 평점도 같은 방식으로 변환

## 변환된 데이터프레임 join
# melted_df['userid'] = ratings_melted['userid']

# 유저별로 정렬
sorted_df = melted_df.sort_values(by=['userid', 'movie_number'])
print('sorted_df\n', sorted_df.head(24))


melted_df
     agreeableness assigned condition assigned metric  conscientiousness  emotional_stability  enjoy_watching  extraversion  is_personalized  openness                            userid movie_number  movieId
0        0.166667               high     serendipity           0.250000             0.333333               4      0.916667                4  0.666667  8e7cebf9a234c064b75016249f2ac65e      movie_1    77658
1        0.500000            default             all           0.750000             0.833333               3      0.500000                2  1.000000  77c7d756a093150d4377720abeaeef76      movie_1    94959
2        0.333333             medium     serendipity           0.166667             0.583333               2      0.250000                2  0.500000  b7e8a92987a530cc368719a0e60e26a3      movie_1   110501
3        0.750000             medium      popularity           0.583333             0.500000               3      0.500000                3  0.750000  92561f21446e01

(22008, 12)

sorted_df
        agreeableness assigned condition assigned metric  conscientiousness  emotional_stability  enjoy_watching  extraversion  is_personalized  openness                            userid movie_number  movieId
563         0.833333                low      popularity           0.583333             0.166667               3           0.0                1  0.750000  005fe8678214011d7f92e51f9a546d40      movie_1    94466
17069       0.833333                low      popularity           0.583333             0.166667               3           0.0                1  0.750000  005fe8678214011d7f92e51f9a546d40     movie_10     3435
18903       0.833333                low      popularity           0.583333             0.166667               3           0.0                1  0.750000  005fe8678214011d7f92e51f9a546d40     movie_11    44555
20737       0.833333                low      popularity           0.583333             0.166667               3           0.0                1  0.750000 

In [5]:
# movies_df와 links_df를 movieId를 기준으로 movie_info에 병합
movie_info = pd.merge(movies_df, links_df, on='movieId')

# 최종적으로 personality 데이터와 영화 정보를 병합
final_df = pd.merge(melted_df, movie_info, on='movieId')

print('final_df\n', final_df.head(24), final_df.isnull().sum())
display(final_df.shape)

## 개별 장르로 분리

exploded_df = final_df.explode('genres')
cleaned_df = exploded_df[exploded_df['genres'] != '(no genres listed)']

print('cleaned_df\n', cleaned_df.head(24), cleaned_df.isnull().sum())
display(cleaned_df.shape)

print("모든 컬럼 목록:")
print(cleaned_df.columns.tolist())



final_df
     agreeableness assigned condition assigned metric  conscientiousness  emotional_stability  enjoy_watching  extraversion  is_personalized  openness                            userid movie_number  movieId                                              title                                             genres   imdbId  tmdbId
0        0.500000            default             all           0.750000             0.833333               3      0.500000                2  1.000000  77c7d756a093150d4377720abeaeef76      movie_1    94959                            Moonrise Kingdom (2012)                           [Comedy, Drama, Romance]  1748122   83666
1        0.333333             medium     serendipity           0.166667             0.583333               2      0.250000                2  0.500000  b7e8a92987a530cc368719a0e60e26a3      movie_1   110501                        The Raid 2: Berandal (2014)                          [Action, Crime, Thriller]  2265171  180299
2        0.7500

(19464, 16)

cleaned_df
     agreeableness assigned condition assigned metric  conscientiousness  emotional_stability  enjoy_watching  extraversion  is_personalized  openness                            userid movie_number  movieId                                              title       genres   imdbId  tmdbId
0        0.500000            default             all           0.750000             0.833333               3      0.500000                2  1.000000  77c7d756a093150d4377720abeaeef76      movie_1    94959                            Moonrise Kingdom (2012)       Comedy  1748122   83666
0        0.500000            default             all           0.750000             0.833333               3      0.500000                2  1.000000  77c7d756a093150d4377720abeaeef76      movie_1    94959                            Moonrise Kingdom (2012)        Drama  1748122   83666
0        0.500000            default             all           0.750000             0.833333               3      0.500000     

(50457, 16)

모든 컬럼 목록:
['agreeableness', 'assigned condition', 'assigned metric', 'conscientiousness', 'emotional_stability', 'enjoy_watching', 'extraversion', 'is_personalized', 'openness', 'userid', 'movie_number', 'movieId', 'title', 'genres', 'imdbId', 'tmdbId']


In [6]:
# 1. 장르별 출현 빈도 확인
genre_counts = cleaned_df['genres'].value_counts()
print("장르별 출현 빈도:")
print(genre_counts)
# 출현 빈도 낮은 장르들 others로 따로 분류
total_count = len(cleaned_df)
min_percentage = 0.01  # 5%

genre_counts = cleaned_df['genres'].value_counts()
rare_genres = genre_counts[genre_counts/total_count < min_percentage].index

cleaned_df['genres'] = cleaned_df['genres'].apply(
    lambda x: 'Other' if x in rare_genres else x
)

genre_count = cleaned_df['genres'].value_counts()
print("장르별 출현 빈도:")
print(genre_count)



장르별 출현 빈도:
genres
Drama          10901
Comedy          4816
Thriller        4328
Action          4054
Crime           3867
Adventure       3649
Romance         2959
Mystery         2296
Sci-Fi          2263
Fantasy         2099
Animation       2025
War             1659
Documentary     1412
Children        1326
IMAX            1032
Horror           533
Film-Noir        503
Musical          405
Western          319
Drama             11
Name: count, dtype: int64
장르별 출현 빈도:
genres
Drama          10901
Comedy          4816
Thriller        4328
Action          4054
Crime           3867
Adventure       3649
Romance         2959
Mystery         2296
Sci-Fi          2263
Fantasy         2099
Animation       2025
War             1659
Documentary     1412
Children        1326
Other           1238
IMAX            1032
Horror           533
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['genres'] = cleaned_df['genres'].apply(


In [158]:
print('현재 장르 분포:')
print(cleaned_df['genres'].value_counts())

encoder = LabelEncoder()
cleaned_df['genre_encoded'] = encoder.fit_transform(cleaned_df['genres'])

# 3. 인코딩 결과 확인
# 각 장르가 어떤 숫자로 매핑되었는지 확인
genre_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("\n장르-숫자 매핑:")
for genre, num in sorted(genre_mapping.items()):
    print(f"{genre}: {num}")

# 4. 결과 데이터 확인
print("\n인코딩 결과 (처음 10개):")
print(cleaned_df[['genres', 'genre_encoded']].head(10))

# # 5. 원래 장르로 되돌리기 위한 매핑 저장 (필요시)
# inverse_mapping = dict(zip(le.transform(le.classes_), le.classes_))


현재 장르 분포:
genres
Drama          10901
Comedy          4816
Thriller        4328
Action          4054
Crime           3867
Adventure       3649
Romance         2959
Mystery         2296
Sci-Fi          2263
Fantasy         2099
Animation       2025
War             1659
Documentary     1412
Children        1326
Other           1238
IMAX            1032
Horror           533
Name: count, dtype: int64

장르-숫자 매핑:
Action: 0
Adventure: 1
Animation: 2
Children: 3
Comedy: 4
Crime: 5
Documentary: 6
Drama: 7
Fantasy: 8
Horror: 9
IMAX: 10
Mystery: 11
Other: 12
Romance: 13
Sci-Fi: 14
Thriller: 15
War: 16

인코딩 결과 (처음 10개):
      genres  genre_encoded
0     Comedy              4
0      Drama              7
0    Romance             13
1     Action              0
1      Crime              5
1   Thriller             15
2     Action              0
2  Adventure              1
2      Drama              7
3     Action              0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['genre_encoded'] = encoder.fit_transform(cleaned_df['genres'])


In [159]:
## 최종 확인 주우웅...
print(cleaned_df.isnull().sum())
print(cleaned_df[['imdbId', 'tmdbId', 'genre_encoded']].dtypes)
print(cleaned_df.head(100))
cleaned_df = cleaned_df.sort_values(by=['userid', 'movie_number'])
print(cleaned_df.columns, '\n', cleaned_df.head(100))


agreeableness          0
assigned condition     0
assigned metric        0
conscientiousness      0
emotional_stability    0
enjoy_watching         0
extraversion           0
is_personalized        0
openness               0
userid                 0
movie_number           0
movieId                0
title                  0
genres                 0
imdbId                 0
tmdbId                 0
genre_encoded          0
dtype: int64
imdbId           int64
tmdbId           int32
genre_encoded    int32
dtype: object
    agreeableness assigned condition assigned metric  conscientiousness  emotional_stability  enjoy_watching  extraversion  is_personalized  openness                            userid movie_number  movieId                                          title     genres   imdbId  tmdbId  genre_encoded
0        0.500000            default             all           0.750000             0.833333               3      0.500000                2  1.000000  77c7d756a093150d4377720abeaeef76

In [160]:
## 컬럼 순서 변경
# 원하는 순서대로 컬럼 나열
sorted_cols = ['userid', 'openness', 'conscientiousness', 'extraversion', 'agreeableness', 'emotional_stability', 'title', 'genres', 'genre_encoded', 'is_personalized', 'enjoy_watching', 'movieId', 'tmdbId', 'imdbId']
desired_cols = sorted_cols + [col for col in cleaned_df.columns if col not in sorted_cols]
print(desired_cols)


# 순서 변경
cleaned_df = cleaned_df[desired_cols]
print(cleaned_df.columns, '\n', cleaned_df.head(100))

['userid', 'openness', 'conscientiousness', 'extraversion', 'agreeableness', 'emotional_stability', 'title', 'genres', 'genre_encoded', 'is_personalized', 'enjoy_watching', 'movieId', 'tmdbId', 'imdbId', 'assigned condition', 'assigned metric', 'movie_number']
Index(['userid', 'openness', 'conscientiousness', 'extraversion', 'agreeableness', 'emotional_stability', 'title', 'genres', 'genre_encoded', 'is_personalized', 'enjoy_watching', 'movieId', 'tmdbId', 'imdbId', 'assigned condition', 'assigned metric', 'movie_number'], dtype='object') 
                                  userid  openness  conscientiousness  extraversion  agreeableness  emotional_stability                                              title       genres  genre_encoded  is_personalized  enjoy_watching  movieId  tmdbId   imdbId assigned condition assigned metric movie_number
14857  005fe8678214011d7f92e51f9a546d40  0.750000           0.583333      0.000000       0.833333             0.166667                            Do

In [161]:
# 1. 데이터 형태 확인
print("데이터 크기:", cleaned_df.shape)
print("\n컬럼 목록:", cleaned_df.columns.tolist())

# 2. 결측치 확인
print("\n결측치 확인:")
print(cleaned_df.isnull().sum())

# 3. 데이터 타입 확인
print("\n데이터 타입:")
print(cleaned_df.dtypes)

# 4. 샘플 데이터 확인 (처음 몇 개 행)
print("\n데이터 샘플:")
print(cleaned_df.head())

# 5. 장르 인코딩 확인
print("\n고유 장르 수:", cleaned_df['genres'].nunique())
print("장르 샘플:", cleaned_df['genres'].unique()[:5])

# 모두 확인됐다면 저장
cleaned_df.to_csv('preprocessed_data.csv', index=False)
print("\n데이터 저장 완료!")

데이터 크기: (50457, 17)

컬럼 목록: ['userid', 'openness', 'conscientiousness', 'extraversion', 'agreeableness', 'emotional_stability', 'title', 'genres', 'genre_encoded', 'is_personalized', 'enjoy_watching', 'movieId', 'tmdbId', 'imdbId', 'assigned condition', 'assigned metric', 'movie_number']

결측치 확인:
userid                 0
openness               0
conscientiousness      0
extraversion           0
agreeableness          0
emotional_stability    0
title                  0
genres                 0
genre_encoded          0
is_personalized        0
enjoy_watching         0
movieId                0
tmdbId                 0
imdbId                 0
assigned condition     0
assigned metric        0
movie_number           0
dtype: int64

데이터 타입:
userid                  object
openness               float64
conscientiousness      float64
extraversion           float64
agreeableness          float64
emotional_stability    float64
title                   object
genres                  object
genre_e