In [1]:
import pandas as pd


In [2]:
file_path = 'imdb_datasets/title.basics.tsv'

df = pd.read_csv(file_path, sep='\t', low_memory=False)

df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
# titleType의 종류
title_types = df['titleType'].unique()
print("Title Types:")
print(title_types)

# primaryTitle과 originalTitle이 다른 행 필터링
different_titles = df[df['primaryTitle'] != df['originalTitle']]

# primaryTitle과 originalTitle이 다른 행의 개수
num_different_titles = different_titles.shape[0]
print(f"\nNumber of titles where primaryTitle and originalTitle are different: {num_different_titles}")

# primaryTitle과 originalTitle이 다른 행의 예시
print("\nExamples where primaryTitle and originalTitle are different:")
print(different_titles[['primaryTitle', 'originalTitle']].head())

# genres 컬럼에서 고유한 장르 추출
genres_list = df['genres'].dropna().str.split(',').explode().unique()
print("\nGenres:")
print(genres_list)


Title Types:
['short' 'movie' 'tvShort' 'tvMovie' 'tvSeries' 'tvEpisode' 'tvMiniSeries'
 'tvSpecial' 'video' 'videoGame' 'tvPilot']

Number of titles where primaryTitle and originalTitle are different: 154010

Examples where primaryTitle and originalTitle are different:
                                   primaryTitle  \
9                           Leaving the Factory   
11                       The Arrival of a Train   
12  The Photographical Congress Arrives in Lyon   
13                          The Waterer Watered   
15                        Boat Leaving the Port   

                                        originalTitle  
9                 La sortie de l'usine Lumière à Lyon  
11                   L'arrivée d'un train à La Ciotat  
12  Le débarquement du congrès de photographie à Lyon  
13                                  L'arroseur arrosé  
15                             Barque sortant du port  

Genres:
['Documentary' 'Short' 'Animation' 'Comedy' 'Romance' 'Sport' 'News'
 'Drama'

In [16]:
### Let's delete Genre with \\n, and we may just use primaryTitle, just for analysis. 
### We may use genres and titleType for generating hashtag. 

# \\N 값을 가지는 행 제거 및 필요한 컬럼만 남기기
df_cleaned = df[df['genres'] != '\\N'][['primaryTitle', 'genres', 'titleType']]
# NaN 값을 제거
df_cleaned = df_cleaned.dropna(subset=['genres'])
# 해시태그 생성 함수
def generate_hashtags(row):
    hashtags = []
    # titleType 해시태그
    hashtags.append(f"#{row['titleType'].replace(' ', '').lower()}")
    # genres 해시태그
    genres = row['genres'].split(',')

    for genre in genres:
        if genre in hashtags:
            continue
        hashtags.append(f"#{genre.replace(' ', '').lower()}")
    return ' '.join(hashtags)

# 해시태그 컬럼 추가
df_cleaned['hashtags'] = df_cleaned.apply(generate_hashtags, axis=1)
df_cleaned.head()

Unnamed: 0,primaryTitle,genres,titleType,hashtags
0,Carmencita,"Documentary,Short",short,#short #documentary #short
1,Le clown et ses chiens,"Animation,Short",short,#short #animation #short
2,Pauvre Pierrot,"Animation,Comedy,Romance",short,#short #animation #comedy #romance
3,Un bon bock,"Animation,Short",short,#short #animation #short
4,Blacksmith Scene,"Comedy,Short",short,#short #comedy #short
