In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the Netflix catalogue from a CSV located in the notebook's working
# directory and preview its first rows.
main_df = pd.read_csv('./netflix_titles.csv')
main_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# 데이터 살펴보기

In [3]:
# Number of missing (NaN) values in each column.
main_df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [4]:
# Number of distinct values in each column.
main_df.nunique()

show_id         8807
type               2
title           8807
director        4528
cast            7692
country          748
date_added      1767
release_year      74
rating            17
duration         220
listed_in        514
description     8775
dtype: int64

# 결측치 삭제


- 시간 변화에 따른 흐름을 보고자 한다.
- 2016년도 이후 급격히 데이터가 늘어나, 시간 순서대로 보기에는 어려움이 있다.
- 따라서, 10개 구간으로 나눈 후 변화의 추이를 보고자 한다.


- 전체 데이터 8807개를 10개 구간으로 나누기 위해서 결측치 17개를 삭제한다.
- date_added, rating, duration 3가지 항목에 결측치가 있는 행을 삭제한다.


- country의 경우 우선은 두고, 다른 방법으로 살펴볼 수 있도록 한다.

In [5]:
# Discard every row that lacks a value in date_added (10 rows), rating
# (4 rows) or duration (3 rows) in one pass; all other columns may stay NaN.
main_df = main_df.dropna(subset=['date_added', 'rating', 'duration'])

# Re-count the remaining missing values to confirm the three columns are clean.
main_df.isna().sum()

show_id            0
type               0
title              0
director        2621
cast             825
country          829
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64

# 데이터 타입 변환

In [6]:
# Inspect the dtype of every column before converting any of them.
main_df.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

- date_added는 datetime으로 변경한다.

In [7]:
# Parse the textual dates (e.g. "September 25, 2021") into datetime64[ns].
# pd.to_datetime is used instead of astype('datetime64') because the unit-less
# 'datetime64' dtype is rejected by newer pandas releases. Surrounding
# whitespace is stripped first so that a stray leading/trailing space in any
# value cannot break format inference.
main_df['date_added'] = pd.to_datetime(main_df['date_added'].str.strip())
main_df['date_added']

0      2021-09-25
1      2021-09-24
2      2021-09-24
3      2021-09-24
4      2021-09-24
          ...    
8802   2019-11-20
8803   2019-07-01
8804   2019-11-01
8805   2020-01-11
8806   2019-03-02
Name: date_added, Length: 8790, dtype: datetime64[ns]

# 데이터 병합하기

### NetflixOriginals.csv 
### 데이터 불러오기

In [8]:
# Load the Netflix Originals table (title, genre, premiere date, runtime,
# IMDB score, language) from the working directory and preview it.
original_df = pd.read_csv('./NetflixOriginals.csv')
original_df.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,5.Aug.19,58,2.5,English/Japanese
1,Dark Forces,Thriller,21.Aug.20,81,2.6,Spanish
2,The App,Science fiction/Drama,26.Dec.19,79,2.6,Italian
3,The Open House,Horror thriller,19.Jan.18,94,3.2,English
4,Kaali Khuhi,Mystery,30.Oct.20,90,3.4,Hindi


### 데이터 타입 변환

- Premiere를 datetime으로 변환한다.

In [9]:
# Parse the premiere dates (shown as e.g. "5.Aug.19") into datetime64[ns].
# pd.to_datetime replaces astype('datetime64'): the unit-less 'datetime64'
# dtype is rejected by newer pandas releases, while to_datetime works on both
# old and new versions and yields the same values.
original_df['Premiere'] = pd.to_datetime(original_df['Premiere'])
original_df['Premiere']

0     2019-08-05
1     2020-08-21
2     2019-12-26
3     2018-01-19
4     2020-10-30
         ...    
579   2018-12-31
580   2015-10-09
581   2018-12-16
582   2020-12-08
583   2020-10-04
Name: Premiere, Length: 584, dtype: datetime64[ns]

### columns 삭제, 이름 변경

- main_df와 병합을 title을 기준으로 할 것이기 때문에 Title 을 title로 변경한다.
- Genre, Premiere, Runtime은 main_df와 동일한 것이기 때문에 삭제한다.

In [10]:
# Remove the columns whose information main_df already provides, keeping
# only Title, IMDB Score and Language.
original_df = original_df.drop(columns=['Genre', 'Premiere', 'Runtime'])
original_df.head()

Unnamed: 0,Title,IMDB Score,Language
0,Enter the Anime,2.5,English/Japanese
1,Dark Forces,2.6,Spanish
2,The App,2.6,Italian
3,The Open House,3.2,English
4,Kaali Khuhi,3.4,Hindi


In [11]:
# Rename the columns so that the join key matches main_df ('title').
# An explicit old -> new mapping is used instead of assigning a positional
# list to `.columns`: it cannot mislabel data if the column order or count
# ever changes, and re-running the cell is harmless.
original_df = original_df.rename(columns={
    'Title': 'title',
    'IMDB Score': 'imdb score',
    'Language': 'language',
})
original_df

Unnamed: 0,title,imdb score,language
0,Enter the Anime,2.5,English/Japanese
1,Dark Forces,2.6,Spanish
2,The App,2.6,Italian
3,The Open House,3.2,English
4,Kaali Khuhi,3.4,Hindi
...,...,...,...
579,Taylor Swift: Reputation Stadium Tour,8.4,English
580,Winter on Fire: Ukraine's Fight for Freedom,8.4,English/Ukranian/Russian
581,Springsteen on Broadway,8.5,English
582,Emicida: AmarElo - It's All For Yesterday,8.6,Portuguese


### main_df 와 original_df 병합하기

In [12]:
# Intersection of main_df and original_df: keep only the catalogue rows whose
# title also appears in the Netflix Originals table, and attach that table's
# 'imdb score' and 'language' columns.
# The key and join type are spelled out so the result does not depend on
# which column names the two frames happen to share. Titles are compared as
# exact strings.
onlyoriginal = pd.merge(main_df, original_df, on='title', how='inner')
onlyoriginal

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,imdb score,language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",7.5,English
1,s142,Movie,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada",2021-09-01,2015,R,82 min,Action & Adventure,"When a retired CIA agent is kidnapped, his son...",6.7,English
2,s625,Movie,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",,2021-06-30,2021,TV-MA,92 min,"Dramas, International Movies, Thrillers","On New Year’s Eve 1999, an armed man enters a ...",5.7,Polish
3,s835,Movie,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,2021-05-27,2021,TV-PG,97 min,"Children & Family Movies, Dramas, Faith & Spir...","To save their cash-strapped orphanage, a guard...",6.7,English
4,s837,Movie,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,2021-05-27,2021,TV-MA,118 min,"Horror Movies, International Movies, Thrillers","After witnessing a haunting in their hospital,...",5.2,Thai
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,s5902,Movie,Tig,"Kristina Goolsby, Ashley York",Tig Notaro,United States,2015-07-17,2015,TV-14,92 min,"Documentaries, Stand-Up Comedy",Comedian Tig Notaro announced her cancer diagn...,7.4,English
499,s5907,Movie,"What Happened, Miss Simone?",Liz Garbus,,United States,2015-06-26,2015,TV-14,103 min,"Documentaries, Music & Musicals","Using never-before-heard recordings, rare arch...",7.6,English
500,s5911,Movie,Hot Girls Wanted,"Jill Bauer, Ronna Gradus",,United States,2015-05-29,2015,TV-MA,83 min,Documentaries,This 2015 Sundance Film Festival breakout docu...,6.1,English
501,s5913,Movie,The Other One: The Long Strange Trip of Bob Weir,Mike Fleiss,Bob Weir,United States,2015-05-22,2015,TV-14,84 min,"Documentaries, Music & Musicals",This chronicle of Bob Weir highlights his brot...,7.3,English


In [13]:
# Save the merged frame as onlyoriginal.csv in the working directory.
# No `index` argument is passed, so pandas' default applies and the row index
# is written as the first column.
onlyoriginal.to_csv('onlyoriginal.csv')

***
### main_df 지역별로 묶기
- 대륙별 수익 데이터와 연결하기 위해 main_df의 country의 값을 대륙별로 정리한다.

In [14]:
# List the distinct raw values of the country column; co-productions appear
# as a single comma-separated string.
main_df['country'].unique()

array(['United States', 'South Africa', nan, 'India',
       'United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia',
       'United Kingdom', 'Germany, Czech Republic', 'Mexico', 'Turkey',
       'Australia', 'United States, India, France', 'Finland',
       'China, Canada, United States',
       'South Africa, United States, Japan', 'Nigeria', 'Japan',
       'Spain, United States', 'France', 'Belgium',
       'United Kingdom, United States', 'United States, United Kingdom',
       'France, United States', 'South Korea', 'Spain',
       'United States, Singapore', 'United Kingdom, Australia, France',
       'United Kingdom, Australia, France, United States',
       'United States, Canada', 'Germany, United States',
       'South Africa, United States', 'United States, Mexico',
       'United States, Italy, France, Japan',
       'United States, Italy, Romania, United Kingdom',
       'Australia, United States', 'Argentina, Venezuela',
       'United States, United Kin

- 하고싶은 것: '대륙별 수익과 구독자 수 변화'와 '넷플릭스에서 대륙별로 제작하는 콘텐츠 수 변화'를 함께 비교해보고 싶다.
- country의 unique값이 너무 많고, 공동 제작한 작품이 많기 때문에 정리가 필요하다.
- country의 국가를 먼저 split(',')으로 나누고, 대륙별로 리스트에 넣은 다음, date_added와 함께 데이터프레임을 만든다.
- '대륙별 수익과 구독자 수' 데이터셋은 연도 기준으로만 집계되어 있기 때문에 date_added에서 연도만 추출한다.

In [105]:
# Accumulators for the per-country records (re-initialised by the
# classification cell that follows, so these three assignments are only
# placeholders here).
country = [] # country name
date_added = [] # date the title was added
cont = [] # region label

# Country names grouped into the four regions used for the analysis:
#   us_canada -> 'United States and Canada'
#   eu_me_af  -> 'Europe, Middle East, and Africa'
#   la        -> 'Latin America'
#   ap        -> 'Asia-Pacific'
# NOTE: some names occur in more than one list ('United States', 'Canada',
# 'Mexico', 'Chile', 'Peru', 'Turkey', 'Iran', 'Russia', 'Georgia'). The
# classification loop tests the lists in the order us_canada, eu_me_af, la,
# ap and stops at the first hit, so those duplicated entries in `ap` are
# never the ones that match.
us_canada = ['United States', 'Canada']
eu_me_af = ['Vatican City', 'Gibraltar', 'San Marino', 'Liechtenstein', 'Monaco',
            'Faroe Islands', 'Andorra', 'Isle of Man', 'Seychelles', 'Mayotte',
            'Iceland', 'Malta', 'Cape Verde', 'Western Sahara', 'Montenegro',
            'Luxembourg', 'Comoros', 'Reunion', 'Djibouti', 'Cyprus',
            'Mauritius', 'Estonia', 'Equatorial Guinea', 'Bahrain', 'Latvia',
            'Guinea Bissau', 'Slovenia', 'Lesotho', 'Gabon', 'Botswana',
            'Gambia', 'Namibia', 'Lithuania', 'Albania', 'Qatar',
            'Bosnia And Herzegovina', 'Eritrea', 'Georgia', 'Moldova', 'Croatia',
            'Kuwait', 'Mauritania', 'Central African Republic', 'Ireland', 'Liberia',
            'Oman', 'Palestine', 'Slovakia', 'Norway', 'Finland', 'Republic of the Congo',
            'Denmark', 'Lebanon', 'Bulgaria', 'Libya', 'Sierra Leone', 'Serbia',
            'Togo', 'Switzerland', 'Israel', 'Austria', 'Belarus',
            'Hungary', 'United Arab Emirates', 'Portugal', 'Sweden', 'Jordan',
            'Greece', 'Czech Republic', 'Belgium', 'Tunisia', 'Burundi',
            'Benin', 'Rwanda', 'Guinea', 'Zimbabwe', 'Somalia', 'Netherlands',
            'Chad', 'Senegal', 'Romania', 'Syria', 'Zambia', 'Malawi',
            'Mali', 'Burkina Faso', 'Niger', 'Ivory Coast', 'Cameroon',
            'Madagascar', 'Yemen', 'Ghana', 'Mozambique', 'Angola', 'Saudi Arabia',
            'Poland', 'Morocco', 'Iraq', 'Ukraine', 'Algeria', 'Sudan', 'Spain',
            'Uganda', 'Kenya', 'Italy', 'South Africa', 'Tanzania', 'France', 'United Kingdom',
            'Germany', 'Turkey', 'Iran', 'DR Congo', 'Egypt', 'Ethiopia', 'Russia', 'Nigeria']
la = ['Puerto Rico', 'Uruguay', 'Panama', 'Costa Rica', 'El Salvador', 'Nicaragua', 'Paraguay',
      'Honduras', 'Dominican Republic', 'Cuba', 'Bolivia', 'Ecuador', 'Guatemala', 'Chile', 'Venezuela',
      'Peru', 'Argentina', 'Colombia', 'Mexico', 'Brazil']
ap = ['Niue', 'Nauru', 'Tuvalu', 'Cook Islands', 'Palau', 'Northern Mariana Islands', 'Marshall Islands',
      'Tonga', 'Micronesia', 'Kiribati', 'Guam', 'Samoa', 'French Polynesia', 'New Caledonia', 
      'Vanuatu', 'Brunei', 'Maldives', 'Macau', 'Solomon Islands', 'Bhutan', 'Fiji', 'Timor-Leste', 'Armenia',
      'Mongolia', 'Georgia', 'New Zealand', 'Singapore', 'Turkmenistan', 'Kyrgyzstan', 'Laos',
      'Hong Kong', 'Papua New Guinea', 'Tajikistan', 'Azerbaijan', 'Cambodia', 'Kazakhstan', 'Chile', 'Sri Lanka',
      'North Korea', 'Australia', 'Nepal', 'Malaysia', 'Peru', 'Uzbekistan', 'Canada', 'Afghanistan',
      'South Korea', 'Myanmar', 'Thailand', 'Turkey', 'Iran', 'Vietnam', 'Philippines', 'Japan',
      'Mexico', 'Russia', 'Bangladesh', 'Pakistan', 'Indonesia', 'United States', 'India', 'China']

In [129]:
# Explode the comma-separated `country` column into one record per
# (title, country) pair and label each record with its region.
# The three lists are parallel: index k of each describes the same record.
country = []     # matched country name
date_added = []  # one-row `date_added` Series of the title the country belongs to
cont = []        # region label of the country

# Titles without a country get the placeholder 'Unknown'; it matches no
# region list, so those titles contribute no records.
main_df['country'] = main_df['country'].fillna('Unknown')

# Build a single name -> region lookup. The groups are listed in the order of
# precedence (us_canada, eu_me_af, la, ap) and setdefault keeps the first
# region seen, so a name that occurs in several lists resolves to the
# earliest one.
region_groups = [
    ('United States and Canada', us_canada),
    ('Europe, Middle East, and Africa', eu_me_af),
    ('Latin America', la),
    ('Asia-Pacific', ap),
]
region_of = {}
for group_label, group_countries in region_groups:
    for group_country in group_countries:
        region_of.setdefault(group_country, group_label)

# Names that match no region are skipped; they are collected here so the
# amount of dropped data can be inspected instead of vanishing silently.
unmapped_countries = set()

added_col = main_df['date_added']
for pos, raw_countries in enumerate(main_df['country']):
    # Keep the date as a one-row Series (positional slice) because the cell
    # that builds cont_df consumes the entries in that form.
    added = added_col.iloc[pos:pos + 1]
    for name in raw_countries.split(','):
        name = name.strip()
        region_label = region_of.get(name)
        if region_label is None:
            unmapped_countries.add(name)
            continue
        country.append(name)
        date_added.append(added)
        cont.append(region_label)

# Sanity check: the three parallel lists must have the same length.
print(len(country))
print(len(date_added))
print(len(cont))

9894
9894
9894


In [130]:
# Per-country records with the year each title was added and its region.
def _added_year(entry):
    """Return the year of one `date_added` entry as a string.

    `entry` is normally the one-row Series collected by the classification
    loop; a bare timestamp-like scalar is accepted as well. Reading `.year`
    from the timestamp replaces slicing the printed repr of the Series,
    which only worked while the repr had exactly the expected layout.
    """
    stamp = entry.iloc[0] if isinstance(entry, pd.Series) else entry
    return str(pd.Timestamp(stamp).year)


cont_df = pd.DataFrame({
    'country': country,
    'date_added': [_added_year(entry) for entry in date_added],
    'cont': cont,
}, columns=['date_added', 'cont', 'country'])

cont_df

Unnamed: 0,date_added,cont,country
0,2021,United States and Canada,United States
1,2021,"Europe, Middle East, and Africa",South Africa
2,2021,Asia-Pacific,India
3,2021,United States and Canada,United States
4,2021,"Europe, Middle East, and Africa",Ghana
...,...,...,...
9889,2016,"Europe, Middle East, and Africa",Jordan
9890,2019,United States and Canada,United States
9891,2019,United States and Canada,United States
9892,2020,United States and Canada,United States


In [131]:
# Save cont_df as cont_df.csv in the working directory (the row index is
# written too, since no `index` argument is passed).
cont_df.to_csv('cont_df.csv')

In [133]:
# Save the cleaned main_df as main_df.csv in the working directory. At this
# point missing countries have been replaced by 'Unknown' in the
# classification cell, so that placeholder is what gets written.
main_df.to_csv('main_df.csv')