In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import itertools

In [215]:
df = pd.read_csv("./netflix_titles.csv")

In [216]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [9]:
# Percent-encoding table for the space and the ASCII special characters that
# are escaped when a keyword is embedded in a URL (a space becomes "+").
for_uri = {
            " ": "+", "!": "%21", "\"": "%22", "#": "%23",
           "%": "%25", "&": "%26", "(": "%28", ")": "%29",
           "+": "%2B", "\\": "%5C", ",": "%2C", ".": "%2E",
           "/": "%2F", ":": "%3A", ";": "%3B", "<": "%3C",
           ">": "%3E", "=": "%3D", "?": "%3F", "@": "%40",
           "{": "%7B", "}": "%7D", "[": "%5B", "]": "%5D",
           "|": "%7C", "~": "%7E", "\'": "%27",
          }

# Encode a keyword so that it fits into a URI
def to_uri(keyword):

    """

    Params: keyword - the search keyword (str)
    Replaces whitespace and special characters so the keyword can be placed
    in a URI. Characters listed in ``for_uri`` use their mapped value; every
    other character goes through ``urllib.parse.quote``, which leaves ASCII
    letters, digits and "-", "_" untouched and percent-encodes the rest
    (non-ASCII text as UTF-8 bytes, control characters such as a newline).
    Return: the encoded keyword

    """

    # Local import: keeps this cell runnable without touching the import cell.
    from urllib.parse import quote

    return "".join(
        for_uri[ch] if ch in for_uri else quote(ch, safe="")
        for ch in keyword
    )

# Fetch a URL using requests
def get_html(url, timeout=10):

    """

    Params: url - address of the page to download
            timeout - seconds to wait for the server before giving up
    Downloads the page with ``requests.get`` using a browser-like user-agent.
    Return: the response text when the server answers with status 200.
            For any other status, or when the request itself fails
            (timeout, connection error, ...), the ``url`` argument is
            returned unchanged.

    """

    headers = {
       "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
    }

    try:
        # Without a timeout a stalled connection blocks the caller forever.
        resp = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Handle a failed request like a non-200 answer so that one bad
        # title does not abort a loop over thousands of pages.
        return url

    if resp.status_code == 200:
        return resp.text
    return url
def find_title_id(keyword):

    """

    Params: keyword - title of a movie or TV show
    Sends the title to the IMDb title search ("https://www.imdb.com/find")
    and reads the link of the first row of the result table.
    Return: the third "/"-separated piece of that link's href, or
            "NOT FOUND" when there is no such row or the href has fewer pieces

    """

    search_url = f"https://www.imdb.com/find?s=tt&q={to_uri(keyword)}&ref_=nv_sr_sm"
    page = BeautifulSoup(get_html(search_url), "html.parser")
    first_hits = page.select(
        "#main > div > div.findSection > table > tr:nth-of-type(1) > td.result_text > a"
    )

    try:
        return first_hits[0]["href"].split("/")[2]
    except IndexError:
        return "NOT FOUND"

- "https://www.imdb.com/?ref_=nv_home" 데이터의 ID값을 찾는 로직

In [221]:
# Map every title to the id returned by find_title_id. The dict is filled
# one entry at a time, so an interrupted run keeps what was already fetched.
id_title = {}
for title in tqdm(df.title.values):
    id_title[title] = find_title_id(title)


  0%|          | 0/8807 [00:00<?, ?it/s]

- ID값을 기반으로 평균 Rating, 세대별, 남녀별 데이터를 크롤링한 후 .CSV 파일로 저장하기

In [229]:
# CSS selectors into the IMDb ratings page.
AVG_SELECTOR = "#main > section > div > div.subpage_title_block > div > div.ipl-rating-widget > div.ipl-rating-star > span.ipl-rating-star__rating"
# Cells of the second table are addressed by row (tr) and column (td) position.
COUNT_SELECTOR = "#main > section > div > div.allText > div > table:nth-of-type(2) > tr:nth-of-type({row}) > td:nth-of-type({col}) > div.smallcell > a "

# Output column -> (row, col) of the table cell it is read from.
# Rows 2/3/4 are read for all/males/females; column 2 holds the totals and
# columns 3-6 the four age brackets.
COUNT_CELLS = {"Total": (2, 2), "Total_males": (3, 2), "Total_Females": (4, 2)}
COUNT_CELLS.update({
    f"{sex}_{age}": (row, col)
    for sex, row in (("All", 2), ("Males", 3), ("Females", 4))
    for age, col in (("below_18", 3), ("18_29", 4), ("30_44", 5), ("over_45", 6))
})


def _cell_text(soup, selector, strip=True):
    """Return the text of the first element matching ``selector``, or 0 if nothing matches."""
    node = soup.select_one(selector)
    if node is None:
        return 0
    return node.text.strip() if strip else node.text


# One list per output column; the lists grow by one entry per title.
ratings = {"Title": [], "Avg": []}
ratings.update({column: [] for column in COUNT_CELLS})

for key, id_ in tqdm(id_title.items()):

    URL = f"https://www.imdb.com/title/{id_}/ratings/?ref_=tt_ov_rt"
    soup = BeautifulSoup(get_html(URL), "html.parser")

    print("Title :", key)
    print("ID :", id_)

    ratings["Title"].append(key)
    # The average is stored exactly as it appears on the page (not stripped).
    ratings["Avg"].append(_cell_text(soup, AVG_SELECTOR, strip=False))
    for column, (row, col) in COUNT_CELLS.items():
        ratings[column].append(
            _cell_text(soup, COUNT_SELECTOR.format(row=row, col=col))
        )
    
    
    

  0%|          | 0/8807 [00:00<?, ?it/s]

Title : Dick Johnson Is Dead
ID : tt11394180
Title : Blood & Water
ID : tt9839146
Title : Ganglands
ID : tt13278100
Title : Jailbirds New Orleans
ID : tt15320436
Title : Kota Factory
ID : tt9432978
Title : Midnight Mass
ID : tt10574558
Title : My Little Pony: A New Generation
ID : tt10101702
Title : Sankofa
ID : tt0108041
Title : The Great British Baking Show
ID : tt1877368
Title : The Starling
ID : tt5164438
Title : Vendetta: Truth, Lies and The Mafia
ID : tt14216574
Title : Bangkok Breaking
ID : tt14202282
Title : Je Suis Karl
ID : tt9205538
Title : Confessions of an Invisible Girl
ID : tt15204288
Title : Crime Stories: India Detectives
ID : tt14178956
Title : Dear White People
ID : tt5707802
Title : Europe's Most Dangerous Man: Otto Skorzeny in Spain
ID : tt11564588
Title : Falsa identidad
ID : tt3602528
Title : Intrusion
ID : tt5563324
Title : Jaguar
ID : tt11698590
Title : Monsters Inside: The 24 Faces of Billy Milligan
ID : tt15287836
Title : Resurrection: Ertugrul
ID : tt4320258

In [239]:
def find_id(x):
    """Return the id stored in ``id_title`` for the title ``x`` (KeyError if it is missing)."""
    return id_title[x]

# NOTE(review): this needs a frame with a "Title" column, while the frame read
# from ./netflix_titles.csv names it "title" - presumably df was rebuilt from
# the scraped ratings in a cell that is not part of this notebook; confirm.
df["Title_id"] = df["Title"].apply(find_id)

Unnamed: 0,Title,Avg,Total,Total_males,Total_Females,All_below_18,All_18_29,All_30_44,All_over_45,Males_below_18,Males_18_29,Males_30_44,Males_over_45,Females_below_18,Females_18_29,Females_30_44,Females_over_45,Title_id
34,Tayo and Little Wizards,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,NOT FOUND
77,Little Singham - Black Shadow,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,NOT FOUND
79,Tughlaq Durbar (Telugu),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,NOT FOUND
237,Boomika (Hindi),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,NOT FOUND
238,Boomika (Malayalam),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,NOT FOUND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8350,The Human Factor: The Untold Story of the Bomb...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,NOT FOUND
8419,The Memphis Belle: A Story of a\nFlying Fortress,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,NOT FOUND
8429,The Most Beautiful Hands of Delhi,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,NOT FOUND
8774,يوم الدين,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,NOT FOUND


- .CSV파일로 반환

In [3]:
# Reload the saved ratings without the "Unnamed: 0" column
# (presumably the index that was written out with the data).
df = pd.read_csv("../data/ratings_df.csv").drop(columns="Unnamed: 0")

In [50]:
# Same table as the vote counts, but reading the "bigcell" element of each cell.
RANK_SELECTOR = "#main > section > div > div.allText > div > table:nth-of-type(2) > tr:nth-of-type({row}) > td:nth-of-type({col}) > div.bigcell"

# Output column -> (row, col) of the table cell it is read from.
# Rows 2/3/4 are read for all/males/females, columns 3-6 for the age brackets.
RANK_CELLS = {
    f"{sex}_rank_{age}": (row, col)
    for sex, row in (("All", 2), ("Males", 3), ("Females", 4))
    for age, col in (("below_18", 3), ("18_29", 4), ("30_44", 5), ("over_45", 6))
}

# One list per output column; the lists grow by one entry per id.
rank_ratings = {column: [] for column in RANK_CELLS}

# Iterating over the id column directly lets tqdm show the total.
for id_ in tqdm(df["Title_ID"].values):

    URL = f"https://www.imdb.com/title/{id_}/ratings/?ref_=tt_ov_rt"
    soup = BeautifulSoup(get_html(URL), "html.parser")

    for column, (row, col) in RANK_CELLS.items():
        cell = soup.select_one(RANK_SELECTOR.format(row=row, col=col))
        # A cell that is missing from the page is recorded as 0.
        rank_ratings[column].append(cell.text.strip() if cell is not None else 0)

0it [00:00, ?it/s]

In [51]:
# Turn the collected lists into a frame (one column per key of rank_ratings)
# and write it to only_ratings.csv in the working directory, without the index.
rat_df = pd.DataFrame(rank_ratings)
rat_df.to_csv("only_ratings.csv", index=False)

In [54]:
ratings_only = pd.read_csv("../data/ratings_df.csv")

In [60]:
# Place the two frames side by side (column-wise) and drop the "Unnamed: 0"
# column that came in with ratings_only.
final_ratings = pd.concat([ratings_only, rat_df], axis=1).drop(columns="Unnamed: 0")


In [62]:
final_ratings.to_csv("final_ratings.csv", index=False)

Unnamed: 0,All_rank_below_18,All_rank_18_29,All_rank_30_44,All_rank_over_45,Males_rank_below_18,Males_rank_18_29,Males_rank_30_44,Males_rank_over_45,Females_rank_below_18,Females_rank_18_29,Females_rank_30_44,Females_rank_over_45
0,7.6,7.7,7.4,7.1,7.9,7.7,7.4,7.1,8.0,7.8,7.4,7.3
1,5.7,6.9,6.5,5.8,5.5,6.6,5.9,4.9,5.0,6.9,7.0,7.0
2,6.0,6.8,6.9,7.0,6.0,6.9,6.8,7.0,-,5.8,6.9,7.0
3,-,6.3,6.6,6.8,-,6.4,6.3,6.6,-,6.2,6.9,6.9
4,9.1,9.2,8.5,6.3,9.6,9.2,8.5,6.4,4.3,9.2,7.5,5.7
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,8.1,7.9,7.6,7.5,8.2,7.9,7.7,7.5,7.6,7.7,7.4,7.5
8803,-,3.5,8.0,10.0,-,3.5,8.0,-,-,-,-,10.0
8804,7.7,7.7,7.6,7.4,7.7,7.7,7.6,7.4,7.2,7.6,7.6,7.6
8805,3.1,4.3,4.1,4.7,3.1,4.0,3.9,4.6,-,5.2,4.9,5.5


### final_ratings 간단한 전처리

In [69]:
df = pd.read_csv("../data/final_ratings.csv")

# Remove the "," separators so the columns can be converted to float.
count_cols = [
    "Total", "Total_males", "Total_Females",
    "All_below_18", "All_18_29", "All_30_44", "All_over_45",
    "Males_below_18", "Males_18_29", "Males_30_44", "Males_over_45",
    "Females_below_18", "Females_18_29", "Females_30_44", "Females_over_45",
]

for col in count_cols:
    # astype(str) first: a column without any "," may already have been parsed
    # as numbers, and calling x.replace on a number raises AttributeError
    # (presumably why Females_below_18 used to be skipped here).
    df[col] = df[col].astype(str).str.replace(",", "", regex=False)



In [75]:
 cols = ['All_rank_below_18', 'All_rank_18_29', 'All_rank_30_44',
       'All_rank_over_45', 'Males_rank_below_18', 'Males_rank_18_29',
       'Males_rank_30_44', 'Males_rank_over_45', 'Females_rank_below_18',
       'Females_rank_18_29', 'Females_rank_30_44', 'Females_rank_over_45']

def removed(x):
    """Return 0 when ``x`` equals the "-" placeholder, otherwise ``x`` unchanged."""
    return 0 if x == "-" else x

# Run removed() over every value of each column listed in cols, so that the
# "-" placeholder becomes 0 and all other values stay as they are.
for col in cols:
    df[col] = df[col].apply(removed)

In [78]:
# Columns to convert: the average, the vote-count columns and the *_rank_* columns.
cols = ['Avg', 'Total', 'Total_males', 'Total_Females', 'All_below_18',
       'All_18_29', 'All_30_44', 'All_over_45', 'Males_below_18',
       'Males_18_29', 'Males_30_44', 'Males_over_45', 'Females_below_18',
       'Females_18_29', 'Females_30_44', 'Females_over_45', 
       'All_rank_below_18', 'All_rank_18_29', 'All_rank_30_44',
       'All_rank_over_45', 'Males_rank_below_18', 'Males_rank_18_29',
       'Males_rank_30_44', 'Males_rank_over_45', 'Females_rank_below_18',
       'Females_rank_18_29', 'Females_rank_30_44', 'Females_rank_over_45']

# Cast all of them to 32-bit floats in one assignment.
# NOTE: the CSV written afterwards does not keep this dtype - the frame read
# back from final_netflix_df.csv reports these columns as float64.
df[cols] = df[cols].astype(np.float32)

In [97]:
df.to_csv("../data/final_netflix_df.csv", index= False)

In [98]:
pd.read_csv("../data/final_netflix_df.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Title                  8807 non-null   object 
 1   Avg                    8807 non-null   float64
 2   Total                  8807 non-null   float64
 3   Total_males            8807 non-null   float64
 4   Total_Females          8807 non-null   float64
 5   All_below_18           8807 non-null   float64
 6   All_18_29              8807 non-null   float64
 7   All_30_44              8807 non-null   float64
 8   All_over_45            8807 non-null   float64
 9   Males_below_18         8807 non-null   float64
 10  Males_18_29            8807 non-null   float64
 11  Males_30_44            8807 non-null   float64
 12  Males_over_45          8807 non-null   float64
 13  Females_below_18       8807 non-null   float64
 14  Females_18_29          8807 non-null   float64
 15  Fema