# My Own Code for Scraping/Cleaning Netflix Data

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time


In [None]:
# Login credentials.
# Read from the environment so real secrets never have to be typed into (and
# committed with) this notebook. When the variables are unset, the original
# "x" placeholders are used, so behaviour is unchanged by default.
import os

mail = os.environ.get("NETFLIX_EMAIL", "x")
pw = os.environ.get("NETFLIX_PASSWORD", "x")

# Start a Chrome session and open the login page.
driver = webdriver.Chrome()

driver.get("https://www.netflix.com/tr-en/login")

In [None]:
# One shared explicit wait: allow up to 10 seconds for each login field to
# become clickable.
wait = WebDriverWait(driver, 10)

email_locator = (By.XPATH, '//input[@name="userLoginId"]')
password_locator = (By.XPATH, '//input[@name="password"]')

email_input = wait.until(EC.element_to_be_clickable(email_locator))
password_input = wait.until(EC.element_to_be_clickable(password_locator))

# Type the e-mail first, then the password, and submit via the password field.
for field, value in ((email_input, mail), (password_input, pw)):
    field.send_keys(value)
password_input.submit()

In [None]:

def select_profile(driver, profile_name):
    """Click the profile labelled *profile_name* in the profile list.

    Waits up to 10 seconds for the profile ``<ul>`` to be present, prints
    every profile label found (kept as a debugging aid), then clicks the
    first label whose stripped text equals ``profile_name``.

    Args:
        driver: Selenium WebDriver; expected to be on the profile-picker page.
        profile_name: Exact, case-sensitive profile label to select.

    Returns:
        True if a matching profile was clicked; False if the profile list
        did not appear in time or no label matched. Earlier versions
        returned None in every case, so callers that ignore the result are
        unaffected.
    """
    # Only the explicit wait can time out, so keep the try body to that call.
    try:
        profile_selector = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="appMountPoint"]/div/div/div/div[1]/div[1]/div[2]/div/div/ul'))
        )
    except TimeoutException:
        print("Failed to find the profile list or the profile selector.")
        return False

    profile_elements = profile_selector.find_elements(By.XPATH, './/li//a//span')

    # Each `.text` access is a round-trip to the browser; read it once per
    # element and reuse it for both the listing and the comparison.
    labelled = [(element, element.text) for element in profile_elements]

    # Debugging aid: shows which labels were actually scraped.
    print("Available profiles:")
    for _, label in labelled:
        print(label)

    for element, label in labelled:
        if label.strip() == profile_name:
            element.click()
            print(f"Selected profile: {profile_name}")
            return True

    print(f"Profile '{profile_name}' not found.")
    return False

In [None]:
# Choose the profile, then open that profile's viewing-activity page.
profilename = "x"
select_profile(driver, profilename)

# Fixed 7-second pause so the profile switch can finish on a slow connection.
time.sleep(7)

# Base URL of the activity page; the profile value is appended as the `p` parameter.
viewActivityPage = "https://www.netflix.com/viewingactivity?u=0&p="
driver.get(f"{viewActivityPage}{profilename}")

# Block (up to 5 seconds) until the history list element is present in the DOM.
history_list_locator = (By.XPATH, '/html/body/div[1]/div/div/div/div[2]/div/div/ul')
history_list = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located(history_list_locator)
)


In [None]:
def download_file(driver):
    """Click the viewing-activity download link, then close the browser.

    Waits up to 10 seconds for the download link to become clickable and
    clicks it. Afterwards it waits up to 5 more seconds for the link to go
    stale; if it never does, that wait simply acts as a short grace period
    before the session is closed.

    The driver is always quit before returning, including when the link is
    not found or an unexpected error is raised.

    Args:
        driver: Selenium WebDriver; expected to be on the viewing-activity page.
    """
    download_button_xpath = "/html/body/div[1]/div/div/div/div[2]/div/div/div[2]/div[2]/a[2]"

    try:
        try:
            download_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, download_button_xpath))
            )
        except TimeoutException:
            print("Download button not found or clickable.")
            return

        download_button.click()
        print("Download started.")

        # The staleness wait is kept separate from the lookup above: a
        # timeout here only means the link stayed attached to the page, not
        # that the button was missing, so it must not report a lookup failure.
        try:
            WebDriverWait(driver, 5).until(EC.staleness_of(download_button))
        except TimeoutException:
            pass
    finally:
        # Single cleanup point; also runs on unexpected exceptions.
        driver.quit()

# Trigger the export; download_file also quits the driver when it finishes.
download_file(driver)


----
Second Phase - Cleaning the data

In [None]:
import pandas as pd
import time

In [None]:

# Read the exported viewing history.
csv_file_path = 'NetflixViewingHistory.csv'
netflix_data = pd.read_csv(csv_file_path)

# Split 'Title' into three parts:
#   "<series>: Season <n>: <episode>"  -> Series, Season, Episode
#   "<series>: <rest>"                 -> Series, NaN,    Episode
# Titles without a ": " separator do not match and give NaN in all columns.
split_columns = netflix_data['Title'].str.extract(r'^(.*?)(?:: Season (\d+))?: (.*)$')
split_columns.columns = ['Series', 'Season', 'Episode']

# Fill the gaps by assigning the result back. Calling
# `df[col].fillna(..., inplace=True)` works on an intermediate Series: it
# warns on recent pandas and leaves the frame untouched under Copy-on-Write.

# Unmatched titles keep their full original title as the series name.
split_columns['Series'] = split_columns['Series'].fillna(netflix_data['Title'])

# Anything without a season / episode part is labelled 'Movie'.
split_columns['Season'] = split_columns['Season'].fillna('Movie')
split_columns['Episode'] = split_columns['Episode'].fillna('Movie')

# 'Season' holds a small set of repeated labels, so store it as a category.
split_columns['Season'] = split_columns['Season'].astype('category')

# Replace the raw 'Title' column with the three derived columns.
netflix_data_cleaned = pd.concat([netflix_data.drop(columns=['Title']), split_columns], axis=1)

# cleaned dataframe (displayed as the cell output)
netflix_data_cleaned
