In [None]:
import numpy as np
import pandas as pd

# Clean the Netflix titles CSV: fill missing text fields, drop rows whose
# release year / duration are missing or out of range, normalise text columns,
# and write the result to a separate file so the raw data is never overwritten.

# Earliest release year that is kept.
# NOTE(review): the previous filter was `release_year < 100`, which rejects
# every four-digit year and emptied the dataset. 1900 is an assumed lower
# bound for a plausible release year -- confirm the intended cut-off.
release_year_threshold = 1900

# Largest duration value that is kept (compared against the leading number
# found in the `duration` column).
duration_threshold = 120

# Text columns whose missing entries are replaced with 'Unknown'.
string_columns = ['director', 'country', 'listed_in', 'show_id', 'type', 'title', 'date_added', 'rating']


def _to_number(series):
    """Return *series* as numbers, parsing the first number out of text values.

    Numeric series are returned unchanged. For anything else each value is
    converted to text and its first (optionally signed, optionally decimal)
    number is extracted; values without a number become NaN.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series
    extracted = series.astype(str).str.extract(r'(-?\d+(?:\.\d+)?)', expand=False)
    return pd.to_numeric(extracted, errors='coerce')


def clean_dataset(df, release_year_min=release_year_threshold, duration_max=duration_threshold):
    """Return a cleaned copy of *df*; the input frame is not modified.

    Args:
        df: DataFrame containing the columns in ``string_columns`` plus
            ``release_year`` and ``duration``.
        release_year_min: rows with a release year below this are dropped.
        duration_max: rows whose duration number exceeds this are dropped.

    Returns:
        A new DataFrame with missing text filled with 'Unknown', rows with
        other missing values removed, out-of-range rows removed, text columns
        normalised and ``date_added`` parsed to datetime (unparseable -> NaT).
    """
    df = df.copy()

    # Fill text columns BEFORE dropping NaNs; otherwise dropna() removes the
    # very rows the fill is meant to keep and the fill becomes a no-op.
    df[string_columns] = df[string_columns].fillna('Unknown')
    df = df.dropna()  # only rows missing a non-text value remain to be dropped

    # Range filters. Values are parsed to numbers only for the comparison so
    # text such as "90 min" does not raise a TypeError; the stored column
    # values are left as they were. NaN compares False, so unparseable rows
    # are dropped.
    df = df[_to_number(df['release_year']) >= release_year_min]
    # .copy() so the column assignments below act on an independent frame
    # instead of a slice (avoids SettingWithCopyWarning).
    df = df[_to_number(df['duration']) <= duration_max].copy()

    # Trim surrounding whitespace and lower-case the identifying text columns.
    for column in ('director', 'country', 'show_id', 'type', 'title'):
        df[column] = df[column].str.strip().str.lower()

    # Commas are removed (not replaced), so multiple genres end up separated
    # by spaces only.
    df['listed_in'] = df['listed_in'].str.replace(',', '').str.lower()

    # Unparseable dates, including the 'Unknown' placeholder, become NaT.
    df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

    df['rating'] = df['rating'].str.upper()

    return df


# True both when run as a script and inside a notebook cell, so `df`,
# `file_path` and `cleaned_file_path` remain available to later cells, while
# importing this code does not touch the filesystem.
if __name__ == "__main__":
    # Loading the dataset from the local file
    file_path = '/content/sample_data/netlfix.csv'
    df = pd.read_csv(file_path)

    # Displaying basic information about the dataset.
    # DataFrame.info() prints by itself and returns None, so it is not
    # wrapped in print() (that emitted a stray "None" line).
    print("Original Dataset Info:")
    df.info()

    df = clean_dataset(df)

    # Displaying basic information about the cleaned dataset
    print("\nCleaned Dataset Info:")
    df.info()

    # Save the cleaned dataset to a NEW CSV file. The previous code wrote to
    # the input path, overwriting the raw data on the first run.
    cleaned_file_path = "/content/sample_data/netlfix_cleaned.csv"
    df.to_csv(cleaned_file_path, index=False)

    print("Dataset cleaning and preprocessing completed.")


Original Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       0 non-null      object
 1   type          0 non-null      object
 2   title         0 non-null      object
 3   director      0 non-null      object
 4   country       0 non-null      object
 5   date_added    0 non-null      object
 6   release_year  0 non-null      object
 7   rating        0 non-null      object
 8   duration      0 non-null      object
 9   listed_in     0 non-null      object
dtypes: object(10)
memory usage: 0.0+ bytes
None

Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       0 non-null      object        
 1   type          0 non-null      object        
 2   title         0 non-