<a href="https://colab.research.google.com/github/kaushikykk/scaler/blob/main/Netflixnewforkaushik.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ace_tools as tools
from google.colab import files
# Open Colab's file-upload dialog; the returned mapping is kept in `uploaded`.
uploaded = files.upload()
# Load the dataset from the working directory.
# NOTE(review): presumably this is the file uploaded on the previous line - confirm the name matches.
df = pd.read_csv('netflix_titles.csv')
# Last expression of the cell: shows the first rows as the cell output.
df.head()

In [None]:
import pandas as pd
from IPython.display import display

class DataFrameOps:
    """Loads a CSV/Excel file into ``self.df`` and reports basic data-quality facts."""

    def __init__(self, file_path):
        """
        Initializes the class with a CSV or Excel file and creates a DataFrame.

        :param file_path: str, file path to the CSV or Excel file.
        :raises ValueError: if the path does not end in ``.csv`` or ``.xlsx``.
        """
        # Compare case-insensitively so that e.g. "DATA.CSV" is accepted too.
        normalized_path = file_path.lower()
        if normalized_path.endswith('.csv'):
            self.df = pd.read_csv(file_path)
        elif normalized_path.endswith('.xlsx'):
            self.df = pd.read_excel(file_path)
        else:
            raise ValueError("File must be either a CSV or Excel file.")

    def _standardize_datetime_columns(self):
        """
        Converts text columns to datetime, but only when every non-null value parses.

        Blindly coercing every object column would replace ordinary text columns
        (titles, names, ...) with NaT, so a column is converted only when the
        conversion loses nothing.
        """
        import warnings

        for col in self.df.select_dtypes(include=['object']).columns:
            original = self.df[col]
            has_value = original.notna()
            if not has_value.any():
                continue  # Nothing to parse in an all-null column
            if not all(isinstance(value, str) for value in original[has_value]):
                continue  # Only textual columns are candidates for date parsing
            try:
                with warnings.catch_warnings():
                    # Probing non-date text makes pandas warn about format inference.
                    warnings.simplefilter('ignore', UserWarning)
                    converted = pd.to_datetime(original, errors='coerce')
            except (ValueError, TypeError, OverflowError):
                continue  # Not parseable as dates; leave the column untouched
            if converted[has_value].notna().all():
                self.df[col] = converted

    def summarize(self):
        """
        Summarizes the DataFrame by providing:
        - Size of the DataFrame (rows, columns)
        - Columns with no missing values
        - Columns where all values are unique (no duplicates)
        - Standardizes datetime columns to a common format

        Side effect: text columns whose non-null values all parse as dates are
        converted to datetime in ``self.df``; every other column is left as is.

        :return: dict with the keys 'Size', 'Number of Rows', 'Number of Columns',
                 'Columns with No Nulls' and 'Columns with No Duplicates'.
        """
        # Get DataFrame shape
        num_rows, num_columns = self.df.shape

        # Identify columns with no null values
        no_null_cols = [col for col in self.df.columns if not self.df[col].isna().any()]

        # Identify columns where all values are unique (nunique ignores nulls, so a
        # column containing a null never qualifies)
        no_duplicates_cols = [col for col in self.df.columns if self.df[col].nunique() == num_rows]

        # Convert genuine date columns only; see the helper for the safety rule
        self._standardize_datetime_columns()

        return {
            'Size': (num_rows, num_columns),
            'Number of Rows': num_rows,
            'Number of Columns': num_columns,
            'Columns with No Nulls': no_null_cols,
            'Columns with No Duplicates': no_duplicates_cols
        }

    def generate_metadata(self):
        """
        Generates metadata of the DataFrame, including:
        - Data type of each column
        - Whether the column has null values
        - Whether the column has duplicate values

        :return: DataFrame with one row per column of ``self.df`` and the columns
                 'Column', 'Data Type', 'Has Nulls' and 'Has Duplicates'.
        """
        metadata = pd.DataFrame({
            'Column': self.df.columns,
            'Data Type': [str(self.df[col].dtype) for col in self.df.columns],
            'Has Nulls': ['Yes' if self.df[col].isna().any() else 'No' for col in self.df.columns],
            'Has Duplicates': ['Yes' if self.df[col].duplicated().any() else 'No' for col in self.df.columns]
        })

        return metadata


# Path of the dataset to analyse (CSV or Excel)
file_path = "netflix_titles.csv"  # Provide the correct path to your CSV or Excel file

# Load the file into a DataFrameOps wrapper
df_ops = DataFrameOps(file_path)

# Per-column metadata table
metadata_df = df_ops.generate_metadata()
display(metadata_df)

# Summary dictionary, flattened so every value renders cleanly in a table:
# the shape tuple becomes text and the column-name lists become comma-joined text.
summary_data = df_ops.summarize()
summary_data_cleaned = dict(summary_data)
summary_data_cleaned['Size'] = str(summary_data['Size'])
summary_data_cleaned['Columns with No Nulls'] = ', '.join(summary_data['Columns with No Nulls'])
summary_data_cleaned['Columns with No Duplicates'] = ', '.join(summary_data['Columns with No Duplicates'])

# Two-column (Metric, Value) table built from the flattened summary
summary_df = pd.DataFrame(
    [(metric, value) for metric, value in summary_data_cleaned.items()],
    columns=['Metric', 'Value'],
)

# Show the summary table
display(summary_df)



In [None]:
import pandas as pd
from IPython.display import display

class DataFrameOps:
    """Loads a CSV/Excel file into ``self.df``; summarizes it and explodes multi-valued columns."""

    def __init__(self, file_path):
        """
        Initializes the class with a CSV or Excel file and creates a DataFrame.

        :param file_path: str, file path to the CSV or Excel file.
        :raises ValueError: if the path does not end in ``.csv`` or ``.xlsx``.
        """
        # Compare case-insensitively so that e.g. "DATA.CSV" is accepted too.
        normalized_path = file_path.lower()
        if normalized_path.endswith('.csv'):
            self.df = pd.read_csv(file_path)
        elif normalized_path.endswith('.xlsx'):
            self.df = pd.read_excel(file_path)
        else:
            raise ValueError("File must be either a CSV or Excel file.")

    def _standardize_datetime_columns(self):
        """
        Converts text columns to datetime, but only when every non-null value parses.

        Blindly coercing every object column would replace ordinary text columns
        (titles, names, ...) with NaT, so a column is converted only when the
        conversion loses nothing.
        """
        import warnings

        for col in self.df.select_dtypes(include=['object']).columns:
            original = self.df[col]
            has_value = original.notna()
            if not has_value.any():
                continue  # Nothing to parse in an all-null column
            if not all(isinstance(value, str) for value in original[has_value]):
                continue  # Only textual columns are candidates for date parsing
            try:
                with warnings.catch_warnings():
                    # Probing non-date text makes pandas warn about format inference.
                    warnings.simplefilter('ignore', UserWarning)
                    converted = pd.to_datetime(original, errors='coerce')
            except (ValueError, TypeError, OverflowError):
                continue  # Not parseable as dates; leave the column untouched
            if converted[has_value].notna().all():
                self.df[col] = converted

    def summarize(self):
        """
        Summarizes the DataFrame by providing:
        - Size of the DataFrame (rows, columns)
        - Columns with no missing values
        - Columns where all values are unique (no duplicates)
        - Standardizes datetime columns to a common format

        Side effect: text columns whose non-null values all parse as dates are
        converted to datetime in ``self.df``; every other column is left as is.

        :return: dict with the keys 'Size', 'Number of Rows', 'Number of Columns',
                 'Columns with No Nulls' and 'Columns with No Duplicates'.
        """
        # Get DataFrame shape
        num_rows, num_columns = self.df.shape

        # Identify columns with no null values
        no_null_cols = [col for col in self.df.columns if not self.df[col].isna().any()]

        # Identify columns where all values are unique (nunique ignores nulls, so a
        # column containing a null never qualifies)
        no_duplicates_cols = [col for col in self.df.columns if self.df[col].nunique() == num_rows]

        # Convert genuine date columns only; see the helper for the safety rule
        self._standardize_datetime_columns()

        return {
            'Size': (num_rows, num_columns),
            'Number of Rows': num_rows,
            'Number of Columns': num_columns,
            'Columns with No Nulls': no_null_cols,
            'Columns with No Duplicates': no_duplicates_cols
        }

    def generate_metadata(self):
        """
        Generates metadata of the DataFrame, including:
        - Data type of each column
        - Whether the column has null values
        - Whether the column has duplicate values

        :return: DataFrame with one row per column of ``self.df`` and the columns
                 'Column', 'Data Type', 'Has Nulls' and 'Has Duplicates'.
        """
        metadata = pd.DataFrame({
            'Column': self.df.columns,
            'Data Type': [str(self.df[col].dtype) for col in self.df.columns],
            'Has Nulls': ['Yes' if self.df[col].isna().any() else 'No' for col in self.df.columns],
            'Has Duplicates': ['Yes' if self.df[col].duplicated().any() else 'No' for col in self.df.columns]
        })

        return metadata

    @staticmethod
    def _split_values(value):
        """Returns the entries of one cell as a list; ``[None]`` when the cell is empty."""
        if isinstance(value, str):
            parts = [part.strip() for part in value.split(',')]
            parts = [part for part in parts if part]
            return parts or [None]
        if isinstance(value, (list, tuple)):
            return list(value) or [None]
        return [None]  # NaN / None / any other scalar

    def explode_columns(self, columns_to_explode):
        """
        Explodes the specified list of columns while ensuring index integrity.

        Comma-separated strings are split into their entries first (exploding a
        plain string is a no-op). When several columns are given, the lists of
        each row are padded with None to a common length, so entries are aligned
        by position - this is not a cross product.

        :param columns_to_explode: list, columns that need to be exploded
                                   (a single column name is accepted as well).
        :return: DataFrame with exploded columns; it also replaces ``self.df``.
        :raises ValueError: if a requested column is not present.
        """
        if isinstance(columns_to_explode, str):
            columns_to_explode = [columns_to_explode]
        columns = list(columns_to_explode)

        if not all(col in self.df.columns for col in columns):
            raise ValueError("One or more specified columns are not present in the DataFrame.")
        if not columns:
            return self.df  # Nothing to explode

        # Split every cell, then pad each row's lists to the row-wise maximum,
        # because a multi-column explode requires matching element counts.
        split_cells = {col: [self._split_values(value) for value in self.df[col]] for col in columns}
        row_lengths = [max(lengths) for lengths in zip(*(map(len, split_cells[col]) for col in columns))]
        for col in columns:
            padded = [
                cells + [None] * (target - len(cells))
                for cells, target in zip(split_cells[col], row_lengths)
            ]
            self.df[col] = pd.Series(padded, index=self.df.index, dtype=object)

        # Explode multiple columns together and reset the index
        self.df = self.df.explode(columns).reset_index(drop=True)

        return self.df

# Example Usage

# Replace with your actual file path
file_path = "netflix_titles.csv"  # Provide the correct path to your CSV or Excel file

# Instantiate the class
df_ops = DataFrameOps(file_path)

# List of columns to explode. This must be defined BEFORE the explode call below;
# previously it was only assigned at the end of the cell, which raised NameError
# on a fresh kernel.
columns_to_explode = ['cast']  # Change as per your dataset

# Explode columns (once - a second call on the same object adds nothing)
df_exploded = df_ops.explode_columns(columns_to_explode)
# Kept so that anything reading this older name still finds the exploded frame
explode_columns = df_exploded

# Display exploded DataFrame
display(df_exploded)

# Generate metadata and display it as a table
metadata_df = df_ops.generate_metadata()
display(metadata_df)

# Generate summary data and clean it for display
summary_data = df_ops.summarize()
summary_data_cleaned = {
    'Size': str(summary_data['Size']),
    'Number of Rows': summary_data['Number of Rows'],
    'Number of Columns': summary_data['Number of Columns'],
    'Columns with No Nulls': ', '.join(summary_data['Columns with No Nulls']),
    'Columns with No Duplicates': ', '.join(summary_data['Columns with No Duplicates'])
}

# Creating a DataFrame from the cleaned summary data (not displayed in this cell)
summary_df = pd.DataFrame(list(summary_data_cleaned.items()), columns=['Metric', 'Value'])



In [None]:
def explode_columns(self, columns_to_explode):
    """
    Explode the requested columns of ``self.df`` and renumber the rows.

    :param columns_to_explode: list, names of the columns to explode.
    :return: the exploded DataFrame, which also replaces ``self.df``.
    :raises ValueError: if any requested column is absent from ``self.df``.
    """
    available = self.df.columns
    if any(name not in available for name in columns_to_explode):
        raise ValueError("One or more specified columns are not present in the DataFrame.")

    # Explode all requested columns in one call, then rebuild a clean 0..n-1 index
    exploded = self.df.explode(columns_to_explode)
    self.df = exploded.reset_index(drop=True)
    return self.df
# Columns to explode in this cell
columns_to_explode = ['cast' ]  # Change as per your dataset
# Calls the method on the existing `df_ops` object. Note that the assignment rebinds the
# module-level name `explode_columns` (the function defined just above) to the returned value.
explode_columns = df_ops.explode_columns(columns_to_explode)
# Show the result as the cell output
display(explode_columns)

In [None]:
# 'cast' is loaded from the CSV as one comma-separated string per title. Exploding
# that string directly is a no-op, so split it into a list of names first.
df_exploded = (
    df.assign(cast=df['cast'].str.split(', '))
    .explode('cast')
    .reset_index(drop=True)
)
df_exploded


In [None]:
import pandas as pd

class DataFrameOps:
    """Loads a CSV/Excel file into ``self.df`` and explodes multi-valued columns."""

    def __init__(self, file_path):
        """
        Initializes the class with a CSV or Excel file and creates a DataFrame.

        :param file_path: str, file path to the CSV or Excel file.
        :raises ValueError: if the path does not end in ``.csv`` or ``.xlsx``.
        """
        # Compare case-insensitively so that e.g. "DATA.CSV" is accepted too.
        normalized_path = file_path.lower()
        if normalized_path.endswith('.csv'):
            self.df = pd.read_csv(file_path)
        elif normalized_path.endswith('.xlsx'):
            self.df = pd.read_excel(file_path)
        else:
            raise ValueError("File must be either a CSV or Excel file.")

    @staticmethod
    def _split_values(value):
        """Returns the entries of one cell as a list; ``[None]`` when the cell is empty."""
        if isinstance(value, str):
            parts = [part.strip() for part in value.split(',')]
            parts = [part for part in parts if part]
            return parts or [None]
        if isinstance(value, (list, tuple)):
            return list(value) or [None]
        return [None]  # NaN / None / any other scalar

    def explode_columns(self, columns_to_explode):
        """
        Explodes the specified list of columns while ensuring non-exploded columns remain intact.

        Comma-separated strings are split into their entries. When several columns
        are given, the lists of each row are padded with None to a common length
        (a multi-column explode requires matching element counts), so entries are
        aligned by position - this is not a cross product.

        Non-exploded columns are simply repeated by the explode. They are NOT
        forward-filled: that would copy values from the previous title into
        cells that are genuinely missing.

        :param columns_to_explode: list of columns to be exploded
                                   (a single column name is accepted as well).
        :return: DataFrame with exploded columns; it also replaces ``self.df``.
        :raises ValueError: if a requested column is not present.
        """
        if isinstance(columns_to_explode, str):
            columns_to_explode = [columns_to_explode]
        columns = list(columns_to_explode)

        if not all(col in self.df.columns for col in columns):
            raise ValueError("One or more specified columns are not present in the DataFrame.")
        if not columns:
            return self.df  # Nothing to explode

        # Convert specified columns to lists, then pad to the row-wise maximum length
        split_cells = {col: [self._split_values(value) for value in self.df[col]] for col in columns}
        row_lengths = [max(lengths) for lengths in zip(*(map(len, split_cells[col]) for col in columns))]
        for col in columns:
            padded = [
                cells + [None] * (target - len(cells))
                for cells, target in zip(split_cells[col], row_lengths)
            ]
            self.df[col] = pd.Series(padded, index=self.df.index, dtype=object)

        # Explode multiple columns simultaneously and reset index
        self.df = self.df.explode(columns).reset_index(drop=True)

        return self.df

# Example Usage

# Path of the dataset. An empty string can never pass the extension check in
# DataFrameOps.__init__, so point at the dataset the rest of the notebook uses.
file_path = "netflix_titles.csv"  # Provide the correct path to your CSV or Excel file

# Instantiate the class
df_ops = DataFrameOps(file_path)

# List of columns to explode
columns_to_explode = ['director', 'cast']  # Change as per your dataset

# Explode columns
df_exploded = df_ops.explode_columns(columns_to_explode)

# Display exploded DataFrame. `ace_tools` is not an installable package, so fall
# back to the regular notebook display when it cannot be imported.
try:
    import ace_tools as tools
except ImportError:
    from IPython.display import display
    display(df_exploded)
else:
    tools.display_dataframe_to_user(name="Exploded DataFrame", dataframe=df_exploded)


In [None]:
# Import necessary libraries
import pandas as pd

class DataFrameOps:
    """Loads a CSV/Excel file into ``self.df`` and explodes multi-valued columns."""

    def __init__(self, file_path):
        """
        Initializes the class with a CSV or Excel file and creates a DataFrame.

        :param file_path: str, file path to the CSV or Excel file.
        :raises ValueError: if the path does not end in ``.csv`` or ``.xlsx``.
        """
        # Compare case-insensitively so that e.g. "DATA.CSV" is accepted too.
        normalized_path = file_path.lower()
        if normalized_path.endswith('.csv'):
            self.df = pd.read_csv(file_path)
        elif normalized_path.endswith('.xlsx'):
            self.df = pd.read_excel(file_path)
        else:
            raise ValueError("File must be either a CSV or Excel file.")

    @staticmethod
    def _split_values(value):
        """Returns the entries of one cell as a list; ``[None]`` when the cell is empty."""
        if isinstance(value, str):
            parts = [part.strip() for part in value.split(',')]
            parts = [part for part in parts if part]
            return parts or [None]
        if isinstance(value, (list, tuple)):
            return list(value) or [None]
        return [None]  # NaN / None / any other scalar

    def explode_columns(self, columns_to_explode):
        """
        Explodes the specified list of columns while ensuring non-exploded columns remain intact.

        Comma-separated strings are split into their entries. When several columns
        are given, the lists of each row are padded with None to a common length
        (a multi-column explode requires matching element counts), so entries are
        aligned by position - this is not a cross product.

        Non-exploded columns are simply repeated by the explode. They are NOT
        forward-filled: that would copy values from the previous title into
        cells that are genuinely missing.

        :param columns_to_explode: list of columns to be exploded
                                   (a single column name is accepted as well).
        :return: DataFrame with exploded columns; it also replaces ``self.df``.
        :raises ValueError: if a requested column is not present.
        """
        if isinstance(columns_to_explode, str):
            columns_to_explode = [columns_to_explode]
        columns = list(columns_to_explode)

        if not all(col in self.df.columns for col in columns):
            raise ValueError("One or more specified columns are not present in the DataFrame.")
        if not columns:
            return self.df  # Nothing to explode

        # Convert specified columns to lists, then pad to the row-wise maximum length
        split_cells = {col: [self._split_values(value) for value in self.df[col]] for col in columns}
        row_lengths = [max(lengths) for lengths in zip(*(map(len, split_cells[col]) for col in columns))]
        for col in columns:
            padded = [
                cells + [None] * (target - len(cells))
                for cells, target in zip(split_cells[col], row_lengths)
            ]
            self.df[col] = pd.Series(padded, index=self.df.index, dtype=object)

        # Explode multiple columns simultaneously and reset index
        self.df = self.df.explode(columns).reset_index(drop=True)

        return self.df

# Dataset file read by DataFrameOps (the '.csv' suffix selects the CSV reader)
file_path = "netflix_titles.csv"

# Load the file; the resulting DataFrame is held in df_ops.df
df_ops = DataFrameOps(file_path)

# Multi-valued columns to split into one row per entry
columns_to_explode = ['director', 'cast']  # Adjust as needed

# explode_columns replaces df_ops.df with the exploded frame and returns it
df_exploded = df_ops.explode_columns(columns_to_explode)

# Last expression of the cell: preview the exploded frame
df_exploded.head()  # Displaying only the first few rows for verification


In [None]:
import pandas as pd

class DataFrameOps:
    """Loads a CSV/Excel file into ``self.df`` and explodes multi-valued columns."""

    def __init__(self, file_path):
        """
        Initializes the class with a CSV or Excel file and creates a DataFrame.

        :param file_path: str, file path to the CSV or Excel file.
        :raises ValueError: if the path does not end in ``.csv`` or ``.xlsx``.
        """
        # Compare case-insensitively so that e.g. "DATA.CSV" is accepted too.
        normalized_path = file_path.lower()
        if normalized_path.endswith('.csv'):
            self.df = pd.read_csv(file_path)
        elif normalized_path.endswith('.xlsx'):
            self.df = pd.read_excel(file_path)
        else:
            raise ValueError("File must be either a CSV or Excel file.")

    @staticmethod
    def _split_values(value):
        """Returns the entries of one cell as a list; ``[None]`` when the cell is empty."""
        if isinstance(value, str):
            parts = [part.strip() for part in value.split(',')]
            parts = [part for part in parts if part]
            return parts or [None]
        if isinstance(value, (list, tuple)):
            return list(value) or [None]
        return [None]  # NaN / None / any other scalar

    def explode_columns(self, columns_to_explode):
        """
        Explodes the specified list of columns while ensuring non-exploded columns remain intact.

        Comma-separated strings are split into their entries. When several columns
        are given, the lists of each row are padded with None to a common length
        (a multi-column explode requires matching element counts), so entries are
        aligned by position - this is not a cross product.

        The padding is computed from plain Python lists: ``Series.apply`` hands the
        callback the cell value itself, which carries no row label to look up.

        Non-exploded columns are simply repeated by the explode. They are NOT
        forward-filled: that would copy values from the previous title into
        cells that are genuinely missing.

        :param columns_to_explode: list of columns to be exploded
                                   (a single column name is accepted as well).
        :return: DataFrame with exploded columns; it also replaces ``self.df``.
        :raises ValueError: if a requested column is not present.
        """
        if isinstance(columns_to_explode, str):
            columns_to_explode = [columns_to_explode]
        columns = list(columns_to_explode)

        if not all(col in self.df.columns for col in columns):
            raise ValueError("One or more specified columns are not present in the DataFrame.")
        if not columns:
            return self.df  # Nothing to explode

        # Convert specified columns to lists if they are comma-separated strings
        split_cells = {col: [self._split_values(value) for value in self.df[col]] for col in columns}

        # Ensure all lists of a row have the same length by padding with None
        row_lengths = [max(lengths) for lengths in zip(*(map(len, split_cells[col]) for col in columns))]
        for col in columns:
            padded = [
                cells + [None] * (target - len(cells))
                for cells, target in zip(split_cells[col], row_lengths)
            ]
            self.df[col] = pd.Series(padded, index=self.df.index, dtype=object)

        # Explode multiple columns together and reset index
        self.df = self.df.explode(columns).reset_index(drop=True)

        return self.df

# Dataset file read by DataFrameOps (the '.csv' suffix selects the CSV reader)
file_path = "netflix_titles.csv"

# Load the file; the resulting DataFrame is held in df_ops.df
df_ops = DataFrameOps(file_path)

# Multi-valued columns to split into one row per entry
columns_to_explode = ['director', 'cast']  # Adjust as needed

# explode_columns replaces df_ops.df with the exploded frame and returns it
df_exploded = df_ops.explode_columns(columns_to_explode)

# Last expression of the cell: preview the exploded frame
df_exploded.head()  # Displaying only the first few rows for verification


In [None]:
import pandas as pd

class DataFrameOps:
    """Loads a CSV/Excel file into ``self.df`` and explodes multi-valued columns."""

    def __init__(self, file_path):
        """
        Initializes the class with a CSV or Excel file and creates a DataFrame.

        :param file_path: str, file path to the CSV or Excel file.
        :raises ValueError: if the path does not end in ``.csv`` or ``.xlsx``.
        """
        # Compare case-insensitively so that e.g. "DATA.CSV" is accepted too.
        normalized_path = file_path.lower()
        if normalized_path.endswith('.csv'):
            self.df = pd.read_csv(file_path)
        elif normalized_path.endswith('.xlsx'):
            self.df = pd.read_excel(file_path)
        else:
            raise ValueError("File must be either a CSV or Excel file.")

    @staticmethod
    def _split_values(value):
        """Returns the entries of one cell as a list; ``[None]`` when the cell is empty."""
        if isinstance(value, str):
            parts = [part.strip() for part in value.split(',')]
            parts = [part for part in parts if part]
            return parts or [None]
        if isinstance(value, (list, tuple)):
            return list(value) or [None]
        return [None]  # NaN / None / any other scalar

    def explode_columns(self, columns_to_explode):
        """
        Explodes the specified list of columns while ensuring non-exploded columns remain intact.

        Comma-separated strings are split into their entries. When several columns
        are given, the lists of each row are padded with None to a common length
        (a multi-column explode requires matching element counts), so entries are
        aligned by position - this is not a cross product.

        The padding is computed from plain Python lists, which avoids a row-wise
        ``DataFrame.apply`` per column and does not rely on ``DataFrame.map``
        (only available in recent pandas releases).

        Non-exploded columns are simply repeated by the explode. They are NOT
        forward-filled: that would copy values from the previous title into
        cells that are genuinely missing.

        :param columns_to_explode: list of columns to be exploded
                                   (a single column name is accepted as well).
        :return: DataFrame with exploded columns; it also replaces ``self.df``.
        :raises ValueError: if a requested column is not present.
        """
        if isinstance(columns_to_explode, str):
            columns_to_explode = [columns_to_explode]
        columns = list(columns_to_explode)

        if not all(col in self.df.columns for col in columns):
            raise ValueError("One or more specified columns are not present in the DataFrame.")
        if not columns:
            return self.df  # Nothing to explode

        # Convert specified columns to lists if they are comma-separated strings
        split_cells = {col: [self._split_values(value) for value in self.df[col]] for col in columns}

        # Ensure all lists of a row have the same length by padding with None
        row_lengths = [max(lengths) for lengths in zip(*(map(len, split_cells[col]) for col in columns))]
        for col in columns:
            padded = [
                cells + [None] * (target - len(cells))
                for cells, target in zip(split_cells[col], row_lengths)
            ]
            self.df[col] = pd.Series(padded, index=self.df.index, dtype=object)

        # Explode multiple columns together and reset index
        self.df = self.df.explode(columns).reset_index(drop=True)

        return self.df

# Dataset file read by DataFrameOps (the '.csv' suffix selects the CSV reader)
file_path = "netflix_titles.csv"

# Load the file; the resulting DataFrame is held in df_ops.df
df_ops = DataFrameOps(file_path)

# Multi-valued columns to split into one row per entry
columns_to_explode = ['director', 'cast', 'country', 'listed_in']  # Adjust as needed

# explode_columns replaces df_ops.df with the exploded frame and returns it;
# `df_exploded` is the frame the analysis cells further down read from
df_exploded = df_ops.explode_columns(columns_to_explode)

# Last expression of the cell: preview the exploded frame
df_exploded.head()  # Displaying only the first few rows for verification


In [None]:
# Spot check: show the rows of `df` whose title is exactly 'Blood & Water'
df[df['title'] == 'Blood & Water']

In [None]:
# df_exploded.info()
# DataFrame.size is the total number of cells (rows x columns), not the row count.
# NOTE(review): if the number of exploded rows was intended, use len(df_exploded) or .shape - confirm.
df_exploded.size



In [None]:
# Number of distinct directors and distinct cast members in the exploded data.
# (The earlier intermediate assignment was overwritten immediately and has been removed.)
df_act_dir = df_exploded.set_index('show_id')[['director', 'cast']].nunique()
df_act_dir


In [None]:
# One row per show (kept because later cells read `df_unique`). It must NOT be used
# for pair counting: it retains only the first exploded row of each show, i.e. a
# single director/actor combination per title.
df_unique = df_exploded.drop_duplicates(subset=['show_id'])

# Distinct directors and distinct cast members of every show
show_directors = (
    df_exploded[['show_id', 'title', 'director']]
    .dropna(subset=['director'])
    .drop_duplicates()
)
show_cast = (
    df_exploded[['show_id', 'cast']]
    .dropna(subset=['cast'])
    .drop_duplicates()
)

# Every (director, actor) combination of a show, exactly once per show, with
# show_id as the index and the columns director, cast and title
df_act_dir = (
    show_directors.merge(show_cast, on='show_id')
    .set_index('show_id')[['director', 'cast', 'title']]
)

# Drop rows where director, cast or title is NaN
df_act_dir = df_act_dir.dropna()

# Count the shows each (director, actor) pair worked on together
pair_counts = df_act_dir.groupby(['director', 'cast']).size().reset_index(name='Number of Shows')

# Get the five most frequent director-actor pairs
top_pair = pair_counts.sort_values(by='Number of Shows', ascending=False).head(5)

# Display the results
top_pair


In [None]:
# Distinct directors and distinct cast members of every show. Building the pairs
# from all exploded rows (rather than from one row per show) makes sure every
# director/actor combination of a title is counted.
show_directors = (
    df_exploded[['show_id', 'title', 'director']]
    .dropna(subset=['director'])
    .drop_duplicates()
)
show_cast = (
    df_exploded[['show_id', 'cast']]
    .dropna(subset=['cast'])
    .drop_duplicates()
)

# Create a subset DataFrame with show_id as the index, containing director, cast, and title
df_act_dir = (
    show_directors.merge(show_cast, on='show_id')
    .set_index('show_id')[['director', 'cast', 'title']]
)

# Drop rows where director, cast or title is NaN
df_act_dir = df_act_dir.dropna()

# Count the shows of each (director, actor) pair. The previous dict-based agg listed
# the key 'title' twice, so its list aggregation was silently discarded; the titles
# are collected separately below.
pair_counts = df_act_dir.groupby(['director', 'cast']).size().reset_index(name='Number of Shows')

# Get the top 5 most frequent director-actor pairs
top_pairs = pair_counts.sort_values(by='Number of Shows', ascending=False).head(5)

# Titles each pair worked on together, collected as a list
top_pairs_with_titles = df_act_dir.groupby(['director', 'cast'])['title'].agg(list).reset_index()

# Merge the counts with the titles
final_top_pairs = top_pairs.merge(top_pairs_with_titles, on=['director', 'cast'], how='left')

# Display the results
final_top_pairs
