In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
import seaborn as sns
from scipy.stats import boxcox
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
# Load the raw titles table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="DATA/netflix_titles.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'DATA/netflix_titles.csv'

In [None]:
# Render the loaded frame with the notebook's rich display.
display(data)

In [None]:
# Print pandas' structural summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Summary statistics; with default arguments pandas reports numeric columns only.
data.describe()

In [None]:
# Column labels of the loaded frame.
data.columns

## Checking for and removing duplicates

In [None]:
# Boolean mask marking every row that repeats an earlier row in full.
duplicate = data.duplicated()

# Count from the mask we already have instead of recomputing duplicated()
# and summing it element by element with the Python builtin.
duplicate_count = int(duplicate.sum())
print("there are", duplicate_count, "duplicates in the dataset")

# Only show the offending rows when there is something to show.
if duplicate_count > 0:
    print("duplicate data: ", data[duplicate])

In [None]:
# Print the shape of the data before removing duplicates
print("Data shape before duplicate removal:", data.shape)

# Drop fully duplicated rows. Rebinding the name (instead of inplace=True)
# gives the same result without mutating the frame in place.
data = data.drop_duplicates()

# Print the shape of the data after removing duplicates
print("Data shape after duplicate removal:", data.shape)

## Checking for and handling missing values

In [None]:
# Number of distinct values per column.
data.nunique()

In [None]:
# Per-column count of missing entries; report only the columns that have any.
missing_values = data.isnull().sum()
print(missing_values.loc[missing_values.gt(0)])

In [None]:
# Treat the "?" placeholder as a missing value.
data.replace(to_replace="?", value=np.nan, inplace=True)

# Split the columns by dtype: numeric vs. everything else.
numeric_columns = data.select_dtypes(include=np.number).columns
categorical_columns = data.select_dtypes(exclude=np.number).columns

# Fill numeric columns with the column mean, then the remaining columns with
# their most frequent value. The order matches the original two-step code, so
# `imputer` ends up bound to the most-frequent imputer.
for columns, strategy in (
    (numeric_columns, 'mean'),
    (categorical_columns, 'most_frequent'),
):
    imputer = SimpleImputer(strategy=strategy)
    data[columns] = imputer.fit_transform(data[columns])

# Re-check: list any columns that still contain missing values.
missing_value = data.isna().sum()
print(missing_value.loc[missing_value.gt(0)])

In [None]:
# Show the frame as it stands after the imputation cell.
data

In [None]:
# Row count per distinct value of the `type` column, most frequent first.
data["type"].value_counts()

In [None]:
# Pie chart of how the titles split across the values of `type`.
size = data["type"].value_counts()

# Take the wedge labels from the counts themselves so each label is guaranteed
# to sit on its own wedge; a hard-coded list silently mislabels the chart if
# the value_counts() order ever differs from the assumed one.
labels = size.index.tolist()
colors = ['#FF6D00', '#FFD600']
explode = [0, 0.2]  # assumes exactly two categories, as the colors above do

plt.rcParams["figure.figsize"] = (5, 5)
# "%%" renders a literal percent sign after each wedge's share.
plt.pie(size, labels=labels, colors=colors, explode=explode, shadow=True,
        startangle=15, autopct="%1.2f%%")
plt.title("Distribution of Movies and TV Shows", fontsize=15)
plt.legend()
plt.show()

In [None]:
# Flag the object-dtype columns and collect their names.
s = data.dtypes.eq('object')
cat_cols = s.index[s].tolist()
cat_cols


In [None]:
# Categorical columns with fewer than 30 distinct values, in cat_cols order.
cardinality = data[cat_cols].nunique()
low_cardinality_cols = cardinality[cardinality < 30].index.tolist()
low_cardinality_cols

In [None]:
# Discard every row that still contains at least one missing value.
data = data.dropna(axis=0, how="any")


## Fixing Outliers

In [None]:
# Inspect how a Box-Cox transformation reshapes each numeric column.
# NOTE: the transformed values are only plotted and reported here; they are
# not written back into `data`.

# Select the numeric columns to apply Box-Cox transformation
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

# Loop through each numeric column and apply Box-Cox transformation
for column in numeric_columns:
    values = data[column]

    # scipy's boxcox raises ValueError unless the input is strictly positive
    # and not constant, so skip such columns instead of aborting the cell.
    if (values <= 0).any() or values.nunique() < 2:
        print("Skipping Box-Cox for {}: values must be positive and not constant".format(column))
        continue

    # Check the distribution of the original data
    plt.hist(values, bins=20)
    plt.title("Original Data Distribution - {}".format(column))
    plt.show()

    # Apply Box-Cox transformation (lambda is fitted by maximum likelihood)
    transformed_data, lambda_value = boxcox(values)

    # Check the distribution of the transformed data
    plt.hist(transformed_data, bins=20)
    plt.title("Transformed Data Distribution - {}".format(column))
    plt.show()

    # Print the optimal lambda value
    print("Optimal lambda value for {}: {}".format(column, lambda_value))

## Fixing errors, naming conventions




In [None]:
# Standardize the numeric features (zero mean, unit variance per column).
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler().fit(data[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])

# Integer-encode every object column, keeping one fitted encoder per column
# so the mapping can be looked up later.
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, encoder in label_encoders.items():
    data[column] = encoder.fit_transform(data[column])

# Check the preprocessed dataset
print(data.describe())


## Data validation

In [None]:
# Validate that every release year parses as a calendar year.
#
# Fixes relative to the earlier version of this cell:
#   * `release_year` holds a bare year, so the strptime format is "%Y";
#     "%Y-%m-%d" cannot match a year-only value.
#   * The preprocessing cell standardized the numeric columns in place, so the
#     values are un-scaled on a copy before being checked.
#   * Offending values are converted to str before joining; str.join() raises
#     TypeError on non-string items.
valid_date_format = "%Y"
invalid_dates = []
movie_data = data.copy()

# NOTE(review): assumes `scaler` and `numeric_columns` are still the objects
# created in the standardization cell — confirm if cells are reordered.
movie_data[numeric_columns] = scaler.inverse_transform(movie_data[numeric_columns])

# Round away floating-point noise from the inverse transform; checking each
# distinct year once is enough and keeps the report free of repeats.
for year in movie_data["release_year"].round().astype(int).unique():
    try:
        pd.to_datetime(str(year), format=valid_date_format)
    except ValueError:
        invalid_dates.append(year)
if invalid_dates:
    print(f"Invalid release dates found: {', '.join(map(str, invalid_dates))}")
else:
    print("The given dataset is validated as the year is in the specified format")

In [None]:
# Independent snapshot of the preprocessed frame for the modelling sections.
netflix_data = data.copy(deep=True)

In [None]:
# Column labels available to the modelling sections.
netflix_data.columns

## SVM Classifier

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC,LinearSVC
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Columns used for SVM classification (`type` is the target, the rest are features).
selected_columns = [
    'show_id', 'type', 'title', 'director', 'cast', 'country',
    'date_added', 'release_year', 'rating', 'duration', 'listed_in',
    'description',
]

# Restrict the modelling frame to those columns.
data_1 = netflix_data.loc[:, selected_columns]

In [None]:
# Separate the target from the features, then hold out 20% of rows for testing.
y = data_1['type']
X = data_1.drop(columns='type')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Class labels and their counts in each partition.
train_unique_labels, train_unique_label_count = np.unique(y_train, return_counts=True)
test_unique_labels, test_unique_label_count = np.unique(y_test, return_counts=True)

# Report each class as a fraction of its partition, rounded to 2 decimals.
print("Train set distribution:\n")
print(train_unique_labels, np.round(train_unique_label_count / len(X_train), 2))

print("\nTest set distribution:\n")
print(test_unique_labels, np.round(test_unique_label_count / len(X_test), 2))

In [None]:
# Degree-3 polynomial feature expansion -> standardization -> linear SVM.
polynomial_svm_pipeline = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    # Hinge loss is solved with the dual coordinate-descent solver, which
    # shuffles the data randomly. Seed it (same seed as the train/test split)
    # so the cross-validation scores are reproducible across runs.
    ("linear_svc", LinearSVC(C=1, loss="hinge", random_state=42))
    ])

# 5-fold cross-validated accuracy on the training partition, using all cores.
polynomial_svm_cross_val_scores = cross_val_score(
    polynomial_svm_pipeline, X_train, y_train, scoring="accuracy", cv=5, n_jobs=-1)

In [None]:
# Mean and spread of the cross-validation accuracies.
print(
    f"Mean Score: {np.mean(polynomial_svm_cross_val_scores)} \n"
    f"Mean Score Std. Dev.: {np.std(polynomial_svm_cross_val_scores)}"
)

In [None]:
# Fit a support-vector classifier with default settings on the training split.
model = SVC()
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows.
y_pred = model.predict(X=X_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Rows are true classes, columns are predicted classes.
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", confusion_mat, sep="\n")

In [None]:
# Pairwise scatter/distribution grid of the modelling columns, coloured by `rating`.
sns.pairplot(data=data_1, hue="rating")

In [None]:
# Same grid, coloured by `release_year`.
sns.pairplot(data=data_1, hue="release_year")

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Columns used for the Random Forest model (`type` is the target).
selected_columns = ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
                     'release_year', 'rating', 'duration', 'listed_in', 'description']

# Bind the subset to `data_1`, the name the following split cell reads.
# It was previously assigned to `data`, which left this selection unused and
# overwrote the preprocessed frame held in `data`.
data_1 = netflix_data[selected_columns]

In [None]:
# Separate target and features, then hold out 20% of the rows for evaluation.
y = data_1['type']
X = data_1.drop(columns='type')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Random forests bootstrap rows and sample features at random; seed the
# estimator (same seed as the train/test split) so the reported accuracy is
# reproducible from one run to the next.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out rows and report the share classified correctly.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Rows are true classes, columns are predicted classes.
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", confusion_mat, sep="\n")


## Logistic Regression

In [None]:

from sklearn.linear_model import LogisticRegression

# Logistic regression on a reduced frame: `type` is the target and `rating`
# is the only feature.
selected_columns = ['type', 'rating']

data = netflix_data[selected_columns]

# Build X and y from the subset selected above. The split previously reused
# the X and y left over from the Random Forest section, so the column
# selection had no effect and the model was trained on all features.
X = data.drop('type', axis=1)
y = data['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Rows are true classes, columns are predicted classes.
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", confusion_mat, sep="\n")