## Importing the Dependencies

In [None]:
# Core libraries: numerical computing, tabular data, and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output
%matplotlib inline
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Use seaborn's 'darkgrid' style for all subsequent plots
sns.set_style('darkgrid')

## Loading the data

In [None]:
# Load the Netflix titles dataset from a CSV file in the working directory.
data = pd.read_csv('netflix_titles.csv')
# Preview only the first rows instead of dumping the whole frame into the output.
data.head()

## EDA

### Information of the dataset

In [None]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage
data.info()

##### Here, we can see the dataset information, such as the non-null count and the dtype of each column. Most of the columns have 8807 non-null values, while director has 6173, cast has 7982, country has 7976, date_added has 8797, and rating and duration have 8803 and 8804 respectively.
The memory usage is 825.8+ KB.

### Cleaning the dataset

In [None]:
# Count the missing entries in every column
data.isna().sum()

##### From the output above, we know that there are 6 columns with null values: the ‘director’ column has 2634 null values, ‘cast’ has 825, ‘country’ has 831, ‘date_added’ has 10, ‘rating’ has 4, and ‘duration’ has 3.

In [None]:
# Visualise where values are missing: each heatmap row is a record and each
# heatmap column is a feature; highlighted cells mark missing values.
ax = sns.heatmap(data.isna(), cbar=False, yticklabels=False)
# Label the figure so it can be understood without the surrounding prose.
ax.set_title('Missing values per column')
ax.set_xlabel('Column')
ax.set_ylabel('Row');

Most of the missing values are in the director, cast and country columns; a few are in date_added, rating and duration.

#### Taking Care of the Missing data

In [None]:
# Replace missing values with an explicit placeholder per column.
#
# A single DataFrame-level fillna is used instead of
# `data.<col>.fillna(..., inplace=True)`: that form mutates an intermediate
# Series (chained assignment), triggers a FutureWarning on pandas 2.x and
# leaves `data` unchanged once copy-on-write is enabled.
#
# NOTE(review): 'MISING' is misspelled; it is kept unchanged in case other
# cells compare against this exact placeholder.
MISSING_PLACEHOLDERS = {
    'director': 'Not available',
    'cast': 'No cast',
    'country': 'Country Unavailable',
    'date_added': 'Not available',
    'rating': 'MISING',
    'duration': 'Unavailable',
}
data = data.fillna(value=MISSING_PLACEHOLDERS)

In [None]:
# Display the frame after the missing values have been filled
data

In [None]:
# Re-count missing values per column after the fill step
data.isnull().sum()

Now, we have cleaned our dataset and it is ready to use

Next, we check whether there are any duplicated rows.

In [None]:
# Count fully duplicated rows: duplicated() flags every repeat of an earlier row,
# and sum() totals those flags.
dup_value=data.duplicated().sum()
print('The duplicated value is: ', dup_value)

### Visualization

In [None]:
# Number of titles of each type
data['type'].value_counts()

In [None]:
# Bar chart of the number of titles per type.
# The column is passed as the `x` keyword: on seaborn >= 0.12 the first
# positional parameter of countplot is `data`, so a positional 'type'
# combined with `data=data` fails.
sns.countplot(x='type', data=data);

In [None]:
# Share of each title type as a pie chart.
# Compute the counts once and reuse them for both the wedge sizes and the labels.
type_counts = data['type'].value_counts()
plt.pie(type_counts, labels=type_counts.index, autopct='%1.1f%%')
plt.title('Share of Movies vs TV Shows');

The plots above show the share of Movies and TV Shows in the dataset: Movies make up 69.6% and TV Shows make up 30.4%.

This shows that the dataset contains more Movies than TV Shows.

### Top Countries producing Movies and Tv shows

In [None]:
# A title may list several countries separated by commas. Put every country on
# its own row (the title's `type` stays as the index) and strip the whitespace
# that follows each comma, so that e.g. ' India' and 'India' are counted together.
Top_10_countries = (
    data.set_index('type')
    .country.str.split(',')
    .explode()
    .str.strip()
)
# Drop the missing-data placeholder and any empty fragments left by stray commas.
Top_10_countries = Top_10_countries[
    (Top_10_countries != 'Country Unavailable') & (Top_10_countries != '')
]
# Pass the values through the `x` keyword (seaborn >= 0.12 treats the first
# positional argument as `data`). A plain array is enough here: the repeated
# `type` index labels play no role in the counts.
sns.countplot(
    x=Top_10_countries.to_numpy(),
    order=Top_10_countries.value_counts().index[:10],
)
plt.xticks(rotation=90)
plt.xlabel('Top_10_Countries')
plt.ylabel('Counts')
plt.title('TOP TEN COUNTRIES PRODUCING MOVIE OR TV_SHOWS');

This shows that the United States(US) leads in producing Movies and TV shows

In [None]:
# Display the per-country Series (one entry per listed country, indexed by title type)
Top_10_countries

### Top Directors producing Movie or TV shows

In [None]:
# A title may list several directors separated by commas. Put every director on
# its own row (the title's `type` stays as the index) and strip the whitespace
# that follows each comma, so the same name is not counted under two spellings.
Top_directors = (
    data.set_index('type')
    .director.str.split(',')
    .explode()
    .str.strip()
)
# Drop the missing-data placeholder and any empty fragments left by stray commas.
Top_directors = Top_directors[
    (Top_directors != 'Not available') & (Top_directors != '')
]
# Pass the values through the `x` keyword (seaborn >= 0.12 treats the first
# positional argument as `data`). A plain array is enough here: the repeated
# `type` index labels play no role in the counts.
sns.countplot(
    x=Top_directors.to_numpy(),
    order=Top_directors.value_counts().index[:10],
)
plt.xticks(rotation=90)
plt.xlabel("Top Directors")
plt.ylabel("Counts")
plt.title("TOP DIRECTORS PRODUCING MOVIES OR TV SHOWS");

This shows that the director Rajiv Chilaka has directed more titles than any other director in this dataset.

In [None]:
# Display the per-director Series (one entry per listed director, indexed by title type)
Top_directors

### Top Actors appearing in Movies or TV shows

In [None]:
# The cast column lists several actors separated by commas. Put every actor on
# its own row (the title's `type` stays as the index) and strip the whitespace
# that follows each comma, so the same name is not counted under two spellings.
Top_Actors = (
    data.set_index('type')
    .cast.str.split(',')
    .explode()
    .str.strip()
)
# Drop the missing-data placeholder and any empty fragments left by stray commas.
Top_Actors = Top_Actors[(Top_Actors != 'No cast') & (Top_Actors != '')]
# Pass the values through the `x` keyword (seaborn >= 0.12 treats the first
# positional argument as `data`). A plain array is enough here: the repeated
# `type` index labels play no role in the counts.
sns.countplot(
    x=Top_Actors.to_numpy(),
    order=Top_Actors.value_counts().index[:10],
)
plt.xticks(rotation=90)
plt.title("TOP ACTORS")
plt.xlabel("CAST")
plt.ylabel("Counts");

In [None]:
# Display the per-actor Series (one entry per listed cast member, indexed by title type)
Top_Actors

### Dates at which Movies or Tv shows were added

In [None]:
# Count titles per date on which they were added.
# The date strings contain a comma between the day and the year, so splitting
# on ',' would count "Month day" fragments and " year" fragments in the same
# tally. Keep each date whole and only trim surrounding whitespace.
Date_added = data.set_index('type').date_added.str.strip()
# Exclude the placeholder used for missing dates.
Date_added = Date_added[Date_added != 'Not available']
# Pass the values through the `x` keyword (seaborn >= 0.12 treats the first
# positional argument as `data`). A plain array is enough here: the repeated
# `type` index labels play no role in the counts.
sns.countplot(
    x=Date_added.to_numpy(),
    order=Date_added.value_counts().index[:10],
)
plt.xticks(rotation=90)
plt.xlabel('Date Added')
plt.ylabel('Counts')
plt.title('DATE MOVIES OR TV SHOWS ADDED');

In [None]:
# List the column labels of the frame
data.columns

### Correlation features

In [None]:
# scikit-learn's LabelEncoder maps each distinct value of a column to an integer code.
# NOTE(review): `sklearn` and `preprocessing` are imported here but not used in the
# visible cells; these imports would normally live in the import cell at the top.
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance; fit_transform re-fits it every time it is called
labelencoder = LabelEncoder()

In [None]:
# Encode every column as integer labels; the encoder is re-fitted per column.
# `data` is overwritten with the encoded frame.
data = data.apply(lambda column: labelencoder.fit_transform(column))

In [None]:
# Show the Python type of `data` after the column-wise encoding
type(data)

In [None]:
# Display the label-encoded frame
data

In [None]:
# Pairwise correlation matrix of the columns (pandas' default method).
# NOTE(review): the inputs are arbitrary integer label codes, so the coefficients
# reflect the code ordering rather than a natural numeric scale — interpret with care.
corr = data.corr()

In [None]:
# Annotated heatmap of the correlation matrix on a fixed -1..1 colour scale
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr, annot=True, vmin=-1, vmax=1, ax=ax)

A value of +1 indicates a perfect positive correlation between two variables, a value of -1 indicates a perfect negative correlation, and a value of 0 indicates that there is no linear relationship.

In the map above, we can see that there is little to no correlation between the variables, as the values between different columns are all close to 0.