In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

## Loading Datasets

In [None]:
# Folder that holds the three raw CSV files.  Kept in one place so the
# notebook can be pointed at another machine's data by editing a single line.
DATA_DIR = '/Users/lailaalmajnuni/Downloads/EDA'

# Netflix catalogue: 12 columns x 8807 rows.
netflix = pd.read_csv(DATA_DIR + '/netflix_titles.csv', skipinitialspace=True)

# IMDb ratings: only 2 of the 49 columns are loaded (85855 rows).
votes = pd.read_csv(DATA_DIR + '/IMDb_ratings.csv', skipinitialspace=True,
                    usecols=['weighted_average_vote', 'imdb_title_id'])

# IMDb movies: only 2 of the 22 columns are loaded (85855 rows).
names = pd.read_csv(DATA_DIR + '/IMDb_movies.csv', skipinitialspace=True,
                    usecols=['original_title', 'imdb_title_id'])

# Preview the id -> title lookup table.
names.head(10)

## Joining IMDb Datasets

In [None]:
# Normalise the title columns so the title-based join with the Netflix frame
# is not defeated by stray whitespace or capitalisation differences.
# The .str accessors return NEW Series; without assigning the result back the
# frames are left untouched and the normalisation silently does nothing.
# Side effect: titles are lower-case in every downstream table and printout.
netflix['title'] = netflix['title'].str.strip().str.lower()
names['original_title'] = names['original_title'].str.strip().str.lower()

# Attach each IMDb rating to its title through the shared id, then drop the
# id and expose the title under the column name the Netflix frame uses.
joined_IMDb = (
    pd.merge(votes, names, how='inner', on=['imdb_title_id'])
    .drop(columns=['imdb_title_id'])
    .rename(columns={'original_title': 'title'})
)

## Your Final Dataset To Work With
##### Joining the IMDb dataset with the Netflix dataset and cleaning the data


In [None]:
# Build the working dataset: IMDb ratings joined to the Netflix catalogue on
# title, with friendlier column names, unused columns removed and one row
# per title.
inner_final = (
    pd.merge(joined_IMDb, netflix, how='inner', on=['title'])
    .rename(columns={'listed_in': 'genre', 'weighted_average_vote': 'votes'})
    .drop(columns=['show_id', 'director', 'cast', 'description', 'duration'])
    .drop_duplicates(subset=['title'])
    # Fill at frame level: `inner_final.country.fillna(..., inplace=True)` is
    # a chained in-place call that pandas warns about and that does not
    # modify the frame once Copy-on-Write is enabled.
    .fillna({'country': "Country Unavailable"})
    # Rows without a maturity rating are discarded.
    .dropna(subset=["rating"])
    .sort_values('votes')
)
inner_final.info()
# Author's note: the final dataset had a size of 8 X 3177 when this was written.

## Just to get a sense of the data distribution

In [None]:
# Create the figure BEFORE plotting.  Calling plt.figure() after the
# histogram opens a second, empty figure, and savefig() then writes that
# blank canvas to disk instead of the plot.
fig, ax = plt.subplots(figsize=(15, 15))
inner_final.hist(column='votes', color='#9458D6', ax=ax)
# Author's observation: votes are (roughly) normally distributed.

# DataFrame.hist titles the axes with the column name; replace it.
ax.set_title('Rating Distribution of Netflix Content', fontsize=18, fontweight='bold')
ax.set_xlabel('Rating out of 10', fontsize=15, fontweight='bold')
ax.set_ylabel('Count of Rating', fontsize=15, fontweight='bold')
ax.tick_params(axis='both', labelsize=10)

# Save through the figure handle so the file always contains this plot.
fig.savefig('Votes.png')

In [None]:
# Create the figure BEFORE plotting.  Calling plt.figure() after the
# histogram opens a second, empty figure, and savefig() then writes that
# blank canvas to disk instead of the plot.
fig, ax = plt.subplots(figsize=(15, 15))
inner_final.hist(column='release_year', color='#CC5D8B', ax=ax)
# Author's observation: release year is (roughly) exponentially distributed.
# NOTE(review): the plotted column is release_year, while the title talks
# about "added" content -- confirm which of the two is intended.

# DataFrame.hist titles the axes with the column name; replace it.
ax.set_title('Added Content by Year Distribution', fontsize=18, fontweight='bold')
ax.set_xlabel('Years', fontsize=15, fontweight='bold')
ax.set_ylabel('Count of Content per Year', fontsize=15, fontweight='bold')
ax.tick_params(axis='both', labelsize=10)

# Save through the figure handle so the file always contains this plot.
fig.savefig('Year.png')

## Country-wise Diversity

In [None]:
# The country field is a ', '-separated list.  Split it into one column per
# country, stack those columns into a single Series and drop the positional
# level, leaving one entry per (title, country) pair indexed by title.
country_wise = (
    inner_final
    .set_index('title')['country']
    .str.split(', ', expand=True)
    .stack()
    .reset_index(level=1, drop=True)
)
# Discard the placeholder used for titles with no country information.
country_wise = country_wise.loc[country_wise.ne('Country Unavailable')]

plt.figure(figsize=(20, 10))

plt.title('Top 5 Countries by Content on Netflix', fontsize=30, fontweight='bold')

# Bars are limited to, and ordered by, the five most frequent countries.
Country_wise = sns.countplot(
    x=country_wise,
    order=country_wise.value_counts().head(5).index,
)

plt.xlabel('Country', fontsize=25, fontweight='bold')
plt.ylabel('Count of Content', fontsize=25, fontweight='bold')
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.savefig('Country_wise.png')

## TV Shows Vs. Movies

In [None]:
# Number of missing 'type' values.  This is not the last expression of the
# cell, so the notebook does not display the result.
inner_final['type'].isnull().sum()

plt.figure(figsize=(10, 10))
mycolors = ['#FF9F9F', '#A09FFF']

# Share of each content type, computed once and reused for wedges and labels.
type_counts = inner_final.type.value_counts()


def _wedge_label(pct):
    """Render a wedge as 'percentage % (absolute count)'."""
    return '{:.1f} % ({:.0f})'.format(pct, (pct / 100) * inner_final['type'].count())


TM = plt.pie(
    type_counts,
    labels=type_counts.index,
    autopct=_wedge_label,
    textprops={'fontsize': 24},
    colors=mycolors,
)

plt.savefig('TM.png')

## When is the best month to subscribe?

In [None]:
# Apply the seaborn theme before any figure or axes exist.  Axes pick up
# their style when they are created, so setting the theme afterwards makes
# the look of this plot depend on which cells were run earlier.
sns.set(style="darkgrid")

# Derive year and month from the free-text date_added column.
# assumes values look like "September 25, 2021", possibly with leading
# whitespace -- TODO confirm against the raw file.
# assign() builds a new frame, which avoids the SettingWithCopyWarning that
# column assignment on the result of dropna() can trigger.
netflix_date = inner_final.dropna(subset=['date_added']).assign(
    year=lambda d: d['date_added'].str.split(', ').str[-1].astype(int),
    month=lambda d: d['date_added'].str.lstrip().str.split(' ').str[0],
)
# Keep only titles added in 2018, 2019 or 2020.
netflix_date = netflix_date[netflix_date['year'].between(2018, 2020)]

plt.figure(figsize=(15, 15))
plt.title('Which Month to Subscribe?', fontsize=24, fontweight='bold')

# Months are ordered from most to fewest additions.
When = sns.countplot(x='month', data=netflix_date,
                     order=netflix_date['month'].value_counts().index,
                     palette='pastel')

plt.xlabel('Month', fontsize=20, fontweight='bold')
plt.ylabel('Number of Listed Shows', fontsize=20, fontweight='bold')
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.savefig('When.png')

## 10 Most common Genres on Netflix

In [None]:
# The genre column holds a ', '-separated list of genres.  Split it, stack
# the pieces into one Series (one entry per title/genre pair, indexed like
# inner_final) and join it back so every row carries exactly one genre.
genre_per_row = (
    inner_final['genre']
    .str.split(', ', expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .rename('genre')
)
filtering_genre = inner_final.drop(columns='genre').join(genre_per_row)

plt.figure(figsize=(12, 8))
plt.title('10 Most Common Genres on Netflix', fontsize=24, fontweight='bold')
sns.set(style="darkgrid")

# Horizontal bars for the ten most frequent genres, most common first.
Common_Genre = sns.countplot(
    y='genre',
    data=filtering_genre,
    order=filtering_genre['genre'].value_counts().head(10).index,
    palette='Spectral',
)

plt.xlabel('Number of Listed Shows', fontsize=20)
plt.ylabel('Genres', fontsize=20)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.savefig('Common_Genre.png')

## Removing the first two quartiles
#### Keep only the higher-rated 50% of the dataset, restricted to genres from the top 10, because we need the maxima: these are the shows that are great

In [None]:
# Restrict the per-genre rows to the seven genres compared below.
# .copy() makes df an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning.
df = filtering_genre.loc[
    filtering_genre.genre.isin(['Dramas', 'International Movies', 'Comedies',
                                'Independent Movies', 'Action & Adventure',
                                'Thrillers', 'Romantic Movies'])
].copy()

# Median and maximum of the votes; only the median is used as a cut-off.
lower_quantile, upper_quantile = df.votes.quantile([.5, 1])

# Despite its name, top_five holds every row rated above the median.
top_five = df.loc[df.votes > lower_quantile]

# Summarise the numeric columns only: asking for mean/median/std on the
# string columns (title, country, ...) raises TypeError in pandas >= 2.0.
print(top_five.groupby('genre')[['votes', 'release_year']]
      .agg(['min', 'max', 'median', 'mean', 'std', 'count']))


## Plotting the rating boxplot
#### to see if there is a difference in the average across genres

In [None]:
# Apply the seaborn theme before the figure is created; axes take their
# style at creation time, so setting it afterwards has no effect on them.
sns.set(style="darkgrid")

plt.figure(figsize=(30, 20))
# Spelling fixed in the title and y-label: they are rendered into the saved
# image.
plt.title('Average Viewer Rating of the 7 Most \nListed Genres On Netflix', fontsize=35,
          fontweight='bold')

# One box per genre showing the distribution of votes in the top-rated half.
Rating_Genre = sns.boxplot(y='votes', x='genre', data=top_five, palette='colorblind')
# Fix the y-range on the axes that was actually drawn on.
Rating_Genre.set_ylim(5, 10)

plt.xlabel('Genres', fontsize=30, fontweight='bold')
plt.ylabel('Average Ratings', fontsize=30, fontweight='bold')
plt.xticks(fontsize=28)
plt.yticks(fontsize=28)

plt.savefig('Rating_Genre.png')

## Recommendations

In [None]:
# Store release_year as float.  assign() rebinds df to a new frame rather
# than writing into a slice of filtering_genre, which avoids
# SettingWithCopyWarning.
df = df.assign(release_year=df.release_year.astype('float'))

# Only titles released after 2015.
firstfilter = df.loc[df['release_year'] > 2015]

# NOTE(review): country is a ', '-separated list, so this removes only rows
# whose country is exactly 'India'; co-productions such as
# 'India, United States' stay in -- confirm that this is intended.
secondfilter = firstfilter.loc[firstfilter['country'] != 'India']

# Three highest-voted titles per genre.  Sorting and then taking the head of
# each group keeps the 'genre' column in the result and avoids
# groupby.apply(), which is deprecated for operating on the grouping column.
recommendations = (
    secondfilter
    .sort_values(['genre', 'votes'], ascending=[True, False])
    .groupby('genre')
    .head(3)
)
print(recommendations)