In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('datasets/data_set.csv')

# <font color="#5C372C">Data Pre-Processing</font>

In [None]:
df.head(2)

In [None]:
df.isna()

In [None]:
# Number of missing values per column. Summing the boolean mask counts the True
# (missing) cells; .count() would only report the number of rows in every column.
df.isna().sum()

<font color=orange>The data has no null entry.</font>

### Data Filtering

In [None]:
df.head(2)

##### <font color=red>Check for unique backdrop_path</font>

In [None]:
df['backdrop_path'].nunique()

##### <font color=red>There is only one(1) unique backdrop_path for every movie so removing this column is necessary.</font>

In [None]:
df = df.drop('backdrop_path', axis=1)

##### <font color="red">Dropping unnecessary columns</font>
<table border="1" style = "
                text-align: center;
                background-color: #e04b26;
                color: white;
">
  <tr>
    <th>Columns to be dropped</th>
    <th>Reasons</th>
  </tr>
  <tr>
    <td>'Unnamed: 0'</td>
    <td>Because that is just indexing from the raw CSV file.</td>
  </tr>
  <tr>
    <td>'crew'</td>
    <td>Because viewers' major focus is on the cast, the actors involved in the movie. So it is reasonable to drop the 'crew' column.</td>
  </tr>
  <tr>
    <td>'title'</td>
    <td>Because in almost every case, original_title is equal to the title in the dataframe.</td>
  </tr>
  <tr>
    <td>'video'</td>
    <td>Because it does not matter whether there is a video/trailer for a particular movie (row).</td>
  </tr>
</table>


In [None]:
# Remove the columns listed in the table above ('id' is dropped here as well)
df = df.drop(columns=['Unnamed: 0', 'id', 'video', 'crew', 'title'])
# Add a 0-based positional row_id column
row_id = list(range(df.shape[0]))
df['row_id'] = row_id

##### <font color=orange>It is better to rename some columns to reflect their usage in the database.</font>
<table border="1" style = "
                text-align: center;
                background-color: #e38622;
                color: white;
">
  <tr>
    <th>Columns to be renamed (from)</th>
    <th>Renamed value (to)</th>
  </tr>
  <tr>
    <td>'genre_ids'</td>
    <td>genres</td>
  </tr>
  <tr>
    <td>'original_language'</td>
    <td>language</td>
  </tr>
  <tr>
    <td>'original_title'</td>
    <td>title</td>
  </tr>
  <tr>
    <td>'release_date' (performed two cells below)</td>
    <td>release_year</td>
  </tr>
</table>


In [None]:
df = df.rename(columns={'genre_ids': 'genres', 'original_language': 'language', 'original_title': 'title'})

In [None]:
# Parse 'release_date' as datetimes and keep only the year component
release_dates = pd.to_datetime(df['release_date'])
df['release_year'] = release_dates.dt.year

# The full date is no longer needed once the year has been extracted
df.drop('release_date', axis=1, inplace=True)

# <font color="#5C372C">Segregation</font>

## Format 'genres' and 'keywords' Series for the DataFrame(df)

In [None]:
genre_comma_separated_string = df['genres']

In [None]:
import re

def clean_string(text):
    """Return *text* with every '[', ']' and single-quote character removed."""
    # One translation pass deletes all three characters at once
    unwanted_characters = str.maketrans('', '', "[]'")
    return text.translate(unwanted_characters)

# Apply the function to each element in the Series
genre_comma_separated_string = genre_comma_separated_string.apply(clean_string)

In [None]:
genre_comma_separated_string_df = pd.DataFrame(genre_comma_separated_string)
genre_comma_separated_string_df

##### Replace the genres and keywords list format in the data frame with genres string

In [None]:
df['genres'] = genre_comma_separated_string_df

## Generate 'keywords' dataset. The keywords column contains only the keyword names, not their associated IDs as provided in the original dataset.

In [None]:
df.head(2)

In [None]:
import re

# Per-row lists of keyword names (one inner list per input row), filled by getMappings
keywords_column = []
# Kept for backward compatibility; getMappings does not populate this list
keywords_ids = []
# Flat list of {'id': ..., 'name': ...} dicts over all rows, filled by getMappings
id_name_mappings = []

def getMappings(list_of_json_array_in_string):
    """Extract keyword ids and names from stringified arrays of keyword objects.

    For every input string, one list with all of its keyword names is appended
    to the module-level ``keywords_column``, and one ``{'id', 'name'}`` dict per
    keyword is appended to the module-level ``id_name_mappings``.

    The slicing offsets assume each element looks exactly like
    ``{'id': 123, 'name': 'foo'}`` (``id`` first, ``name`` second, one quote
    character around the name) -- verify against the raw dataset.

    Args:
        list_of_json_array_in_string: iterable of strings, each holding zero or
            more ``{...}`` elements.

    Returns:
        None. Results are accumulated in the module-level lists.
    """
    # Matches one brace-delimited element (no nested braces) inside the combined string
    json_pattern = re.compile(r'\{[^{}]+\}')
    for json_array_in_string in list_of_json_array_in_string:
        # Names of the keywords found in this row only
        row_keyword_names = []
        for s in json_pattern.findall(json_array_in_string):
            comma = s.find(',')
            # 7 skips "{'id': "; 11 skips ", 'name': '"; -2 strips the closing quote and brace
            name = s[comma + 11:-2]
            id_name_mappings.append({'id': int(s[7:comma]), 'name': name})
            row_keyword_names.append(name)
        # Keep every keyword of the row (previously the last two were sliced off)
        keywords_column.append(row_keyword_names)

In [None]:
l_df_k = df['keywords'].to_list() # array of id's and names for the keywords dataset

In [None]:
# Convert the list of JSON array to dict
getMappings(l_df_k)

In [None]:
id_name_mappings[3]

### <font color=teal>Set the keywords column (Mandatory)<br>Generate Dataframe and Export CSV (Optional for maintaining multiple tables)</font>

In [None]:
# Mandatory: replace the raw keyword strings with the per-row keyword name lists
df['keywords'] = keywords_column

# Optional: build the id/name keywords table and export it as its own CSV
keywords_dataframe = pd.DataFrame(id_name_mappings).set_axis(['id', 'name'], axis=1)
keywords_dataframe.to_csv('keywords_dataset.csv', index=False)

## Generate cast dataset.

In [None]:
df.head(2)

In [None]:
def getFilteredCastText(t):
    """Cut the name and character fields out of one stringified cast entry.

    Returns the text from ``'known_for_department'`` up to (excluding)
    ``, 'popularity``, followed by ``", "`` and the text from ``'character'``
    up to (excluding) ``, 'credit_id'``.

    ``str.find`` returns -1 for a missing marker, so an entry lacking any of the
    markers yields a silently wrong slice instead of an error.
    """
    identity_start = t.find("'known_for_department'")
    identity_end = t.find(", 'popularity")
    role_start = t.find("'character'")
    role_end = t.find(", 'credit_id'")
    return f"{t[identity_start:identity_end]}, {t[role_start:role_end]}"

# Demonstration of what getFilteredCastText can do
getFilteredCastText(df['cast'][0]).split("'")

In [None]:
from pandas import Series
import re

# Every row has one or more cast entries
# 1) Extract the entire cast row as a string
# 2) Split the entire string obtained in step 1 to form multiple cast strings (as there can be more than one cast in a movie)
# 3) Split each of the cast's string in such a way so to obtain 'known_for_department', 'name', 'original_name' and 'character'
# 4) Append all of this info obtained in step 3 to a single dimension list
# 5) Append this list to a two dimension list. This two-dimension list indicates the list of vital informations of all the casts in the movie
# 6) Append the two dimension list to a third dimension list. This third dimension list contains the two dimension lists of all the casts in a particular movie(row). So the third dimension list is the list 
# of vital information of all casts of each movie(row).

list_movies_casts_info = [] # List of all casts's vital information from each movie(row)

def getAllCasts(series: Series):
    """Parse every stringified cast list in *series* into per-movie dict lists.

    For each element of *series*, one list is appended to the module-level
    ``list_movies_casts_info``. That list holds one dict per cast entry with the
    keys ``row_id``, ``known_for_department``, ``name``, ``original_name`` and
    ``character``.

    Args:
        series: Series of strings, each a bracketed list of ``{...}`` cast entries.

    Returns:
        None. Results are accumulated in ``list_movies_casts_info``.
    """
    # One brace-delimited cast entry; compiled once for all rows
    pattern = re.compile(r'{[^}]*}')
    for row_index, casts in enumerate(series.tolist()):
        casts = casts[1:-1]  # Remove the square brackets from start and end
        row_cast_info_list = []
        for single_cast in pattern.findall(casts):
            # Splitting on the quote character puts the four values at fixed positions.
            # NOTE(review): a value that itself contains an apostrophe would shift
            # these positions -- confirm the data has no such entries.
            split_info = getFilteredCastText(single_cast).split("'")
            row_cast_info_list.append({
                # 1-based position of the movie within the series.
                # NOTE(review): the main table's row_id is built 0-based -- confirm
                # the offset is intended.
                'row_id': row_index + 1,
                'known_for_department': split_info[3],
                'name': split_info[7],
                'original_name': split_info[11],
                'character': split_info[-2],
            })
        list_movies_casts_info.append(row_cast_info_list)

In [None]:
getAllCasts(df['cast'])

In [None]:
list_movies_casts_info[1]

### Set the cast column and export the dataset

In [None]:
# flattening tool
from itertools import chain

# The main frame keeps the nested per-movie cast lists
df['cast'] = list_movies_casts_info

# The exported table needs one flat row per cast member so it can be written as plain CSV --
# remember that `row_id` is the primary key for the main table
flattened_list = list(chain.from_iterable(list_movies_casts_info))
cast_dataframe = pd.DataFrame(flattened_list).set_axis(
    ['common_row_id', 'known_for_department', 'name', 'original_name', 'character'],
    axis=1,
)
# Export(Optional)
cast_dataframe.to_csv('cast_dataset.csv', index=False)

In [None]:
df.head(2)

## Generate genres dataset.

In [None]:
df.head(1)

In [None]:
# Turn each comma-separated genre string into a list of genre names
df['genres'] = df['genres'].str.split(', ')

# Build one [g_id, genre] pair per genre of every movie; g_id is the row's index label + 1,
# so it repeats for movies that have several genres
genre_list = []
for index, genres in df['genres'].items():
    genre_list.extend([index + 1, genre] for genre in genres)

# Assemble the frame and give it a fresh 0..n-1 index
genre_df = pd.DataFrame(genre_list, columns=['g_id', 'genre']).reset_index(drop=True)

In [None]:
genre_df

In [None]:
df

# <font color="#5C372C">Analysis</font>

## Frequency plot on basis of original_language

In [None]:
df['language'].unique()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Language codes are categorical, so draw one bar per language. plt.hist would
# spread the codes over its 10 default bins and merge several languages per bar.
language_counts = df['language'].value_counts()
plt.figure(figsize=(20, 5))
plt.bar(language_counts.index, language_counts.values, color='orange', ec='blue')

#### <font color="#FFE5B4">Hence most movies are in <em>en</em>, <em>hi</em>, <em>ja</em>, <em>ko</em>, or <em>es</em>.</font>

## Frequency plot on basis of adult rated movie

In [None]:
adult_movies = df['adult']
# Materialise the column once and tally both flag values from the same list
adult_flags = adult_movies.to_list()
true_count, false_count = adult_flags.count(True), adult_flags.count(False)
labels = ['True', 'False']
counts = [true_count, false_count]

In [None]:
# Share of adult-rated (True) versus non-adult (False) movies
plt.pie(counts, labels=labels, autopct='%1.1f%%', colors=['red', 'green'])
plt.title('True/False Counts')
plt.show()

# Number of adult rated movies: summing the boolean mask counts its True entries,
# whereas .count() would return the number of non-null rows regardless of value.
(adult_movies == True).sum()

#### <font color="#FFE5B4">Hence no movies are adult rated.</font>

## Plot of popularity ( numbers )

In [None]:
plt.plot(df['popularity'])

#### So our target is to scale down these to fit on the scale of 100

In [None]:
# Popularity scores, taken straight from the frame's 'popularity' column
pop_list = df['popularity']
# Smallest and largest popularity score.
# NOTE: `min` and `max` shadow the Python builtins from here on; later cells read
# these two names, so they are kept unchanged.
min, max = pop_list.min(), pop_list.max()

In [None]:
min, max

##### In the following cell we scale the numbers present in the `pop_list` on a scale of 0 to 100
##### If [a, b] = [0, 100] and [min(x), max(x)] = [min, max] then the normalized value of x can be computed using formula -
##### $ x_{normalized} = \,{(b - a)}\frac{(x - min(x))}{max(x) - min(x)} + a $

In [None]:
pop_list

In [None]:
# Source range (observed popularity bounds) and target range for the rescaling
r_min, r_max = min, max
t_min, t_max = 0, 100

# Min-max scaling: map every score from [r_min, r_max] onto [t_min, t_max]
source_span = r_max - r_min
target_span = t_max - t_min
pop_list = [(m - r_min) / source_span * target_span + t_min for m in pop_list]

In [None]:
pop_list

#### Put this list to the popularity column

In [None]:
df['popularity'] = pop_list

In [None]:
df.head(2)

## Average vote analysis

In [None]:
import numpy as np
l = df['vote_average'].tolist()
np.min(l), np.max(l)

# Make the variables/functions visible to populate the database using another notebook (populate_database.ipynb)

In [None]:
%store df
%store keywords_column
%store list_movies_casts_info
%store genre_df

# <font color="#5C372C">Conclusion</font>
##### To enhance the accuracy of the ratings, average vote can be put on a scale of 10 and newer votes can be adjusted dynamically using scaling algorithms through the main program which accesses the database.
##### Hence data-driven decision-making has to be prioritized to determine the appropriate scale for average votes, ensuring a rigorous and reliable evaluation process.

# <font color="#5C372C">Final DataFrame description</font>
<table style = "
                text-align: center;
                background-color: #5C372C;
                color: #BE9F96;
">
    <tr>
        <th>adult</th>
        <th>genres</th>
        <th>language</th>
        <th>title</th>
        <th>overview</th>
        <th>popularity</th>
        <th>poster_path</th>
        <th>vote_average</th>
        <th>vote_count</th>
        <th>keywords</th>
        <th>cast</th>
        <th>row_id</th>
        <th>release_year</th>
    </tr>
    <tr>
        <td>Indicates whether the movie is adult rated or not</td>
        <td>Collection of one or more genre types</td>
        <td>Original Language in which the movie was made</td>
        <td>Original Title of the movie</td>
        <td>Overview of the movie</td>
        <td>Popularity Score of the movie on a scale of 0 to 100</td>
        <td>Link to the poster of the movie</td>
        <td>Average Vote given by the viewers who have watched/not watched the movie. Average Vote is observed on a scale of 0 to 10</td>
        <td>Total Vote count by the viewers who have watched/not watched the movie</td>
        <td>The keywords associated with a movie. Viewers can search by keywords. Note that the keywords column is not to be used in textual analysis, i.e. if a user types an intent/choice/sentence describing what they want to watch, the overview column should be analysed instead of the keywords column</td>
        <td>Cast</td>
        <td>Primary Key for the main table (0-indexed AUTO INCREMENT)</td>
        <td>Release year of the movie</td>
    </tr>
</table>