In [13]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Render every float with thousands separators and two decimal places
pd.set_option("display.float_format", "{:,.2f}".format)

# Path of the cleaned English-movies CSV (relative to the current working directory)
movies_data_to_load = Path("unique_english_movies_clean.csv")

# Parse the CSV into a DataFrame and display it as the cell output
movies_df = pd.read_csv(filepath_or_buffer=movies_data_to_load)
movies_df

Unnamed: 0,Name,Year,Score,Genre,Language,Budget,Revenue,Profit,Adventure,Comedy,...,History,Drama,War,Animation,Horror,Mystery,Romance,Science Fiction,Music,Thriller
0,The Passion of the Christ,2004-02-25,74.00,Drama,English,25000000.00,622313635.00,597313635.00,0,0,...,0,1,0,0,0,0,0,0,0,0
1,John Wick: Chapter 2,2017-05-18,73.00,"Action,Thriller,Crime",English,40000000.00,171539887.00,131539887.00,0,0,...,0,0,0,0,0,0,0,0,0,1
2,John Wick: Chapter 3 - Parabellum,2019-05-16,74.00,"Action,Thriller,Crime",English,55000000.00,326709727.00,271709727.00,0,0,...,0,0,0,0,0,0,0,0,0,1
3,A Bronx Tale,1994-05-20,78.00,"Drama,Crime",English,10000000.00,17287898.00,7287898.00,0,0,...,0,1,0,0,0,0,0,0,0,0
4,Avatar,2009-12-17,76.00,"Action,Adventure,Fantasy,Science Fiction",English,237000000.00,2923706026.00,2686706026.00,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4873,The Love Guru,2008-07-10,42.00,"Comedy,Romance",English,62000000.00,40159017.00,-21840983.00,0,1,...,0,0,0,0,0,0,1,0,0,0
4874,20th Century Women,2016-12-28,73.00,Drama,English,7000000.00,9353729.00,2353729.00,0,0,...,0,1,0,0,0,0,0,0,0,0
4875,Delta Force 2: The Colombian Connection,1990-08-24,54.00,Action,English,9145817.80,6698361.00,-2447456.80,0,0,...,0,0,0,0,0,0,0,0,0,0
4876,The Russia House,1990-12-21,61.00,"Drama,Thriller,Romance",English,21800000.00,22997992.00,1197992.00,0,0,...,0,1,0,0,0,0,1,0,0,1


In [14]:
# Grab every column label of the movies frame (not only the genre columns)
# so the available headers can be inspected
column_headers = movies_df.columns
print(column_headers)

# Cell output: how many columns the frame has
column_headers.size

Index(['Name', 'Year', 'Score', 'Genre', 'Language', 'Budget', 'Revenue',
       'Profit', 'Adventure', 'Comedy', 'Action', 'Documentary', 'Fantasy',
       'Western', 'Family', 'Crime', 'TV Movie', 'History', 'Drama', 'War',
       'Animation', 'Horror', 'Mystery', 'Romance', 'Science Fiction', 'Music',
       'Thriller'],
      dtype='object')


27

In [15]:
# Build Revenue DataFrame: the Revenue column followed by one indicator column per genre.
# .copy() makes this an independent frame rather than a selection of movies_df, so the
# later replacement of indicator values cannot trigger pandas' SettingWithCopyWarning
# and there is no ambiguity about which frame is being modified.
revenue_df = movies_df[["Revenue",
       "Action", "Romance", "Science Fiction", "Thriller", "Family",
       "Crime", "Documentary", "Drama", "Mystery", "TV Movie", "Music",
       "Adventure", "War", "Animation", "Western", "Comedy", "Horror",
       "Fantasy", "History"]].copy()
revenue_df

Unnamed: 0,Revenue,Action,Romance,Science Fiction,Thriller,Family,Crime,Documentary,Drama,Mystery,TV Movie,Music,Adventure,War,Animation,Western,Comedy,Horror,Fantasy,History
0,622313635.00,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,171539887.00,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,326709727.00,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,17287898.00,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
4,2923706026.00,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4873,40159017.00,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4874,9353729.00,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4875,6698361.00,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4876,22997992.00,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [18]:
# Columns inspected for the indicator value 1 (list kept exactly as originally
# defined, so it still contains "Revenue" itself)
search_columns = ["Revenue",
       "Action", "Romance", "Science Fiction", "Thriller", "Family",
       "Crime", "Documentary", "Drama", "Mystery", "TV Movie", "Music",
       "Adventure", "War", "Animation", "Western", "Comedy", "Horror",
       "Fantasy", "History"]

# Column whose value replaces every indicator equal to 1
replace_column = "Revenue"

# Only the genre indicators need rewriting: replacing a Revenue of 1 with itself
# would be a no-op, so the replacement column is skipped.
genre_columns = [column for column in search_columns if column != replace_column]

# Vectorized replacement: wherever a genre cell equals 1, substitute that row's
# revenue (axis=0 aligns the revenue Series on the row index); every other cell is
# left as it was. Compared with an iterrows/.at double loop this
#   * runs in pandas instead of ~rows x columns Python-level scalar writes,
#   * never writes a float into an integer column one cell at a time
#     (per-cell upcasting is deprecated in recent pandas), and
#   * works on a fresh copy, so a slice of movies_df is never mutated.
genre_flags = revenue_df[genre_columns]
revenue_df = revenue_df.copy()
revenue_df[genre_columns] = genre_flags.mask(
    genre_flags.eq(1), revenue_df[replace_column], axis=0
)

# Write the matrix of revenue of English movies into CSV file (row index omitted)
output_file_path_6 = "revenue_matrix_english_movies.csv"
revenue_df.to_csv(output_file_path_6, index=False)
print(f"The the matrix of revenue of English movies saved to {output_file_path_6}")

The the matrix of revenue of English movies saved to revenue_matrix_english_movies.csv


In [17]:
# Total revenue per column of the revenue matrix: one sum for the Revenue column
# itself and one for each genre column, collected in column order into a list
column_sums = [revenue_df[column_name].sum() for column_name in revenue_df.columns]

# Single-row DataFrame of the totals, labelled with the matrix's own column names
revenue_sums = pd.DataFrame(data=[column_sums], columns=revenue_df.columns)
revenue_sums

Unnamed: 0,Revenue,Action,Romance,Science Fiction,Thriller,Family,Crime,Documentary,Drama,Mystery,TV Movie,Music,Adventure,War,Animation,Western,Comedy,Horror,Fantasy,History
0,1027709498609.2,352390207788.6,107332427775.8,193727057845.6,227185422531.0,272238932847.6,86507930874.0,38971176297.2,271188074504.0,92163799510.8,59791097252.6,31769450329.2,332297886893.0,22871382632.8,238144569901.8,7923841589.6,316231554170.8,130648907378.4,201182988488.4,33351318399.8


In [19]:
# Build Profit DataFrame: the Profit column followed by one indicator column per genre.
# .copy() makes this an independent frame rather than a selection of movies_df, so the
# later replacement of indicator values cannot trigger pandas' SettingWithCopyWarning
# and there is no ambiguity about which frame is being modified.
profit_df = movies_df[["Profit",
       "Action", "Romance", "Science Fiction", "Thriller", "Family",
       "Crime", "Documentary", "Drama", "Mystery", "TV Movie", "Music",
       "Adventure", "War", "Animation", "Western", "Comedy", "Horror",
       "Fantasy", "History"]].copy()
profit_df

Unnamed: 0,Profit,Action,Romance,Science Fiction,Thriller,Family,Crime,Documentary,Drama,Mystery,TV Movie,Music,Adventure,War,Animation,Western,Comedy,Horror,Fantasy,History
0,597313635.00,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,131539887.00,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,271709727.00,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,7287898.00,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
4,2686706026.00,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4873,-21840983.00,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4874,2353729.00,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4875,-2447456.80,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4876,1197992.00,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [20]:
# Columns inspected for the indicator value 1 (list kept exactly as originally
# defined, so it still contains "Profit" itself)
search_columns_profit = ["Profit",
       "Action", "Romance", "Science Fiction", "Thriller", "Family",
       "Crime", "Documentary", "Drama", "Mystery", "TV Movie", "Music",
       "Adventure", "War", "Animation", "Western", "Comedy", "Horror",
       "Fantasy", "History"]

# Column whose value replaces every indicator equal to 1
replace_column_profit = "Profit"

# Only the genre indicators need rewriting: replacing a Profit of 1 with itself
# would be a no-op, so the replacement column is skipped.
genre_columns_profit = [
    column for column in search_columns_profit if column != replace_column_profit
]

# Vectorized replacement: wherever a genre cell equals 1, substitute that row's
# profit (axis=0 aligns the profit Series on the row index); every other cell is
# left as it was. Compared with an iterrows/.at double loop this
#   * runs in pandas instead of ~rows x columns Python-level scalar writes,
#   * never writes a float into an integer column one cell at a time
#     (per-cell upcasting is deprecated in recent pandas), and
#   * works on a fresh copy, so a slice of movies_df is never mutated.
genre_flags_profit = profit_df[genre_columns_profit]
profit_df = profit_df.copy()
profit_df[genre_columns_profit] = genre_flags_profit.mask(
    genre_flags_profit.eq(1), profit_df[replace_column_profit], axis=0
)

# Write the matrix of profit of English movies into CSV file (row index omitted)
output_file_path_7 = "profit_matrix_english_movies.csv"
profit_df.to_csv(output_file_path_7, index=False)
print(f"The the matrix of profit of English movies saved to {output_file_path_7}")

The the matrix of profit of English movies saved to profit_matrix_english_movies.csv


In [21]:
# Total profit per column of the profit matrix: one sum for the Profit column
# itself and one for each genre column, collected in column order into a list
column_sums_profit = [profit_df[column_name].sum() for column_name in profit_df.columns]

# Single-row DataFrame of the profit totals, labelled with the matrix's own column names
profit_sums = pd.DataFrame(data=[column_sums_profit], columns=profit_df.columns)
profit_sums

Unnamed: 0,Profit,Action,Romance,Science Fiction,Thriller,Family,Crime,Documentary,Drama,Mystery,TV Movie,Music,Adventure,War,Animation,Western,Comedy,Horror,Fantasy,History
0,749477046812.4,247752190148.6,78083046311.8,140959545289.0,159218454179.4,201950133875.4,57855195374.0,31867980142.4,192296677597.0,67064684190.0,47416457074.6,24313748293.0,239833562116.0,15018472575.6,181011112135.0,5054203346.0,230141560287.2,98647229073.4,147158507379.8,22608247578.8
