# **Data Understanding & Data Preparation**

In [None]:
import pandas as pd

Untuk menganalisis faktor-faktor yang memengaruhi kesuksesan box office film anime, langkah pertama yang harus dilakukan adalah mengumpulkan data. Data yang digunakan berasal dari Kaggle dan telah saya unduh sebelumnya, yaitu sebagai berikut:

In [None]:
# Read the anime dataset (downloaded from Kaggle) into a DataFrame.
DATA_PATH = "/content/drive/MyDrive/AVD MINPRO 1/top_1000_animes.csv"
df = pd.read_csv(DATA_PATH)

# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,anime_id,anime_name,genres,type,number_of_episodes,rating,total_views,production_cost,total_box_office
0,16498,Attack on Titan,"Action, Drama, Fantasy, Mystery",TV,25,84,778095,200000,30.8
1,101922,Demon Slayer,"Action, Adventure, Drama, Fantasy, Supernatural",TV,26,83,735876,250000,504.5
2,1535,DEATH NOTE,"Mystery, Psychological, Supernatural, Thriller",TV,37,84,708493,100000,30.0
3,113415,Jujutsu Kaisen,"Action, Drama, Supernatural",TV,24,85,677899,250000,196.0
4,21459,My Hero Academia,"Action, Adventure, Comedy",TV,13,77,672551,200000,55.0
...,...,...,...,...,...,...,...,...,...
995,97663,Knights & Magic,"Action, Fantasy, Mecha",TV,13,67,52265,150000,33.0
996,136707,Isekai Yakkyoku,Fantasy,TV,12,72,52175,250000,35.0
997,129192,Tensei Kenja no Isekai Life: Daini no Shokugyo...,"Action, Adventure, Comedy, Fantasy",TV,12,61,52116,300000,39.0
998,127271,Ryuu to Sobakasu no Hime,"Drama, Music, Mystery, Sci-Fi",MOVIE,1,73,52055,300000,56.0


In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   anime_id              1000 non-null   int64  
 1   anime_name            1000 non-null   object 
 2   genres                1000 non-null   object 
 3   type                  1000 non-null   object 
 4   number_of_episodes    1000 non-null   int64  
 5   rating                1000 non-null   int64  
 6   total_views           1000 non-null   int64  
 7   production_cost       1000 non-null   int64  
 8   total_box_office      1000 non-null   float64
 9   production_cost_jpy   1000 non-null   int64  
 10  total_box_office_jpy  1000 non-null   float64
dtypes: float64(2), int64(6), object(3)
memory usage: 86.1+ KB


## **Data Cleaning**

Setelah data terkumpul, data tersebut perlu dibersihkan melalui proses data cleaning, yaitu proses menghapus atau memodifikasi data yang tidak lengkap, duplikat, tidak akurat, atau salah format. Data-data tersebut dihapus atau dimodifikasi untuk memastikan bahwa data yang diolah berkualitas sehingga dapat menghasilkan keputusan yang lebih akurat.

### Melihat Missing Values

Melakukan pengecekan missing values per kolom dalam bentuk persentase, guna mendukung pengambilan keputusan terhadap nilai yang kosong.

In [None]:
# Percentage of missing values per column: the mean of the boolean NaN mask
# is the missing fraction, scaled here to 0-100.
print(df.isna().mean() * 100)

anime_id              0.0
anime_name            0.0
genres                0.0
type                  0.0
number_of_episodes    0.0
rating                0.0
total_views           0.0
production_cost       0.0
total_box_office      0.0
dtype: float64


In [None]:
# Absolute number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
anime_id,0
anime_name,0
genres,0
type,0
number_of_episodes,0
rating,0
total_views,0
production_cost,0
total_box_office,0
production_cost_jpy,0


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,anime_id,number_of_episodes,rating,total_views,production_cost,total_box_office
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,63084.119,19.926,74.667,136345.578,201400.0,35.3753
std,54315.227932,57.600941,7.371733,106097.155245,68649.492007,40.570578
min,1.0,1.0,45.0,52006.0,100000.0,7.0
25%,14501.5,12.0,70.0,69314.0,150000.0,20.0
50%,21693.0,12.0,75.0,97868.5,200000.0,33.0
75%,110880.0,13.0,80.0,158302.25,250000.0,46.0
max,166873.0,1110.0,91.0,778095.0,300000.0,1001.0


 **Duplicated Values**

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

 **Outliers**

In [None]:
def outlier_percentages(frame, whisker=1.5):
    """Return the percentage of IQR outliers for every numeric column.

    A value counts as an outlier when it lies below ``Q1 - whisker * IQR`` or
    above ``Q3 + whisker * IQR`` of its own column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to inspect; non-numeric columns are ignored.
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range to build the fences.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column, a single column 'Persentase Outliers',
        and the column axis named 'Kolom'.
    """
    # "number" selects every numeric dtype, not only int64/float64.
    numeric = frame.select_dtypes(include="number")

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1

    # The per-column fences (Series indexed by column name) are aligned on the
    # columns of `numeric`; comparisons against NaN are False, so missing
    # values are never counted as outliers.
    outside = numeric.lt(q1 - whisker * iqr) | numeric.gt(q3 + whisker * iqr)

    # Series division yields NaN instead of raising when the frame is empty.
    percent = outside.sum().div(len(frame)).mul(100)
    return percent.to_frame('Persentase Outliers').rename_axis('Kolom', axis=1)


# Build the summary table for the current frame.
results_df = outlier_percentages(df)

# Show the table
display(results_df)

Kolom,Persentase Outliers
anime_id,0.0
number_of_episodes,41.5
rating,0.8
total_views,8.4
production_cost,0.0
total_box_office,0.0


In [None]:
# Columns whose extreme values are capped (clipped) to the IQR fences.
columns_to_impute = ["production_cost", "total_box_office"]

for col in columns_to_impute:
    # Fetch both quartiles in a single call.
    quartiles = df[col].quantile([0.25, 0.75])
    Q1, Q3 = quartiles.loc[0.25], quartiles.loc[0.75]
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Values outside [lower_bound, upper_bound] are replaced by the nearest
    # fence; values inside the range are kept as they are.
    capped = df[col].clip(lower=lower_bound, upper=upper_bound)

    # Assign through .loc[] so that no SettingWithCopyWarning is raised.
    df.loc[:, col] = capped


In [None]:
# Recompute the share of IQR outliers per numeric column.
numeric_names = df.select_dtypes(include=['float64', 'int64']).columns


def _percent_outside_fences(values):
    """Percentage of rows of `df` whose `values` fall outside the 1.5*IQR fences."""
    first_q = values.quantile(0.25)
    third_q = values.quantile(0.75)
    spread = third_q - first_q
    flagged = (values < first_q - 1.5*spread) | (values > third_q + 1.5*spread)
    return (int(flagged.sum())/len(df))*100


# One record per numeric column
results = [
    {'Kolom': name, 'Persentase Outliers': _percent_outside_fences(df[name])}
    for name in numeric_names
]

# Column names become the index; the axis label 'Kolom' moves to the columns
results_df = (
    pd.DataFrame(results)
    .set_index('Kolom')
    .rename_axis(None, axis=0)
    .rename_axis('Kolom', axis=1)
)

# Show the table
display(results_df)

Kolom,Persentase Outliers
anime_id,0.0
number_of_episodes,41.5
rating,0.8
total_views,8.4
production_cost,0.0
total_box_office,0.0


**Inconsistent Values**

In [None]:
# Summary statistics again; implausible min/max values here would point to inconsistent entries.
df.describe()

Unnamed: 0,anime_id,number_of_episodes,rating,total_views,production_cost,total_box_office
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,63084.119,19.926,74.667,136345.578,201400.0,35.3753
std,54315.227932,57.600941,7.371733,106097.155245,68649.492007,40.570578
min,1.0,1.0,45.0,52006.0,100000.0,7.0
25%,14501.5,12.0,70.0,69314.0,150000.0,20.0
50%,21693.0,12.0,75.0,97868.5,200000.0,33.0
75%,110880.0,13.0,80.0,158302.25,250000.0,46.0
max,166873.0,1110.0,91.0,778095.0,300000.0,1001.0


## **Construct Data**

In [None]:
# New feature: (box office - production cost) / production cost.
# NOTE(review): in the displayed data total_box_office is in the tens while
# production_cost is in the hundreds of thousands, so every value comes out
# at roughly -0.9998. The two columns look like they use different units;
# confirm the units and rescale before interpreting this feature.
df["profit_margin"] = (
    df["total_box_office"]
    .sub(df["production_cost"])
    .div(df["production_cost"])
)

# Render the frame including the new column.
df

Unnamed: 0,anime_id,anime_name,genres,type,number_of_episodes,rating,total_views,production_cost,total_box_office,profit_margin
0,16498,Attack on Titan,"Action, Drama, Fantasy, Mystery",TV,25,84,778095,200000.0,30.8,-0.999846
1,101922,Demon Slayer,"Action, Adventure, Drama, Fantasy, Supernatural",TV,26,83,735876,250000.0,85.0,-0.999660
2,1535,DEATH NOTE,"Mystery, Psychological, Supernatural, Thriller",TV,37,84,708493,100000.0,30.0,-0.999700
3,113415,Jujutsu Kaisen,"Action, Drama, Supernatural",TV,24,85,677899,250000.0,85.0,-0.999660
4,21459,My Hero Academia,"Action, Adventure, Comedy",TV,13,77,672551,200000.0,55.0,-0.999725
...,...,...,...,...,...,...,...,...,...,...
995,97663,Knights & Magic,"Action, Fantasy, Mecha",TV,13,67,52265,150000.0,33.0,-0.999780
996,136707,Isekai Yakkyoku,Fantasy,TV,12,72,52175,250000.0,35.0,-0.999860
997,129192,Tensei Kenja no Isekai Life: Daini no Shokugyo...,"Action, Adventure, Comedy, Fantasy",TV,12,61,52116,300000.0,39.0,-0.999870
998,127271,Ryuu to Sobakasu no Hime,"Drama, Music, Mystery, Sci-Fi",MOVIE,1,73,52055,300000.0,56.0,-0.999813


## **Data Reduction**

In [None]:
# Remove the identifier column. The membership check keeps the cell from
# raising KeyError when it is re-run after the column has already been dropped.
if 'anime_id' not in df.columns:
    print("Column 'anime_id' not found in the DataFrame.")
else:
    df.drop(columns=["anime_id"], inplace=True)

# Render the reduced frame.
df

Column 'anime_id' not found in the DataFrame.


Unnamed: 0,anime_name,genres,type,number_of_episodes,rating,total_views,production_cost,total_box_office,profit_margin
0,Attack on Titan,"Action, Drama, Fantasy, Mystery",TV,25,84,778095,200000.0,30.8,-0.999846
1,Demon Slayer,"Action, Adventure, Drama, Fantasy, Supernatural",TV,26,83,735876,250000.0,85.0,-0.999660
2,DEATH NOTE,"Mystery, Psychological, Supernatural, Thriller",TV,37,84,708493,100000.0,30.0,-0.999700
3,Jujutsu Kaisen,"Action, Drama, Supernatural",TV,24,85,677899,250000.0,85.0,-0.999660
4,My Hero Academia,"Action, Adventure, Comedy",TV,13,77,672551,200000.0,55.0,-0.999725
...,...,...,...,...,...,...,...,...,...
995,Knights & Magic,"Action, Fantasy, Mecha",TV,13,67,52265,150000.0,33.0,-0.999780
996,Isekai Yakkyoku,Fantasy,TV,12,72,52175,250000.0,35.0,-0.999860
997,Tensei Kenja no Isekai Life: Daini no Shokugyo...,"Action, Adventure, Comedy, Fantasy",TV,12,61,52116,300000.0,39.0,-0.999870
998,Ryuu to Sobakasu no Hime,"Drama, Music, Mystery, Sci-Fi",MOVIE,1,73,52055,300000.0,56.0,-0.999813


In [None]:
# Summary statistics of the numeric columns after the cleaning, construction and reduction steps.
df.describe()

Unnamed: 0,number_of_episodes,rating,total_views,production_cost,total_box_office,profit_margin
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,19.926,74.667,136345.578,201400.0,33.2625,-0.999812
std,57.600941,7.371733,106097.155245,68649.492007,15.974517,0.000117
min,1.0,45.0,52006.0,100000.0,7.0,-0.999977
25%,12.0,70.0,69314.0,150000.0,20.0,-0.9999
50%,12.0,75.0,97868.5,200000.0,33.0,-0.999836
75%,13.0,80.0,158302.25,250000.0,46.0,-0.99975
max,1110.0,91.0,778095.0,300000.0,85.0,-0.99941
