In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
from sklearn.tree import DecisionTreeClassifier,export_text,plot_tree
from sklearn import metrics
from yellowbrick.classifier import ROCAUC
from yellowbrick.model_selection import ValidationCurve
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the IMDB dataset
df = pd.read_csv('/Users/jarkrunglerdkriangkrai/ISE/Year 4/Semester 2/Big Data/new_github/big_data_ai/imdp.csv',encoding='latin-1')

# Display the first five rows of the dataset
print(df.head())


       director_name  num_critic_for_reviews  duration  \
0      James Cameron                   723.0     178.0   
1     Gore Verbinski                   302.0     169.0   
2         Sam Mendes                   602.0     148.0   
3  Christopher Nolan                   813.0     164.0   
4        Doug Walker                     NaN       NaN   

   director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                      0.0                   855.0  Joel David Moore   
1                    563.0                  1000.0     Orlando Bloom   
2                      0.0                   161.0      Rory Kinnear   
3                  22000.0                 23000.0    Christian Bale   
4                    131.0                     NaN        Rob Walker   

   actor_1_facebook_likes        gross                           genres  \
0                  1000.0  760505847.0  Action|Adventure|Fantasy|Sci-Fi   
1                 40000.0  309404152.0         Action|Adventure|Fant

In [3]:
# Display the information about the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   director_name              4939 non-null   object 
 1   num_critic_for_reviews     4993 non-null   float64
 2   duration                   5028 non-null   float64
 3   director_facebook_likes    4939 non-null   float64
 4   actor_3_facebook_likes     5020 non-null   float64
 5   actor_2_name               5030 non-null   object 
 6   actor_1_facebook_likes     5036 non-null   float64
 7   gross                      4159 non-null   float64
 8   genres                     5043 non-null   object 
 9   actor_1_name               5036 non-null   object 
 10  movie_title                5043 non-null   object 
 11  num_voted_users            5043 non-null   int64  
 12  cast_total_facebook_likes  5043 non-null   int64  
 13  actor_3_name               5020 non-null   objec

In [4]:
# Display the statistics of the dataset
print(df.describe())

       num_critic_for_reviews     duration  director_facebook_likes  \
count             4993.000000  5028.000000              4939.000000   
mean               140.194272   107.201074               686.509212   
std                121.601675    25.197441              2813.328607   
min                  1.000000     7.000000                 0.000000   
25%                 50.000000    93.000000                 7.000000   
50%                110.000000   103.000000                49.000000   
75%                195.000000   118.000000               194.500000   
max                813.000000   511.000000             23000.000000   

       actor_3_facebook_likes  actor_1_facebook_likes         gross  \
count             5020.000000             5036.000000  4.159000e+03   
mean               645.009761             6560.047061  4.846841e+07   
std               1665.041728            15020.759120  6.845299e+07   
min                  0.000000                0.000000  1.620000e+02   
25%  

In [5]:
# Display the value counts of the 'imdb_score' column
print(df['imdb_score'].value_counts())

6.7    223
6.6    201
7.2    195
6.5    186
6.4    185
      ... 
9.5      1
1.6      1
9.3      1
1.7      1
9.2      1
Name: imdb_score, Length: 78, dtype: int64


In [6]:
# Check for missing values
print(df.isna().sum())

director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64


In [7]:
print(df)

          director_name  num_critic_for_reviews  duration  \
0         James Cameron                   723.0     178.0   
1        Gore Verbinski                   302.0     169.0   
2            Sam Mendes                   602.0     148.0   
3     Christopher Nolan                   813.0     164.0   
4           Doug Walker                     NaN       NaN   
...                 ...                     ...       ...   
5038        Scott Smith                     1.0      87.0   
5039                NaN                    43.0      43.0   
5040   Benjamin Roberds                    13.0      76.0   
5041        Daniel Hsia                    14.0     100.0   
5042           Jon Gunn                    43.0      90.0   

      director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                         0.0                   855.0  Joel David Moore   
1                       563.0                  1000.0     Orlando Bloom   
2                         0.0             

# Preprocess

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [9]:
# discretize target variable into high or low
df['imdb_score'] = df['imdb_score'].apply(lambda score: 'high' if score >= 7 else 'low')

In [10]:
print(df)

          director_name  num_critic_for_reviews  duration  \
0         James Cameron                   723.0     178.0   
1        Gore Verbinski                   302.0     169.0   
2            Sam Mendes                   602.0     148.0   
3     Christopher Nolan                   813.0     164.0   
4           Doug Walker                     NaN       NaN   
...                 ...                     ...       ...   
5038        Scott Smith                     1.0      87.0   
5039                NaN                    43.0      43.0   
5040   Benjamin Roberds                    13.0      76.0   
5041        Daniel Hsia                    14.0     100.0   
5042           Jon Gunn                    43.0      90.0   

      director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                         0.0                   855.0  Joel David Moore   
1                       563.0                  1000.0     Orlando Bloom   
2                         0.0             

In [11]:
df['imdb_score']

0       high
1       high
2        low
3       high
4       high
        ... 
5038    high
5039    high
5040     low
5041     low
5042     low
Name: imdb_score, Length: 5043, dtype: object

In [12]:
df = df.dropna()

In [13]:
# Split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [14]:
# Select the relevant features for modeling
features = ['num_critic_for_reviews', 'duration', 'director_facebook_likes', 'actor_1_facebook_likes', 'gross', 'num_voted_users', 'cast_total_facebook_likes', 'facenumber_in_poster', 'num_user_for_reviews', 'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score', 'movie_facebook_likes']
train_X = train_df[features]
train_y = train_df['imdb_score']
test_X = test_df[features]
test_y = test_df['imdb_score']

In [16]:
# Fill in missing values in the train dataset
#imputer = SimpleImputer(strategy='median')
#train_X = imputer.fit_transform(train_X)

In [17]:
# Scale the features in the train dataset
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)

# Only numerical features are used, no further preprocessing needed for Decision Tree

ValueError: could not convert string to float: 'low'

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [18]:
print(train_y)

463     6.9
3688    5.0
1270    6.5
2118    7.8
594     6.9
       ... 
4426    5.7
466     6.3
3092    6.9
3772    7.2
860     6.4
Name: imdb_score, Length: 4034, dtype: float64


In [17]:
print(train_X)

[[ 1.54776598  0.4328296  -0.19553661 ...  3.29548144  0.41104037
   0.70447659]
 [-0.62777004 -0.44313412 -0.1885808  ... -0.26384451 -1.29550857
  -0.35157134]
 [ 0.60224456 -0.00515226 -0.22006502 ... -0.36872608  0.05176691
  -0.40220188]
 ...
 [-0.73654684  0.39301306 -0.08277917 ...  2.31067797  0.41104037
  -0.40220188]
 [ 0.45999797  0.59209573 -0.23031569 ... -0.36429447  0.68049546
  -0.40220188]
 [-0.76164918 -0.12460186 -0.2383698  ...  2.31067797 -0.03805146
   0.04046951]]


In [15]:
# Train the decision tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(train_X, train_y)

ValueError: Unknown label type: 'continuous'

In [None]:
# Predict the test dataset using the trained model
test_X = imputer.transform(test_X)
test_X = scaler.transform(test_X)
test_y_pred = dt_model.predict(test_X)

In [None]:
# Evaluate the performance of the model
accuracy = accuracy_score(test_y, test_y_pred)
print(f"Accuracy of the decision tree model: {accuracy}")