In [None]:
!gdown --id 1zu97TwzyU2T8OVvQ2VMof-XqHDX_fxPf

Downloading...
From: https://drive.google.com/uc?id=1zu97TwzyU2T8OVvQ2VMof-XqHDX_fxPf
To: /content/data_merged.csv
100% 227M/227M [00:01<00:00, 118MB/s]


In [None]:
import pandas as pd
import numpy as np

from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from sklearn.linear_model import (
    LinearRegression, 
    SGDRegressor, 
    LogisticRegression
)
from sklearn.svm import LinearSVC

# from sklearn.preprocessing import normalize
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Passed as `random_state` to `train_test_split` so the train/test partition
# is the same on every run.
SEED = 42
# Passed as `test_size` to `train_test_split`: fraction of rows held out for scoring.
TEST_SIZE = 0.2

In [None]:
# Load the merged movie table; the path is relative to the working directory.
data = pd.read_csv('./data_merged.csv')
# List the available columns so features and the target can be picked from them.
# NOTE(review): the listing includes an 'Unnamed: 0' column - presumably an
# index that was written into the CSV; confirm before relying on it.
print(data.columns)

Index(['Unnamed: 0', 'adult', 'belongs_to_collection', 'budget', 'genres',
       'id', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'status', 'title', 'video', 'vote_average',
       'vote_count', 'cast', 'crew', 'keywords'],
      dtype='object')


In [None]:
# Feature columns used to predict `vote_average`. After this cell the
# `release_date` column holds the film's age in years, not a date string.
relevant_cols = ['revenue', 'budget', 'runtime', 'popularity', 'release_date']

# Film ages are measured relative to this year.
REFERENCE_YEAR = 2022


def _years_since_release(release_date, reference_year=REFERENCE_YEAR):
    """Return `reference_year` minus the leading 4-character year of `release_date`.

    Missing dates (filled with 0 below) and values whose first four characters
    are not an integer yield 0 instead of a bogus age such as 2022, so the
    downstream `process` filter (which drops rows with any feature <= 0)
    removes them.
    """
    try:
        year = int(str(release_date)[:4])
    except ValueError:
        return 0
    if year <= 0:
        return 0
    return reference_year - year


data = data.fillna(0)

# Convert only while the column is still non-numeric: once it holds integer
# ages, re-running this cell must not turn an age of 27 into 2022 - 27.
if not pd.api.types.is_numeric_dtype(data['release_date']):
    data['release_date'] = data['release_date'].apply(_years_since_release)

In [None]:
# Feature matrix: one row per film, columns in `relevant_cols` order.
X = data.loc[:, relevant_cols].to_numpy()
# Target vector: each film's `vote_average`, copied out of the frame.
y = data['vote_average'].to_numpy(copy=True)

In [None]:
# Sanity check: features and targets should have the same number of rows.
print(len(X), len(y))
print(X.shape, y.shape)
# Peek at the first sample: feature values in `relevant_cols` order, then its target.
print(X[0], y[0])

46628 46628
(46628, 5) (46628,)
[3.73554033e+08 3.00000000e+07 8.10000000e+01 2.19469430e+01
 2.70000000e+01] 7.7


In [None]:
def process(X, y):
    """Drop every sample that has at least one feature value <= 0.

    `X` and `y` are walked in lockstep. A sample is kept only when none of
    its feature values compares `<= 0`.

    Returns a `(features, targets)` tuple of two Python lists holding the
    surviving rows (the original row objects, not copies) and their targets.
    """
    survivors = [
        (row, target)
        for row, target in zip(X, y)
        if not any(value <= 0 for value in row)
    ]

    kept_rows = [row for row, _ in survivors]
    kept_targets = [target for _, target in survivors]
    return kept_rows, kept_targets

# Drop every sample that has a feature value <= 0.
# NOTE: `process` returns Python lists, so X and y are no longer NumPy arrays
# after this line.
X, y = process(X, y)
# Report how many samples survived the filter.
print(len(X), len(y))

5446 5446


In [None]:
def regression(X, y, choice='lr'):
    """Fit a standardized linear regressor and print its held-out score.

    The data is split with `TEST_SIZE` / `SEED`, features are standardized
    inside the pipeline (so the scaler is fit on the training split only),
    and the pipeline's `score` on the test split is printed. For these
    regressors that score is the coefficient of determination (R^2).

    Parameters
    ----------
    X : sequence of feature rows
    y : sequence of numeric targets, aligned with `X`
    choice : {'lr', 'sgd'}, default 'lr'
        'lr' selects `LinearRegression`, 'sgd' selects `SGDRegressor`.

    Returns
    -------
    None

    Raises
    ------
    NotImplementedError
        If `choice` is not one of the supported names.
    """
    # Resolve the estimator first so an unsupported choice fails before any
    # splitting or fitting work is done.
    if choice == 'lr':
        regressor = LinearRegression()
    elif choice == 'sgd':
        # SGD shuffles the training data between epochs; seeding it makes the
        # reported score reproducible from run to run.
        regressor = SGDRegressor(max_iter=2000, tol=1e-3, random_state=SEED)
    else:
        raise NotImplementedError(f"unsupported regressor choice: {choice!r}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=SEED
    )

    model = make_pipeline(StandardScaler(), regressor)
    model.fit(X_train, y_train)
    print(f"{choice} score: {model.score(X_test, y_test)}")
    return

In [None]:
# Score `LinearRegression` and `SGDRegressor` on the same features and targets;
# each call prints its own test-split score.
regression(X, y, 'lr')
regression(X, y, 'sgd')

lr score: 0.19881318571349116
sgd score: 0.19652343917416404


In [None]:
def classification(X, y, choice='lr'):
    """Fit a standardized linear classifier and print its held-out score.

    Targets are truncated to integers with `int()` (e.g. 7.7 -> 7) so that
    each integer value becomes one class label. The data is then split with
    `TEST_SIZE` / `SEED`, features are standardized inside the pipeline (so
    the scaler is fit on the training split only), and the pipeline's `score`
    on the test split is printed. For these classifiers that score is the
    mean accuracy.

    Parameters
    ----------
    X : sequence of feature rows
    y : sequence of numeric targets, aligned with `X`
    choice : {'lr', 'svc'}, default 'lr'
        'lr' selects `LogisticRegression`, 'svc' selects `LinearSVC`.

    Returns
    -------
    None

    Raises
    ------
    NotImplementedError
        If `choice` is not one of the supported names.
    """
    # Resolve the estimator first so an unsupported choice fails before any
    # conversion, splitting or fitting work is done.
    if choice == 'lr':
        classifier = LogisticRegression(max_iter=5000)
    elif choice == 'svc':
        # LinearSVC can shuffle the data while optimizing (dual solver);
        # seeding it keeps the reported score reproducible.
        classifier = LinearSVC(max_iter=5000, random_state=SEED)
    else:
        raise NotImplementedError(f"unsupported classifier choice: {choice!r}")

    # Integer part of each target is used as its class label.
    labels = [int(value) for value in y]

    X_train, X_test, y_train, y_test = train_test_split(
        X, labels, test_size=TEST_SIZE, random_state=SEED
    )

    model = make_pipeline(StandardScaler(), classifier)
    model.fit(X_train, y_train)
    print(f"{choice} score: {model.score(X_test, y_test)}")
    return

In [None]:
# Score `LogisticRegression` and `LinearSVC` on the same features, with the
# targets truncated to integer class labels inside `classification`; each call
# prints its own test-split score.
classification(X, y, 'lr')
classification(X, y, 'svc')

lr score: 0.4908256880733945
svc score: 0.44954128440366975


