# IMDB Movie Analysis

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error

from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Plot styling: "ticks" axes style with seaborn's short color codes enabled.
sns.set(style="ticks", color_codes=True)

# Load the raw movie metadata and discard the columns that are dropped before
# any encoding happens in this notebook. `axis` is passed by keyword because
# the positional form `drop(label, 1)` is no longer accepted by pandas >= 2.0.
data_frame_base = pd.read_csv("movie_metadata.csv").drop(
    ['plot_keywords', 'genres', 'movie_imdb_link', 'movie_title'],
    axis=1,
)
#data_frame_base.head()

## Imdb Score Limit and Imdb Score Classes For Classification

In [None]:
# Threshold separating the two classes: scores below it become class 0,
# scores at or above it become class 1.
imdb_score_limit = 7.5

# Build the 0/1 label in one vectorized step. Rows whose score is missing keep
# a missing label, and the label column is floating point like the score.
data_frame_base['imdb_score_class'] = (
    data_frame_base['imdb_score']
    .ge(imdb_score_limit)
    .astype(float)
    .where(data_frame_base['imdb_score'].notna())
)


## Preprocessing

In [None]:
data_frame = data_frame_base.copy()

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Encode and standardize every predictor column; the two target columns
# ('imdb_score' and 'imdb_score_class') are left untouched.
#   * numeric/boolean columns: missing values are replaced by 0
#   * all other columns: missing values are replaced by 'UNKNOWN', then each
#     distinct value is mapped to an integer code by LabelEncoder
# Every resulting column is then standardized with StandardScaler.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split further down, so test rows influence the scaling statistics.
for column_name in data_frame.columns:
    if column_name in ('imdb_score', 'imdb_score_class'):
        continue

    column = data_frame[column_name]
    if column.dtype.kind in 'biufc':
        encoded = column.fillna(0).astype(float).values
    else:
        encoded = LabelEncoder().fit_transform(column.fillna('UNKNOWN')).astype(float)

    # Scale once and assign the flat float array straight back to the frame.
    # Direct assignment (instead of Series.update on a column selection)
    # guarantees the frame itself is modified and the column becomes float.
    data_frame[column_name] = StandardScaler().fit_transform(encoded.reshape(-1, 1)).ravel()

        

## Training and Test Sets

In [None]:
# Split each class separately so that both classes contribute 60% of their
# rows to the training set and 40% to the test set.
data_frame_temp = data_frame.copy()
class0 = data_frame_temp[data_frame_temp['imdb_score_class'] == 0]
class1 = data_frame_temp[data_frame_temp['imdb_score_class'] == 1]

# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts.
train0, test0 = train_test_split(class0, test_size=0.4, random_state=42)
train1, test1 = train_test_split(class1, test_size=0.4, random_state=42)

train = pd.concat([train0, train1])
test = pd.concat([test0, test1])

-------------------------------------------------------------------
#### DATA PREPARATION IS FINISHED
#### APPLY ML METHODS
------------------------------------------------------------------------------------

## SVM Classification

In [None]:
# Classification target, and the predictors: every column of the frame except
# the raw score and the class label derived from it (column order preserved).
r_svc_feature = 'imdb_score_class'
features_svc = data_frame.columns[~data_frame.columns.isin(['imdb_score', 'imdb_score_class'])]

# Plain NumPy arrays for scikit-learn: *_A is the feature matrix, *_R the target vector.
train_svc_A, test_svc_A = (np.array(subset[features_svc]) for subset in (train, test))
train_svc_R, test_svc_R = (np.array(subset[r_svc_feature]) for subset in (train, test))

##### Tuning Hyperparameters C and gamma

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.svm import SVC

# Candidate values for the search. Each list currently holds a single value;
# the wider logarithmic ranges are kept in the trailing comments.
C_range = [0.1]  # np.logspace(-1, 1, 3)
gamma_range = [0.1]  # np.logspace(-1, 1, 3)
param_grid = {'gamma': gamma_range, 'C': C_range}

# Five stratified shuffle splits of the training data, 20% held out in each.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv).fit(train_svc_A, train_svc_R)

print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))

In [None]:

from sklearn.svm import SVC

# RBF-kernel classifier with fixed C and gamma, fitted on the training split
# and used to predict the class of every test row.
svc_func = SVC(kernel='rbf', C=14, gamma=0.012).fit(train_svc_A, train_svc_R)
y_svc = svc_func.predict(test_svc_A)

# Text classification report first, then the confusion matrix; the true test
# labels are compared against the predictions in both.
print(classification_report(y_true=test_svc_R, y_pred=y_svc))
print(confusion_matrix(y_true=test_svc_R, y_pred=y_svc))

## SVM Regression

In [None]:
# Regression target (the raw score), and the predictors: every column of the
# frame except the raw score and the derived class label (column order preserved).
r_svr_feature = 'imdb_score'
features_svr = data_frame.columns[~data_frame.columns.isin(['imdb_score', 'imdb_score_class'])]

# Plain NumPy arrays for scikit-learn: *_A is the feature matrix, *_R the target vector.
train_svr_A, test_svr_A = (np.array(subset[features_svr]) for subset in (train, test))
train_svr_R, test_svr_R = (np.array(subset[r_svr_feature]) for subset in (train, test))


##### Fitting and Evaluating SVR with Fixed C, gamma and epsilon

In [None]:

from sklearn.svm import SVR

# Support vector regressor with fixed hyperparameters, fitted on the training
# split and used to predict the score of every test row.
svr_func = SVR(C=150, gamma=0.004, epsilon=1e-4)
svr_func.fit(train_svr_A, train_svr_R)
y_svr = svr_func.predict(test_svr_A)

print(y_svr)
print(test_svr_R)
# Number of test rows whose predicted score reaches the classification
# threshold; uses the shared `imdb_score_limit` instead of repeating 7.5.
print(np.count_nonzero(y_svr >= imdb_score_limit))
print("Explained variance score : " + str(explained_variance_score(test_svr_R, y_svr)))
print("Mean squared error : " + str(mean_squared_error(test_svr_R, y_svr)))

# Fraction of test rows whose prediction misses the true score by more than
# `margin`. `y_svr_margined` holds 1 for a miss and 0 for a hit.
margin = 0.4
y_svr_margined = (np.abs(test_svr_R - y_svr) > margin).astype(int)
print("Prediction is in margin or not error : " + str(y_svr_margined.mean()))


# SVM classification from regression result - it is almost identical with SVM classification
#limit = 7.5
#test_svr_R_limited = [(1 if x >= limit else 0) for x in test_svr_R]
#y_svr_limited = [(1 if x >= limit else 0) for x in y_svr]
#print(classification_report(test_svr_R_limited,y_svr_limited))
#print(confusion_matrix(test_svr_R_limited,y_svr_limited))
#print(accuracy_score(test_svr_R_limited,y_svr_limited))