In [91]:
import numpy as np
import csv
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder


#reads in the data
data = pd.read_csv('movie_metadata.csv')

#removes the values that aren't numeric
to_drop = ['director_name', 'num_critic_for_reviews', 'actor_2_name','actor_1_name', 'movie_title', 'num_voted_users'
           , 'actor_3_name', 'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews']

#makes the new data set without the to_drop colums
features_list = data.columns.difference(to_drop)
movie_data = data[features_list]
#print(np.sum(movie_num.isnull()))
#print(movie_data.content_rating.unique())
pd.options.mode.chained_assignment = None 
print(movie_data.columns)   
print('Done')

Index(['actor_1_facebook_likes', 'actor_2_facebook_likes',
       'actor_3_facebook_likes', 'aspect_ratio', 'budget',
       'cast_total_facebook_likes', 'color', 'content_rating', 'country',
       'director_facebook_likes', 'duration', 'facenumber_in_poster', 'genres',
       'gross', 'imdb_score', 'language', 'movie_facebook_likes',
       'title_year'],
      dtype='object')
Done


# Changing Genres into individual Columns

In [92]:
# make a set with all unique genres

genres = []

for string in movie_data['genres']:
    genre = string.split('|')
    genres = genres+genre
    
genres_set = set(genres)
print(genres_set)

{'Sci-Fi', 'History', 'News', 'Crime', 'Family', 'Short', 'Reality-TV', 'Fantasy', 'Romance', 'Animation', 'Documentary', 'Film-Noir', 'Adventure', 'Musical', 'Sport', 'Western', 'Drama', 'Mystery', 'Thriller', 'Biography', 'Horror', 'Comedy', 'Action', 'War', 'Game-Show', 'Music'}


In [93]:
genres_dict = dict()
for genre in genres_set:
    genres_dict[genre] = []
    
for string in movie_data.genres:
    genres = string.split('|')
    for genre in genres_set:
        if genre in genres:
            genres_dict[genre] = genres_dict[genre]+[1]
        else:
            genres_dict[genre] = genres_dict[genre]+[0]

del movie_data['genres']
#print(genres_dict['Short'])

for genre in genres_set:
    series = pd.Series(genres_dict[genre])
    movie_data[genre] = series

#print(movie_data['Short'].values)    
print('Done')

Done


# Deleting NaN's

In [94]:
NA_THRESH = 4

#deletes the row when there are more or equal to the threshod number of NaN's
def remove_too_many_NaN(data, threshold):
    print("Length before deletion: {}".format(len(data)))

    remove_indices = []
    for index, nNaN in data.isnull().sum(axis=1).iteritems():
        if nNaN >= threshold:
            remove_indices.append(index)
    
    # drop movies with too many NaNs
    data = data.drop(data.index[remove_indices])
    print("Length after deletion: {}".format(len(data)))
    
    return data

#returns length
movie_data = remove_too_many_NaN(movie_data, NA_THRESH)

Length before deletion: 5043
Length after deletion: 4900


# Changing Color to Numeric

In [95]:
#replaces the color value to 1
movie_data.color = movie_data.color.replace(to_replace = 'Color', value = 1)
movie_data.color = movie_data.color.replace(to_replace = 'NaN', value = 1)

#replaces the NaN or black and white values to 0
for item in movie_data.color:
    if item != 1:
        movie_data.color = movie_data.color.replace(to_replace = item, value = 0)
        
#movie_data.color = new_color_column 
#print(movie_data['color'])

#makes sure that there is nog error where it shouldn't be
pd.options.mode.chained_assignment= None
print('Done')

Done


# Changing Country to Numeric

In [96]:
#replaces the USA values to 1
movie_data.country = movie_data.country.replace(to_replace ='USA', value = 1)
movie_data.country = movie_data.country.replace(to_replace ='NaN', value = 1)
#replaces the NaN or non USA values to 0
for item in movie_data.country:
    if item != 1:
        movie_data.country = movie_data.country.replace(to_replace = item, value = 0)
       
#print(movie_data['country'])

#makes sure that there is nog error where it shouldn't be
pd.options.mode.chained_assignment= None

print('Done')

Done


# Changing Languange to Numeric

In [97]:
#replaces the English values to 1
movie_data.language = movie_data.language.replace(to_replace = 'English', value = 1)
movie_data.language = movie_data.language.replace(to_replace = 'NaN', value = 1)

#replaces the other values to 0
for item in movie_data.language:
    if item != 1:
        movie_data.language = movie_data.language.replace(to_replace = item, value = 0)
        
#print(movie_data['language'])

#makes sure that there is nog error where it shouldn't be
pd.options.mode.chained_assignment= None
print('Done')

Done


# Chaning Content_Rating to Numeric

In [98]:
def content_to_numerical(data):
    data = data.replace(to_replace='G', value=0)
    data = data.replace(to_replace='PG', value=12)
    data = data.replace(to_replace='PG-13', value=13)
    data = data.replace(to_replace='R', value=17)
    data = data.replace(to_replace='NC-17', value=17)
    
    data = data.replace(to_replace='TV-PG', value=12)
    data = data.replace(to_replace='TV-MA', value=17)
    data = data.replace(to_replace='TV-G', value=0)
    data = data.replace(to_replace='TV-Y', value=0)
    data = data.replace(to_replace='TV-Y7', value=7)
    data = data.replace(to_replace='TV-14', value=14)
    
    data = data.replace(to_replace='Not Rated', value=0)
    data = data.replace(to_replace='Unrated', value=0)
    data = data.replace(to_replace='Approved', value=0)
    data = data.replace(to_replace='Passed', value=0)
    
    data = data.replace(to_replace='X', value=17)
    data = data.replace(to_replace='M', value=17)
    data = data.replace(to_replace='GP', value=12)
    
    return data

movie_data = content_to_numerical(movie_data)

# Replacing NaNs with averages

In [99]:
def replace_NaNs(col):
    # compute average
    avg = np.sum(col) / (len(col) - np.sum(col.isnull()))
    print(avg)
    
    # replace NaNs with average
    col = col.fillna(value=avg)
    return col

pd.options.mode.chained_assignment= None

to_replace_NaNs = ['actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes', 
               'aspect_ratio', 'budget', 'cast_total_facebook_likes', 'director_facebook_likes','gross', 'title_year'
                   , 'duration', 'facenumber_in_poster', 'content_rating']
for column in to_replace_NaNs:
    movie_data[column] = replace_NaNs(movie_data[column])


6648.62979592
1684.1759902
653.779595175
2.1284344634
39892924.8304
9849.74408163
692.662510221
48514997.6517
2002.39901881
108.243568804
1.36272504092
13.6882617062


# Delete columns with too little 1's

In [100]:
for genre in genres_set:
    print(genre, ':')
    counter = 0
    for value in movie_data[genre]:
        if value == 1.0:
            counter+=1
    print('aantal samples:' ,counter)
    
weghalen = ['Game-Show', 'News', 'Reality-TV', 'Short', 'Film-Noir']

for name in weghalen:
    del movie_data[name]
    
#print(movie_data)

Sci-Fi :
aantal samples: 596
History :
aantal samples: 203
News :
aantal samples: 3
Crime :
aantal samples: 858
Family :
aantal samples: 532
Short :
aantal samples: 3
Reality-TV :
aantal samples: 0
Fantasy :
aantal samples: 593
Romance :
aantal samples: 1088
Animation :
aantal samples: 233
Documentary :
aantal samples: 111
Film-Noir :
aantal samples: 6
Adventure :
aantal samples: 910
Musical :
aantal samples: 132
Sport :
aantal samples: 181
Western :
aantal samples: 97
Drama :
aantal samples: 2509
Mystery :
aantal samples: 475
Thriller :
aantal samples: 1375
Biography :
aantal samples: 292
Horror :
aantal samples: 550
Comedy :
aantal samples: 1824
Action :
aantal samples: 1131
War :
aantal samples: 208
Game-Show :
aantal samples: 0
Music :
aantal samples: 213


# Trainset en testset

In [101]:
ratings = movie_data['imdb_score'].values
del movie_data['imdb_score']

X = movie_data.values
X_std = StandardScaler().fit_transform(X)

number_of_samples = len(ratings)
np.random.seed(0)
random_indices = np.random.permutation(number_of_samples)
num_training_samples = round(number_of_samples*0.6)
num_validation_samples = round(number_of_samples*0.2) + num_training_samples

movie_training = X_std[random_indices[:num_training_samples]]
ratings_training = ratings[random_indices[:num_training_samples]]
training_indices = random_indices[:num_training_samples]

movie_validation = X_std[random_indices[num_training_samples:num_validation_samples]]
ratings_validation = ratings[random_indices[num_training_samples:num_validation_samples]]
validation_indices = random_indices[num_training_samples:num_validation_samples]

movie_test = X_std[random_indices[num_validation_samples:]]
ratings_test = ratings[random_indices[num_validation_samples:]]
test_indices = random_indices[num_validation_samples:]

ratings_training = list(ratings_training)

print('aantal training samples: ', len(ratings_training))
print('aantal validation samples: ', len(ratings_validation))
print('aantal test samples: ', len(ratings_test))

aantal training samples:  2940
aantal validation samples:  980
aantal test samples:  980


# Leren

In [102]:
def relative_error(y_predict, y):
    
    error = 0

    for i in range(len(y)):
        error += (abs(y_predict[i]-y[i]))/y[i]
    training_error = error/len(y)*100
    
    return training_error

### neural networks

In [103]:
from sklearn.neural_network import MLPRegressor

neural = MLPRegressor(hidden_layer_sizes =(100), activation = 'logistic', solver = 'adam', max_iter = 1000)

neural.fit(movie_training, ratings_training)
y_neural_train = neural.predict(movie_training)
y_neural_test = neural.predict(movie_test)

training_error = relative_error(y_neural_train, ratings_training)

print("Train error = "+'{}'.format(training_error) + " percent"+" in neural network algorithm")

test_error = relative_error(y_neural_test,ratings_test)

print("Test error = "'{}'.format(test_error)+" percent"+" in neural network algorithm")

Train error = 10.411694919423319 percent in neural network algorithm
Test error = 10.934604009383447 percent in neural network algorithm


### k-nearest neighbors

In [104]:
neighbors = KNeighborsRegressor(n_neighbors = 15)
neighbors.fit(movie_training, ratings_training)
y_neighbors_train = neighbors.predict(movie_training)

train_error = relative_error(y_neighbors_train,ratings_training)

print("Train error = "+'{}'.format(training_error)+" percent"+" in knn algorithm")

y_neighbors_test = neighbors.predict(movie_test)

test_error = relative_error(y_neighbors_test, ratings_test)

print("Test error = "'{}'.format(test_error)+" percent"+" in knn algorithm")

Train error = 10.411694919423319 percent in knn algorithm
Test error = 12.253240376043191 percent in knn algorithm


### Support vector machine

In [105]:
from sklearn.svm import SVR, NuSVR, LinearSVR

svr = SVR()
svr.fit(movie_training, ratings_training)
y_svr_train = svr.predict(movie_training)

train_error = relative_error(y_svr_train, ratings_training)

print ("Train error = "+'{}'.format(train_error))

y_svr_test = svr.predict(movie_test)

test_error = relative_error(y_svr_test, ratings_test)

print ("Test error = "+'{}'.format(test_error))

Train error = 9.150128853631939
Test error = 10.858612788391122


### Decision trees

In [106]:
from sklearn.tree import DecisionTreeRegressor

trees = DecisionTreeRegressor()
trees.fit(movie_training, ratings_training)
y_trees_train = trees.predict(movie_training)

test_error = relative_error(y_trees_train, ratings_training)

print("Train error = "+'{}'.format(training_error)+" percent"+" in decision trees")

y_trees_test = trees.predict(movie_test)

test_error = relative_error(y_trees_test, ratings_test)

print("Test error = "'{}'.format(test_error)+" percent"+" in decision trees")

Train error = 10.411694919423319 percent in decision trees
Test error = 14.311636958798108 percent in decision trees


### Random Forest

In [107]:
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor

def random_forest(movie_training, ratings_training, movie_test, ratings_test):
    
    random_forest = RandomForestRegressor(n_estimators = 20)
    random_forest.fit(movie_training, ratings_training)
    y_forest_train = random_forest.predict(movie_training)

    test_error = relative_error(y_forest_train, ratings_training)

    #print("Train error = "+'{}'.format(training_error)+" percent"+" in random forest")

    y_forest_test = random_forest.predict(movie_test)

    test_error = relative_error(y_forest_test, ratings_test)

    #print("Test error = "'{}'.format(test_error)+" percent"+" in random forest")
    
    return y_forest_test

y_forest_test = random_forest(movie_training, ratings_training, movie_test, ratings_test)

### Extremely randomized trees

In [108]:
def extra_trees(movie_training, ratings_training, movie_test, ratings_test):

    extra = ExtraTreesRegressor(n_estimators = 20)
    extra.fit(movie_training, ratings_training)
    y_extra_train = extra.predict(movie_training)

    test_error = relative_error(y_extra_train, ratings_training)

    #print("Train error = "+'{}'.format(training_error)+" percent"+" in random forest")

    y_extra_test = extra.predict(movie_test)

    test_error = relative_error(y_extra_test, ratings_test)

    #print("Test error = "'{}'.format(test_error)+" percent"+" in random forest")
    
    return y_extra_test

y_extra_test = extra_trees(movie_training, ratings_training, movie_test, ratings_test)

### Bagging regressor

In [109]:
#regressor  = BaggingRegressor(MLPRegressor(max_iter = 300))
bagging = BaggingRegressor(RandomForestRegressor())
bagging.fit(movie_training, ratings_training)
y_bagging_train = bagging.predict(movie_training)

test_error = relative_error(y_bagging_train ,ratings_training)

print("Train error = "+'{}'.format(training_error)+" percent"+" in random forest")

y_bagging_test = bagging.predict(movie_test)

test_error = relative_error(y_bagging_test, ratings_test)

print("Test error = "'{}'.format(test_error)+" percent"+" in random forest")

Train error = 10.411694919423319 percent in random forest
Test error = 10.425263913340322 percent in random forest


# Validation

### neural networks

In [110]:
# neural_1 = MLPRegressor(hidden_layer_sizes =(100), activation = 'logistic', solver = 'adam', max_iter = 1000)
# neural_2 = MLPRegressor(hidden_layer_sizes =(50), activation = 'logistic', solver = 'adam', max_iter = 1000)
# neural_3 = MLPRegressor(hidden_layer_sizes =(10), activation = 'logistic', solver = 'adam', max_iter = 1000)
# neural_4 = MLPRegressor(hidden_layer_sizes =(200), activation = 'logistic', solver = 'adam', max_iter = 1000)

# neural_1.fit(movie_training, ratings_training)
# neural_2.fit(movie_training, ratings_training)
# neural_3.fit(movie_training, ratings_training)
# neural_4.fit(movie_training, ratings_training)

# y_1_neural = neural_1.predict(movie_validation)
# y_2_neural = neural_2.predict(movie_validation)
# y_3_neural = neural_3.predict(movie_validation)
# y_4_neural = neural_4.predict(movie_validation)

# error_1_neural = relative_error(y_1_neural, ratings_validation)
# error_2_neural = relative_error(y_2_neural, ratings_validation)
# error_3_neural = relative_error(y_3_neural, ratings_validation)
# error_4_neural = relative_error(y_4_neural, ratings_validation)

# print(error_1_neural)
# print(error_2_neural)
# print(error_3_neural)
# print(error_4_neural)

### Support vector machines

In [111]:
# de parameters die kunnen worden aangepast 

# svr_1 = SVR(C = 0.0001)
# svr_2 = SVR(C = 0.01)
# svr_3 = SVR(C = 1.0)
# svr_4 = SVR(C = 100)
# svr_5 = SVR(C = 10000)

# svr_1 = SVR(epsilon = 0.001)
# svr_2 = SVR(epsilon = 0.01)
# svr_3 = SVR(epsilon = 0.1)
# svr_4 = SVR(epsilon = 10)
# svr_5 = SVR(epsilon = 100)

# svr_1 = SVR(kernel = 'rbf')
# svr_2 = SVR(kernel = 'linear')
# svr_3 = SVR(kernel = 'poly')
# svr_4 = SVR()
# svr_5 = SVR()

# svr_1.fit(movie_training, ratings_training)
# svr_2.fit(movie_training, ratings_training)
# svr_3.fit(movie_training, ratings_training)
# svr_4.fit(movie_training, ratings_training)
# svr_5.fit(movie_training, ratings_training)

# y_1_svr = svr_1.predict(movie_validation)
# y_2_svr = svr_2.predict(movie_validation)
# y_3_svr = svr_3.predict(movie_validation)
# y_4_svr = svr_4.predict(movie_validation)
# y_5_svr = svr_5.predict(movie_validation)

# error_1_svr = relative_error(y_1_svr, ratings_validation)
# error_2_svr = relative_error(y_2_svr, ratings_validation)
# error_3_svr = relative_error(y_3_svr, ratings_validation)
# error_4_svr = relative_error(y_4_svr, ratings_validation)
# error_5_svr = relative_error(y_5_svr, ratings_validation)

# print(error_1_svr)
# print(error_2_svr)
# print(error_3_svr)
# print(error_4_svr)
# print(error_5_svr)

### Random forest

In [112]:
# forest_1 = RandomForestRegressor(n_estimators = 5)
# forest_2 = RandomForestRegressor(n_estimators = 10)
# forest_3 = RandomForestRegressor(n_estimators = 20)
# forest_4 = RandomForestRegressor(n_estimators = 40)
# forest_5 = RandomForestRegressor(n_estimators = 80)

# forest_1.fit(movie_training, ratings_training)
# forest_2.fit(movie_training, ratings_training)
# forest_3.fit(movie_training, ratings_training)
# forest_4.fit(movie_training, ratings_training)
# forest_5.fit(movie_training, ratings_training)

# y_1_forest = forest_1.predict(movie_validation)
# y_2_forest = forest_2.predict(movie_validation)
# y_3_forest = forest_3.predict(movie_validation)
# y_4_forest = forest_4.predict(movie_validation)
# y_5_forest = forest_5.predict(movie_validation)

# error_1_forest = relative_error(y_1_forest, ratings_validation)
# error_2_forest = relative_error(y_2_forest, ratings_validation)
# error_3_forest = relative_error(y_3_forest, ratings_validation)
# error_4_forest = relative_error(y_4_forest, ratings_validation)
# error_5_forest = relative_error(y_5_forest, ratings_validation)

# print(error_1_forest)
# print(error_2_forest)
# print(error_3_forest)
# print(error_4_forest)
# print(error_5_forest)

### Extremely randomized trees

In [113]:
# extra_1 = ExtraTreesRegressor(n_estimators = 5)
# extra_2 = ExtraTreesRegressor(n_estimators = 10)
# extra_3 = ExtraTreesRegressor(n_estimators = 20)
# extra_4 = ExtraTreesRegressor(n_estimators = 40)
# extra_5 = ExtraTreesRegressor(n_estimators = 80)

# # extra_1 = ExtraTreesRegressor(n_estimators = 20, max_features = 'auto')
# # extra_2 = ExtraTreesRegressor(n_estimators = 20, max_features = 'sqrt')
# # extra_3 = ExtraTreesRegressor(n_estimators = 20, max_features = 'log2')
# # extra_4 = ExtraTreesRegressor()
# # extra_5 = ExtraTreesRegressor()

# extra_1.fit(movie_training, ratings_training)
# extra_2.fit(movie_training, ratings_training)
# extra_3.fit(movie_training, ratings_training)
# extra_4.fit(movie_training, ratings_training)
# extra_5.fit(movie_training, ratings_training)

# y_1_extra = extra_1.predict(movie_validation)
# y_2_extra = extra_2.predict(movie_validation)
# y_3_extra = extra_3.predict(movie_validation)
# y_4_extra = extra_4.predict(movie_validation)
# y_5_extra = extra_5.predict(movie_validation)

# error_1_extra = relative_error(y_1_extra, ratings_validation)
# error_2_extra = relative_error(y_2_extra, ratings_validation)
# error_3_extra = relative_error(y_3_extra, ratings_validation)
# error_4_extra = relative_error(y_4_extra, ratings_validation)
# error_5_extra = relative_error(y_5_extra, ratings_validation)

# print(error_1_extra)
# print(error_2_extra)
# print(error_3_extra)
# print(error_4_extra)
# print(error_5_extra)

### k-nearest neighbor

In [114]:
# neighbor_1 = KNeighborsRegressor(n_neighbors = 1)
# neighbor_2 = KNeighborsRegressor(n_neighbors = 5)
# neighbor_3 = KNeighborsRegressor(n_neighbors = 10)
# neighbor_4 = KNeighborsRegressor(n_neighbors = 20)
# neighbor_5 = KNeighborsRegressor(n_neighbors = 40)

# neighbor_1.fit(movie_training, ratings_training)
# neighbor_2.fit(movie_training, ratings_training)
# neighbor_3.fit(movie_training, ratings_training)
# neighbor_4.fit(movie_training, ratings_training)
# neighbor_5.fit(movie_training, ratings_training)

# y_1_neighbor = neighbor_1.predict(movie_validation)
# y_2_neighbor = neighbor_2.predict(movie_validation)
# y_3_neighbor = neighbor_3.predict(movie_validation)
# y_4_neighbor = neighbor_4.predict(movie_validation)
# y_5_neighbor = neighbor_5.predict(movie_validation)

# error_1_neighbor = relative_error(y_1_neighbor, ratings_validation)
# error_2_neighbor = relative_error(y_2_neighbor, ratings_validation)
# error_3_neighbor = relative_error(y_3_neighbor, ratings_validation)
# error_4_neighbor = relative_error(y_4_neighbor, ratings_validation)
# error_5_neighbor = relative_error(y_5_neighbor, ratings_validation)

# print(error_1_neighbor)
# print(error_2_neighbor)
# print(error_3_neighbor)
# print(error_4_neighbor)
# print(error_5_neighbor)

# Ensemble

### Neural networks and svr and random forest

In [116]:
error_ensembles = []

for i in range(100):
    
    y_forest_ens = random_forest(movie_training, ratings_training, movie_test, ratings_test)
    y_extra_ens = extra_trees(movie_training, ratings_training, movie_test, ratings_test)
    prediction = [sum(x)/3 for x in zip(y_svr_test, y_forest_ens, y_extra_ens)]
    error_ensemble = relative_error(prediction, ratings_test)
    
    error_ensembles.append(error_ensemble)

In [117]:
    
error_ensembles = np.array(error_ensembles)
    
mu = np.sum(error_ensembles)/len(error_ensembles)
sigma = (np.square(np.sum(error_ensembles-mu)))/len(error_ensembles)

print(mu,sigma)

# Error analyse

In [1]:
counter = 0
uitschieters = []

for i in range(len(prediction)):
    if abs(prediction[i]-ratings_test[i])>1.0:
        counter+= 1
        index = i
        uitschieters.append(i)
        #print(prediction[i], ratings_test[i])    

print('fractie die meer dan twee punten verschilt = ', counter/980)
indices = test_indices[uitschieters]
#print(data.iloc[indices]['budget'].describe())
print('fractie die NaN heeft bij gross: ', data.iloc[indices]['gross'].isnull().sum()/len(uitschieters))

NameError: name 'prediction' is not defined