In [29]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn import neighbors
from sklearn import tree
from sklearn import neural_network
from sklearn.model_selection import train_test_split

In [30]:
#function for cleaning data
def clean_Data(dataframe):
    #dropping all nan values
    dataframe = dataframe.dropna() 
    
    #using label encoder for encoding feature values
    le = preprocessing.LabelEncoder()
    for column in dataframe.columns:
        if column != "imdb_score":
            dataframe.loc[:, column] = le.fit_transform(dataframe.loc[:, column])
    return dataframe

In [31]:
#function for splitting data into features and label
def get_XY_data(dataframe):
    
    featureColumns = [col for col in dataframe.columns if col != "imdb_score"]
    X = dataframe[featureColumns]
    y = dataframe["imdb_score"]

    Xdata = X.values
    ydata = y.values

    return (Xdata, ydata)

In [32]:
#function for training and evaluating model
def train_and_eval(model):
    clf = model
    #training model
    clf.fit(train_x, train_y)
    
    #testing model
    pred_y = clf.predict(test_x)
    return (mean_absolute_error(test_y, pred_y), r2_score(test_y, pred_y))

In [33]:
#cleaning data and dividing it into training and testing set
data = pd.read_csv("movie_metadata.csv")
clean_data = clean_Data(data)
x,y = get_XY_data(clean_data)
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.20, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


# Trying different models

In [34]:
#SVM regression
model = svm.SVR()
print(train_and_eval(model))

(0.77868140146132991, 0.040570338283270502)


In [35]:
#Linear Bayesian Ridge model
model = linear_model.BayesianRidge()
print(train_and_eval(model))

(0.58387495055161054, 0.46116021130273654)


In [36]:
#Linear regression
model = linear_model.LinearRegression()
print(train_and_eval(model))

(0.58249083520025347, 0.46402425290944749)


In [37]:
#Passive agressive regression
model = linear_model.PassiveAggressiveRegressor()
print(train_and_eval(model))

(1.1790752316663555, -0.9345909256966467)


In [38]:
#K-nearest neighbours
model = neighbors.KNeighborsRegressor(weights="distance")
print(train_and_eval(model))

(0.68592858549239655, 0.2358459852473993)


In [39]:
#decision tree
model = tree.DecisionTreeRegressor()
print(train_and_eval(model))

(0.71210106382978722, 0.070965222873965339)


In [40]:
#Multi layer perceptron
model = neural_network.MLPRegressor(hidden_layer_sizes = (5,3), activation = "tanh", solver = "lbfgs")
print(train_and_eval(model))

(0.79246861548380487, 0.044635320231522502)


In [41]:
#Huber regression
model = linear_model.HuberRegressor()
print(train_and_eval(model))

(0.77952104861096705, 0.078208898986384168)
