# Total Gross Weights

In [1]:
# Load 'imdb_adjusted_total.csv' (header row skipped) and separate the feature
# columns from the label stored in the final column.
import numpy as np
DATA = np.loadtxt('imdb_adjusted_total.csv', skiprows=1, delimiter=',')
X, y = DATA[:, :-1], DATA[:, -1]  # X: every column but the last; y: the last column
X.shape

(4286, 8)

In [2]:
# Hold out 20% of the rows as a test set; random_state=0 keeps the split repeatable.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.20,
)

In [3]:
# Fit a linear regression on the training split and report its score on the held-out split.

from sklearn.linear_model import LinearRegression

regr = LinearRegression()
regr.fit(X_train, y_train)  # learns an intercept plus one coefficient per feature column
print('Regression score on test data:', regr.score(X_test, y_test), sep='\t')

Regression score on test data:	0.7907332445649808


In [4]:
from sklearn.feature_selection import f_regression
# f_regression returns a pair (F-statistics, p-values); only the p-values are printed,
# one per feature column.
f_scores, p_values = f_regression(X_train, y_train)
print(p_values)

[9.74616850e-054 6.44955662e-016 3.05271517e-030 5.18762700e-036
 0.00000000e+000 6.08041766e-001 1.12231235e-009 1.12002445e-307]


Most significant predictors (smallest p-values): director, followed by year, censor, and runtime. The genre features are less significant.

# Predictors for Gross

In [5]:
# Load 'imdb_adjusted_total_discrete.csv' (header row skipped) and separate the
# feature columns from the label stored in the final column.
import numpy as np
DATA = np.loadtxt('imdb_adjusted_total_discrete.csv', skiprows=1, delimiter=',')
X, y = DATA[:, :-1], DATA[:, -1]  # X: every column but the last; y: the last column
X.shape

(4286, 9)

In [6]:
from sklearn.model_selection import train_test_split
# 80/20 train/test split; random_state=0 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.20,
)

In [7]:
from sklearn import linear_model  # Perceptron and LogisticRegression classifiers
from sklearn import ensemble  # RandomForest classifier
from sklearn import neighbors  # k-nearest-neighbours classifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Candidate C and gamma values for the SVM grid search below.
# C trades margin width against training error (larger C tolerates fewer errors);
# gamma sets how far a single training point's influence reaches (larger gamma means
# more local influence, which can overfit). Good values depend on the dataset.
c = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
gamma = [0.000001, 0.00001, 0.0001, 1.0, 10.0, 100.0, 1000.0]

# Classifiers to compare. RandomForest and the neural network are seeded like the
# Perceptron and logistic regression so that re-running this cell reproduces the scores.
learners = {'Perceptron': linear_model.Perceptron(max_iter=10, random_state=0),
            'RandomForest': ensemble.RandomForestClassifier(random_state=0),
            'kNN': neighbors.KNeighborsClassifier(metric="manhattan"),
            # C is the inverse regularization strength; regularization limits overfitting.
            # NOTE(review): the saved output shows "TOTAL NO. of ITERATIONS REACHED LIMIT"
            # warnings for this model, which is trained on unscaled features; consider
            # scaling its input or raising max_iter.
            'logistic': linear_model.LogisticRegression(C=10, random_state=0),
            # Only the key is used: the loop below replaces this estimator with a grid
            # of SVC(C=..., gamma=...) instances.
            'SVM': SVC(),
            'NeuralNetwork': MLPClassifier(random_state=0)
           }

# Standardized copies of the features, used for the neural network, kNN, and SVM.
# fit_transform fits the scaler and transforms in one step (no separate fit needed).
# NOTE(review): the scaler is fitted on the whole training split before
# cross-validation, so every CV fold is scaled with statistics that include its own
# validation rows; wrapping scaler + model in a Pipeline would avoid that.
scaler = StandardScaler()
Xtrainscaled = scaler.fit_transform(X_train)
Xtestscaled = scaler.transform(X_test)

# Learners (other than the SVM grid) that are trained on the standardized features.
scaled_learners = {'NeuralNetwork', 'kNN'}

for classifierName in learners:
    if classifierName == 'SVM':
        # Grid search over every (C, gamma) pair. cross_val_score clones and fits the
        # estimator itself, so no separate fit is needed for these throwaway models.
        # NOTE(review): max_iter=40 caps the solver and may stop it before convergence.
        for val in c:
            for value in gamma:
                svm = SVC(C=val, gamma=value, max_iter=40)
                print('Accuracy of ' + classifierName + ':\t' + str(np.mean(cross_val_score(svm, Xtrainscaled, y_train))) + ' with C: ' + str(val) + ' and gamma: ' + str(value))
        # In the output saved with this notebook, the highest mean CV accuracy
        # (about 0.9974) was at C = 10000.0 and gamma = 0.0001.
        continue

    features = Xtrainscaled if classifierName in scaled_learners else X_train
    # Keep a fitted copy in `learners`; the printed score is the mean
    # cross-validation accuracy on the training split.
    learners[classifierName].fit(features, y_train)
    print('Accuracy of ' + classifierName + ':\t' + str(np.mean(cross_val_score(learners[classifierName], features, y_train))))
        




Accuracy of Perceptron:	0.9959166648932773
Accuracy of RandomForest:	1.0
Accuracy of kNN:	0.9970832712647102


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP:

Accuracy of logistic:	0.9994169096209913
Accuracy of SVM:	0.9892068694005236 with C: 1e-05 and gamma: 1e-06
Accuracy of SVM:	0.9892068694005236 with C: 1e-05 and gamma: 1e-05
Accuracy of SVM:	0.9892068694005236 with C: 1e-05 and gamma: 0.0001
Accuracy of SVM:	0.9892068694005236 with C: 1e-05 and gamma: 1.0
Accuracy of SVM:	0.9892068694005236 with C: 1e-05 and gamma: 10.0
Accuracy of SVM:	0.9892068694005236 with C: 1e-05 and gamma: 100.0
Accuracy of SVM:	0.9892068694005236 with C: 1e-05 and gamma: 1000.0
Accuracy of SVM:	0.9892068694005236 with C: 0.0001 and gamma: 1e-06
Accuracy of SVM:	0.9892068694005236 with C: 0.0001 and gamma: 1e-05
Accuracy of SVM:	0.9892068694005236 with C: 0.0001 and gamma: 0.0001
Accuracy of SVM:	0.9892068694005236 with C: 0.0001 and gamma: 1.0
Accuracy of SVM:	0.9892068694005236 with C: 0.0001 and gamma: 10.0
Accuracy of SVM:	0.9892068694005236 with C: 0.0001 and gamma: 100.0
Accuracy of SVM:	0.9892068694005236 with C: 0.0001 and gamma: 1000.0
Accuracy of SVM:



Accuracy of SVM:	0.9892068694005236 with C: 0.001 and gamma: 10.0
Accuracy of SVM:	0.9892068694005236 with C: 0.001 and gamma: 100.0




Accuracy of SVM:	0.9892068694005236 with C: 0.001 and gamma: 1000.0
Accuracy of SVM:	0.9892068694005236 with C: 0.01 and gamma: 1e-06
Accuracy of SVM:	0.9892068694005236 with C: 0.01 and gamma: 1e-05
Accuracy of SVM:	0.9892068694005236 with C: 0.01 and gamma: 0.0001




Accuracy of SVM:	0.9892068694005236 with C: 0.01 and gamma: 1.0
Accuracy of SVM:	0.9892068694005236 with C: 0.01 and gamma: 10.0




Accuracy of SVM:	0.9892068694005236 with C: 0.01 and gamma: 100.0
Accuracy of SVM:	0.9892068694005236 with C: 0.01 and gamma: 1000.0
Accuracy of SVM:	0.9892068694005236 with C: 0.1 and gamma: 1e-06
Accuracy of SVM:	0.9892068694005236 with C: 0.1 and gamma: 1e-05
Accuracy of SVM:	0.9892068694005236 with C: 0.1 and gamma: 0.0001
Accuracy of SVM:	0.9892068694005236 with C: 0.1 and gamma: 1.0




Accuracy of SVM:	0.9892068694005236 with C: 0.1 and gamma: 10.0
Accuracy of SVM:	0.9892068694005236 with C: 0.1 and gamma: 100.0




Accuracy of SVM:	0.9892068694005236 with C: 0.1 and gamma: 1000.0
Accuracy of SVM:	0.9892068694005236 with C: 1.0 and gamma: 1e-06
Accuracy of SVM:	0.9892068694005236 with C: 1.0 and gamma: 1e-05




Accuracy of SVM:	0.9892068694005236 with C: 1.0 and gamma: 0.0001
Accuracy of SVM:	0.9892068694005236 with C: 1.0 and gamma: 1.0
Accuracy of SVM:	0.9892068694005236 with C: 1.0 and gamma: 10.0




Accuracy of SVM:	0.9892068694005236 with C: 1.0 and gamma: 100.0
Accuracy of SVM:	0.9892068694005236 with C: 1.0 and gamma: 1000.0




Accuracy of SVM:	0.9892068694005236 with C: 10.0 and gamma: 1e-06
Accuracy of SVM:	0.9892068694005236 with C: 10.0 and gamma: 1e-05
Accuracy of SVM:	0.9897903853929476 with C: 10.0 and gamma: 0.0001




Accuracy of SVM:	0.989498414590028 with C: 10.0 and gamma: 1.0
Accuracy of SVM:	0.9892068694005236 with C: 10.0 and gamma: 10.0
Accuracy of SVM:	0.9892068694005236 with C: 10.0 and gamma: 100.0




Accuracy of SVM:	0.9892068694005236 with C: 10.0 and gamma: 1000.0
Accuracy of SVM:	0.9892068694005236 with C: 100.0 and gamma: 1e-06
Accuracy of SVM:	0.9897903853929476 with C: 100.0 and gamma: 1e-05
Accuracy of SVM:	0.9967917260752059 with C: 100.0 and gamma: 0.0001




Accuracy of SVM:	0.989498414590028 with C: 100.0 and gamma: 1.0
Accuracy of SVM:	0.9892068694005236 with C: 100.0 and gamma: 10.0
Accuracy of SVM:	0.9892068694005236 with C: 100.0 and gamma: 100.0




Accuracy of SVM:	0.9892068694005236 with C: 100.0 and gamma: 1000.0
Accuracy of SVM:	0.9897903853929476 with C: 1000.0 and gamma: 1e-06
Accuracy of SVM:	0.9967917260752059 with C: 1000.0 and gamma: 1e-05
Accuracy of SVM:	0.9970832712647102 with C: 1000.0 and gamma: 0.0001




Accuracy of SVM:	0.989498414590028 with C: 1000.0 and gamma: 1.0
Accuracy of SVM:	0.9892068694005236 with C: 1000.0 and gamma: 10.0
Accuracy of SVM:	0.9892068694005236 with C: 1000.0 and gamma: 100.0




Accuracy of SVM:	0.9892068694005236 with C: 1000.0 and gamma: 1000.0
Accuracy of SVM:	0.9967917260752059 with C: 10000.0 and gamma: 1e-06
Accuracy of SVM:	0.9970836968781256 with C: 10000.0 and gamma: 1e-05
Accuracy of SVM:	0.9973748164542146 with C: 10000.0 and gamma: 0.0001
Accuracy of SVM:	0.989498414590028 with C: 10000.0 and gamma: 1.0




Accuracy of SVM:	0.9892068694005236 with C: 10000.0 and gamma: 10.0
Accuracy of SVM:	0.9892068694005236 with C: 10000.0 and gamma: 100.0




Accuracy of SVM:	0.9892068694005236 with C: 10000.0 and gamma: 1000.0
Accuracy of NeuralNetwork:	0.9964997552722862


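Accuracy of Perceptron: 99.5% <br>
Accuracy of Random Forest: 100% <br>
Accuracy of kNN: 99.7% <br>
Accuracy of Logistic Regression: 99.9% <br>
Accuracy of SVM: 98.9% for most grid settings; the best setting in the output above (C = 10000, gamma = 0.0001) reached 99.7% <br>
Accuracy of Neural Network: 99.6% <br>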

# Rating Weights

In [8]:
# Load 'imdb_rating.csv' (header row skipped) and separate the feature columns
# from the label stored in the final column.
import numpy as np
DATAratings = np.loadtxt('imdb_rating.csv', skiprows=1, delimiter=',')
X, y = DATAratings[:, :-1], DATAratings[:, -1]  # X: every column but the last; y: the last column
X.shape

(4286, 8)

In [9]:
# Hold out 20% of the rows as a test set; random_state=0 keeps the split repeatable.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.20,
)

In [10]:
# Fit a linear regression on the training split and report its score on both splits.

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler  # not used in this cell; later cells reference StandardScaler

regr = LinearRegression()
regr.fit(X_train, y_train)  # learns an intercept plus one coefficient per feature column
print('Regression score on test data:', regr.score(X_test, y_test), sep='\t')
print('Regression score on train data:', regr.score(X_train, y_train), sep='\t')

Regression score on test data:	0.18194071179630933
Regression score on train data:	0.1954631716171369


In [11]:
from sklearn.feature_selection import f_regression
# f_regression returns a pair (F-statistics, p-values); only the p-values are printed,
# one per feature column.
f_scores, p_values = f_regression(X_train, y_train)
print(p_values)

[9.38011257e-031 6.44955662e-016 2.74777639e-128 5.73831865e-001
 2.48242635e-004 6.39211253e-001 4.36410854e-005 5.16205302e-001]


Most significant predictor (smallest p-value): runtime

# Predictors for Rating

In [12]:
# Load the discretized rating data (header row skipped) and separate the feature
# columns from the label stored in the final column.
import numpy as np
# NOTE(review): 'disrete' looks like a misspelling of 'discrete' (compare
# 'imdb_adjusted_total_discrete.csv'); confirm the name of the file on disk.
DATA = np.loadtxt('imdb_rating_disrete.csv', skiprows=1, delimiter=',')
X, y = DATA[:, :-1], DATA[:, -1]  # X: every column but the last; y: the last column


In [13]:
from sklearn.model_selection import train_test_split
# 80/20 train/test split; random_state=0 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.20,
)

In [14]:
from sklearn import linear_model  # Perceptron and LogisticRegression classifiers
from sklearn import ensemble  # RandomForest classifier
from sklearn import neighbors  # k-nearest-neighbours classifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler  # imported here so this cell does not rely on an earlier cell's import

# Classifiers to compare. RandomForest and the neural network are seeded like the
# Perceptron and logistic regression so that re-running this cell reproduces the scores.
learners = {'Perceptron': linear_model.Perceptron(max_iter=10, random_state=0),
            'RandomForest': ensemble.RandomForestClassifier(random_state=0),
            'kNN': neighbors.KNeighborsClassifier(metric="manhattan"),
            # NOTE(review): the saved output shows "TOTAL NO. of ITERATIONS REACHED LIMIT"
            # warnings for this model, which is trained on unscaled features; consider
            # scaling its input or raising max_iter.
            'logistic': linear_model.LogisticRegression(C=10, random_state=0),
            # Only the key is used: the loop below replaces this estimator with a grid
            # of SVC(C=..., gamma=...) instances.
            'SVM': SVC(),
            'NeuralNetwork': MLPClassifier(random_state=0)
           }

# Candidate C and gamma values for the SVM grid search below.
c = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
gamma = [0.000001, 0.00001, 0.0001, 1.0, 10.0, 100.0, 1000.0]

# Standardized copies of the features, used for kNN, SVM, and the neural network.
# NOTE(review): the scaler is fitted on the whole training split before
# cross-validation, so every CV fold is scaled with statistics that include its own
# validation rows; wrapping scaler + model in a Pipeline would avoid that.
scaler = StandardScaler()
Xtrainscaled = scaler.fit_transform(X_train)
Xtestscaled = scaler.transform(X_test)

# Learners (other than the SVM grid) that are trained on the standardized features.
scaled_learners = {'NeuralNetwork', 'kNN'}

for classifierName in learners:
    if classifierName == 'SVM':
        # Grid search over every (C, gamma) pair. cross_val_score clones and fits the
        # estimator itself, so no separate fit is needed for these throwaway models.
        # NOTE(review): max_iter=40 caps the solver and may stop it before convergence.
        for val in c:
            for value in gamma:
                svm = SVC(C=val, gamma=value, max_iter=40)
                print('Accuracy of ' + classifierName + ':\t' + str(np.mean(cross_val_score(svm, Xtrainscaled, y_train))) + ' with C: ' + str(val) + ' and gamma: ' + str(value))
        # In the output saved with this notebook, C = 0.0001 and gamma = 0.000001 gave
        # the highest mean CV accuracy (about 0.978).
        continue

    features = Xtrainscaled if classifierName in scaled_learners else X_train
    # Keep a fitted copy in `learners`; the printed score is the mean
    # cross-validation accuracy on the training split.
    learners[classifierName].fit(features, y_train)
    print('Accuracy of ' + classifierName + ':\t' + str(np.mean(cross_val_score(learners[classifierName], features, y_train))))
        
        



Accuracy of Perceptron:	0.6094005235045008
Accuracy of RandomForest:	1.0
Accuracy of kNN:	0.9329028962992914


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP:

Accuracy of logistic:	0.8389376689153242
Accuracy of SVM:	0.9451677980889958 with C: 1e-05 and gamma: 1e-06




Accuracy of SVM:	0.9541929305611714 with C: 1e-05 and gamma: 1e-05
Accuracy of SVM:	0.9562409823157626 with C: 1e-05 and gamma: 0.0001
Accuracy of SVM:	0.7365738120065545 with C: 1e-05 and gamma: 1.0




Accuracy of SVM:	0.5900989551190654 with C: 1e-05 and gamma: 10.0
Accuracy of SVM:	0.8042510267923644 with C: 1e-05 and gamma: 100.0




Accuracy of SVM:	0.6289361792683705 with C: 1e-05 and gamma: 1000.0
Accuracy of SVM:	0.9778319252622844 with C: 0.0001 and gamma: 1e-06
Accuracy of SVM:	0.9541929305611714 with C: 0.0001 and gamma: 1e-05




Accuracy of SVM:	0.9562409823157626 with C: 0.0001 and gamma: 0.0001
Accuracy of SVM:	0.7365738120065545 with C: 0.0001 and gamma: 1.0
Accuracy of SVM:	0.5411240450298993 with C: 0.0001 and gamma: 10.0




Accuracy of SVM:	0.804542997595284 with C: 0.0001 and gamma: 100.0
Accuracy of SVM:	0.6289361792683705 with C: 0.0001 and gamma: 1000.0




Accuracy of SVM:	0.9489689515013513 with C: 0.001 and gamma: 1e-06
Accuracy of SVM:	0.9541929305611714 with C: 0.001 and gamma: 1e-05
Accuracy of SVM:	0.9562409823157626 with C: 0.001 and gamma: 0.0001




Accuracy of SVM:	0.7173037390138537 with C: 0.001 and gamma: 1.0
Accuracy of SVM:	0.5784333170181524 with C: 0.001 and gamma: 10.0




Accuracy of SVM:	0.8071703092081461 with C: 0.001 and gamma: 100.0
Accuracy of SVM:	0.6289361792683705 with C: 0.001 and gamma: 1000.0
Accuracy of SVM:	0.969370304952012 with C: 0.01 and gamma: 1e-06




Accuracy of SVM:	0.9541929305611714 with C: 0.01 and gamma: 1e-05
Accuracy of SVM:	0.9562409823157626 with C: 0.01 and gamma: 0.0001
Accuracy of SVM:	0.7173037390138537 with C: 0.01 and gamma: 1.0




Accuracy of SVM:	0.49557872784150153 with C: 0.01 and gamma: 10.0
Accuracy of SVM:	0.8065863676023067 with C: 0.01 and gamma: 100.0




Accuracy of SVM:	0.6289361792683705 with C: 0.01 and gamma: 1000.0
Accuracy of SVM:	0.967621033814986 with C: 0.1 and gamma: 1e-06




Accuracy of SVM:	0.9541929305611714 with C: 0.1 and gamma: 1e-05
Accuracy of SVM:	0.9562409823157626 with C: 0.1 and gamma: 0.0001
Accuracy of SVM:	0.6186878338405227 with C: 0.1 and gamma: 1.0




Accuracy of SVM:	0.5404754101849291 with C: 0.1 and gamma: 10.0
Accuracy of SVM:	0.8036675107999404 with C: 0.1 and gamma: 100.0




Accuracy of SVM:	0.6289361792683705 with C: 0.1 and gamma: 1000.0
Accuracy of SVM:	0.9699533953310209 with C: 1.0 and gamma: 1e-06
Accuracy of SVM:	0.9541929305611714 with C: 1.0 and gamma: 1e-05




Accuracy of SVM:	0.9562409823157626 with C: 1.0 and gamma: 0.0001
Accuracy of SVM:	0.6482279585452534 with C: 1.0 and gamma: 1.0
Accuracy of SVM:	0.4276584877955353 with C: 1.0 and gamma: 10.0




Accuracy of SVM:	0.8051269392011235 with C: 1.0 and gamma: 100.0
Accuracy of SVM:	0.6292281500712902 with C: 1.0 and gamma: 1000.0




Accuracy of SVM:	0.9699533953310209 with C: 10.0 and gamma: 1e-06
Accuracy of SVM:	0.9541929305611714 with C: 10.0 and gamma: 1e-05
Accuracy of SVM:	0.956532527505267 with C: 10.0 and gamma: 0.0001




Accuracy of SVM:	0.7485335489774638 with C: 10.0 and gamma: 1.0
Accuracy of SVM:	0.575232278521419 with C: 10.0 and gamma: 10.0
Accuracy of SVM:	0.5644344661743738 with C: 10.0 and gamma: 100.0




Accuracy of SVM:	0.6292281500712902 with C: 10.0 and gamma: 1000.0
Accuracy of SVM:	0.9699533953310209 with C: 100.0 and gamma: 1e-06
Accuracy of SVM:	0.9541929305611714 with C: 100.0 and gamma: 1e-05




Accuracy of SVM:	0.9468932348747632 with C: 100.0 and gamma: 0.0001
Accuracy of SVM:	0.6613172735204613 with C: 100.0 and gamma: 1.0
Accuracy of SVM:	0.575232278521419 with C: 100.0 and gamma: 10.0




Accuracy of SVM:	0.5644344661743738 with C: 100.0 and gamma: 100.0
Accuracy of SVM:	0.6292281500712902 with C: 100.0 and gamma: 1000.0




Accuracy of SVM:	0.9699533953310209 with C: 1000.0 and gamma: 1e-06
Accuracy of SVM:	0.9547717648060268 with C: 1000.0 and gamma: 1e-05
Accuracy of SVM:	0.8602298312442809 with C: 1000.0 and gamma: 0.0001




Accuracy of SVM:	0.6613172735204613 with C: 1000.0 and gamma: 1.0
Accuracy of SVM:	0.575232278521419 with C: 1000.0 and gamma: 10.0




Accuracy of SVM:	0.5644344661743738 with C: 1000.0 and gamma: 100.0
Accuracy of SVM:	0.6292281500712902 with C: 1000.0 and gamma: 1000.0
Accuracy of SVM:	0.9451571577536123 with C: 10000.0 and gamma: 1e-06




Accuracy of SVM:	0.8433223383201037 with C: 10000.0 and gamma: 1e-05
Accuracy of SVM:	0.9335123747100509 with C: 10000.0 and gamma: 0.0001
Accuracy of SVM:	0.6613172735204613 with C: 10000.0 and gamma: 1.0




Accuracy of SVM:	0.575232278521419 with C: 10000.0 and gamma: 10.0
Accuracy of SVM:	0.5644344661743738 with C: 10000.0 and gamma: 100.0




Accuracy of SVM:	0.6292281500712902 with C: 10000.0 and gamma: 1000.0




Accuracy of NeuralNetwork:	0.9994169096209913




Accuracy of Perceptron: 60.9% <br>
Accuracy of Random Forest: 100% <br>
Accuracy of kNN: 93.3% <br>
Accuracy of Logistic Regression: 83.8% <br>
Accuracy of SVM: 97.7% (C = 0.0001, gamma = 0.000001) <br>
Accuracy of Neural Network: 99.9% <br>