# Clustering 

In [1]:
# Load Libraries
import pandas as pd
import re
import warnings

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFECV

from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from math import sqrt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

import itertools
import numpy as np

# machine learning
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import itertools
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

In [3]:
#Load the data

df_movies = pd.read_csv("regressionPreprocessing.csv")
print("Length after import: " + str(len(df_movies)))
df_movies = df_movies.fillna(0)
df_movies.head(5)

Length after import: 7517


Unnamed: 0,budget,id,runtime,vote_count,History,Western,Music,Family,Comedy,Drama,...,orig_ml,orig_es,orig_bn,orig_ab,orig_wo,orig_ca,orig_ru,orig_id,orig_is,director
0,30000000,862,81.0,5415,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1709
1,65000000,8844,104.0,2413,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1645
2,60000000,949,170.0,1886,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2338
3,58000000,710,130.0,1194,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2231
4,98000000,1408,119.0,137,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2812


After we loaded all the libaries and data that we need, we can start with the classification tasks.

Since we have a lot of features it makes sense to eliminate features that do not have a significant impact on the prediction to improve our perfromance.

In [4]:
# drop columns that are not needed maybe implement feature selction...

# # Instantiate RFECV visualizer with a linear SVM classifier
# model = LogisticRegression()
# visualizer = RFECV(model, 3)

# visualizer.fit(features_train, rating_train)        # Fit the data to the visualizer
# visualizer.show()           # Finalize and render the figure

# # Create the RFE object and compute a cross-validated score.
# # The "accuracy" scoring is proportional to the number of correct
# # classifications
# # rfecv = RFECV(estimator=knn_reg, step=1, cv=stratified_10_fold_cv, scoring='accuracy')
# # rfecv.fit(features_train, rating_train)

# # print("Optimal number of features : %d" % rfecv.n_features_)

# # # Plot number of features VS. cross-validation scores
# # plt.figure()
# # plt.xlabel("Number of features selected")
# # plt.ylabel("Cross validation score (nb of correct classifications)")
# # plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
# # plt.show()


features_to_remove = ['vote_count','+18','id','actors','spokenLanguages','productionCountries','productionCompanies','original_language','hasHomepage','part_of_collection''orig_lv', 'orig_el', 'orig_fa', 'orig_tl', 'orig_ta', 'orig_th',
       'orig_mn', 'orig_zh', 'orig_te', 'orig_kk', 'orig_zu', 'orig_et',
       'orig_mr', 'orig_eu', 'orig_sv', 'orig_no', 'orig_pl', 'orig_cs',
       'orig_cy', 'orig_bs', 'orig_de', 'orig_lo', 'orig_xx', 'orig_ko',
       'orig_hu', 'orig_sr', 'orig_da', 'orig_pt', 'orig_nl', 'orig_en',
       'orig_it', 'orig_tr', 'orig_hr', 'orig_cn', 'orig_ka', 'orig_ar',
       'orig_ja', 'orig_0', 'orig_hi', 'orig_ro', 'orig_af', 'orig_sk',
       'orig_fr', 'orig_fi', 'orig_he', 'orig_uk', 'orig_bg', 'orig_ml',
       'orig_es', 'orig_bn', 'orig_ab', 'orig_wo', 'orig_ca', 'orig_ru',
       'orig_id', 'orig_is', 'orig_lv', '18+']
for i in features_to_remove:
    if i in df_movies.columns:
        df_movies = df_movies.drop(columns=i)
print(df_movies.head(5))
df_movies.columns

     budget  runtime  History  Western  Music  Family  Comedy  Drama  Foreign  \
0  30000000     81.0      0.0      0.0    0.0     1.0     1.0    0.0      0.0   
1  65000000    104.0      0.0      0.0    0.0     1.0     0.0    0.0      0.0   
2  60000000    170.0      0.0      0.0    0.0     0.0     0.0    1.0      0.0   
3  58000000    130.0      0.0      0.0    0.0     0.0     0.0    0.0      0.0   
4  98000000    119.0      0.0      0.0    0.0     0.0     0.0    0.0      0.0   

   Action  ...  Thriller  TV Movie  Crime  Science Fiction  Fantasy  War  \
0     0.0  ...       0.0       0.0    0.0              0.0      0.0  0.0   
1     0.0  ...       0.0       0.0    0.0              0.0      1.0  0.0   
2     1.0  ...       1.0       0.0    1.0              0.0      0.0  0.0   
3     1.0  ...       1.0       0.0    0.0              0.0      0.0  0.0   
4     1.0  ...       0.0       0.0    0.0              0.0      0.0  0.0   

   Adventure    rating  part_of_collection  director  
0

Index(['budget', 'runtime', 'History', 'Western', 'Music', 'Family', 'Comedy',
       'Drama', 'Foreign', 'Action', 'Horror', 'Mystery', 'Romance',
       'Animation', 'Documentary', 'Thriller', 'TV Movie', 'Crime',
       'Science Fiction', 'Fantasy', 'War', 'Adventure', 'rating',
       'part_of_collection', 'director'],
      dtype='object')

Now that we have exactly the data we want we start with the spitting.

1. First of all we separate the features from our targte(the rating).
2. We split our data into training and test data in order to evaluate our model later. The proportions will be 60% to 40%
3. We creaze a kfold cross validation that we will later use for our models in order to evaluate them better

In [5]:
# separate features and target variable
rating = df_movies['rating'] # weight
features = df_movies.drop(columns=['rating'])

# encode labels
lab_enc = LabelEncoder()
rating = lab_enc.fit_transform(rating)

# create a train/test split
features_train, features_test, rating_train, rating_test = train_test_split(
    features, rating, test_size=0.4, random_state=42)

print("Train: " + str(len(features_train)) + " Features and " + str(len(rating_train)) + " Ratings")
print("Test: " + str(len(features_test)) + " Features and " + str(len(rating_test)) + " Ratings")

# specify the cross validation
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

Train: 4510 Features and 4510 Ratings
Test: 3007 Features and 3007 Ratings


# Initial evaluation of different classifiers

In [6]:
##### create and fit a RandomForestClassifier
rf_reg = RandomForestClassifier()
rf_reg.fit(features_train, rating_train)
rating_pred_rf = rf_reg.predict(features_test)

print("Scores Random Forest:")
#compute the confusion matrix
# cnf_matrix = confusion_matrix(rating_test, rating_pred_rf)
# print(cnf_matrix)

#compute accuracy score
print("Accuracy: {}".format(accuracy_score(rating_test, rating_pred_rf)))
print("Precision: {}".format(precision_score(rating_test, rating_pred_rf, average='weighted')))
print("Recall: {}".format(recall_score(rating_test, rating_pred_rf, average='weighted')))
print("f1_score: {}".format(f1_score(rating_test, rating_pred_rf, average='weighted')))

##### create and fit a KNN
knn_reg = KNeighborsClassifier()
knn_reg.fit(features_train, rating_train)
rating_pred_knn = knn_reg.predict(features_test)

print("Scores knn:")
# #compute the confusion matrix
# cnf_matrix = confusion_matrix(rating_test, rating_pred_knn)
# print(cnf_matrix)

#compute accuracy score
print("Accuracy: {}".format(accuracy_score(rating_test, rating_pred_knn)))
print("Precision: {}".format(precision_score(rating_test, rating_pred_knn, average='weighted')))
print("Recall: {}".format(recall_score(rating_test, rating_pred_knn, average='weighted')))
print("f1_score: {}".format(f1_score(rating_test, rating_pred_knn, average='weighted')))

##### create and fit a DecisionTreeClassifier
dt_reg = DecisionTreeClassifier()
dt_reg.fit(features_train, rating_train)
rating_pred_dt = dt_reg.predict(features_test)

print("Scores Decision Tree:")
#compute the confusion matrix
# cnf_matrix = confusion_matrix(rating_test, rating_pred_dt)
# print(cnf_matrix)

#compute accuracy score
print("Accuracy: {}".format(accuracy_score(rating_test, rating_pred_dt)))
print("Precision: {}".format(precision_score(rating_test, rating_pred_dt, average='weighted')))
print("Recall: {}".format(recall_score(rating_test, rating_pred_dt, average='weighted')))
print("f1_score: {}".format(f1_score(rating_test, rating_pred_dt, average='weighted')))

##### create and fit a GaussianNB
nb_reg = GaussianNB()
nb_reg.fit(features_train, rating_train)
rating_pred_nb = nb_reg.predict(features_test)

print("Scores Naive Bayes:")
# #compute the confusion matrix
# cnf_matrix = confusion_matrix(rating_test, rating_pred_nb)
# print(cnf_matrix)

#compute accuracy score
print("Accuracy: {}".format(accuracy_score(rating_test, rating_pred_nb)))
print("Precision: {}".format(precision_score(rating_test, rating_pred_nb, average='weighted')))
print("Recall: {}".format(recall_score(rating_test, rating_pred_nb, average='weighted')))
print("f1_score: {}".format(f1_score(rating_test, rating_pred_nb, average='weighted')))

##### create and fit a SVC
svc_reg = LinearSVC()
svc_reg.fit(features_train, rating_train)
rating_pred_svc = svc_reg.predict(features_test)

print("Scores SVC:")
#compute the confusion matrix
# cnf_matrix = confusion_matrix(rating_test, rating_pred_svc)
# print(cnf_matrix)

#compute accuracy score
print("Accuracy: {}".format(accuracy_score(rating_test, rating_pred_svc)))
print("Precision: {}".format(precision_score(rating_test, rating_pred_svc, average='weighted')))
print("Recall: {}".format(recall_score(rating_test, rating_pred_svc, average='weighted')))
print("f1_score: {}".format(f1_score(rating_test, rating_pred_svc, average='weighted')))

#plot the confusion matrix
#plot_confusion_matrix(cnf_matrix, classes=lab_enc.classes_, title='KNN Classifier')

print()

# metrics.f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))



Scores Random Forest:
Accuracy: 0.009311606252078483
Precision: 0.00930982512706723
Recall: 0.009311606252078483
f1_score: 0.00875226437338114


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Scores knn:
Accuracy: 0.010309278350515464
Precision: 0.0051446110623426635
Recall: 0.010309278350515464
f1_score: 0.006238776125480133


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Scores Decision Tree:
Accuracy: 0.008313934153641503
Precision: 0.008655934006509264
Recall: 0.008313934153641503
f1_score: 0.0081387459246287


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Scores Naive Bayes:
Accuracy: 0.0029930162953109413
Precision: 0.005010749114532075
Recall: 0.0029930162953109413
f1_score: 0.0016524254020128335


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Scores SVC:
Accuracy: 0.0019953441968739607
Precision: 0.04099032698615306
Recall: 0.0019953441968739607
f1_score: 0.000822878262895502



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [7]:
# # create and fit a knn classifier
# knn_reg = KNeighborsClassifier()

# knn_reg.fit(features_train, rating_train)
# rating_pred_knn = knn_reg.predict(features_test)

# print("Scores knn:")
# #compute the confusion matrix
# cnf_matrix = confusion_matrix(rating_test, rating_pred_knn)
# print(cnf_matrix)


# # #compute accuracy score

# # accuracy_knn = cross_val_score(knn_reg, features_train, rating_train, cv=cross_val, scoring='accuracy')

# # for i, acc in enumerate(accuracy_knn):
# #     print("Fold {}: Accuracy = {}%".format(i, acc * 100.0))

# # print("Average Accuracy = {}%".format(accuracy_knn.mean() * 100.0))
# # # print("Accuracy: {}".format(accuracy_score(rating_test, rating_pred_knn)))
# # # print("Precision: {}".format(precision_score(rating_test, rating_pred_knn, average='weighted')))
# # # print("Recall: {}".format(recall_score(rating_test, rating_pred_knn, average='weighted')))
# # # print("f1_score: {}".format(f1_score(rating_test, rating_pred_knn, average='weighted')))

# # #Tuning of algorithm

# # # specify the parameter grid
# # parameters = {
# #     'n_neighbors': range(2, 10)
# # }


# # # create the grid search instance
# # grid_search_estimator = GridSearchCV(knn_reg, parameters, scoring='accuracy', cv=stratified_10_fold_cv, return_train_score=False)

# # # run the grid search
# # grid_search_estimator.fit(features_train, rating_train)

# # # print the results of all hyper-parameter combinations
# # results = pd.DataFrame(grid_search_estimator.cv_results_)
# # #display(results)
    
# # # print the best parameter setting
# # print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

In [8]:
# ##### create and fit a RandomForestClassifier
# rf_reg = RandomForestClassifier()
# rf_reg.fit(features_train, rating_train)
# rating_pred_rf = rf_reg.predict(features_test)

# print("Scores knn:")
# #compute the confusion matrix
# cnf_matrix = confusion_matrix(rating_test, rating_pred_rf)
# print(cnf_matrix)

# #compute accuracy score
# print("Accuracy: {}".format(accuracy_score(rating_test, rating_pred_rf)))
# print("Precision: {}".format(precision_score(rating_test, rating_pred_rf, average='weighted')))
# print("Recall: {}".format(recall_score(rating_test, rating_pred_rf, average='weighted')))
# print("f1_score: {}".format(f1_score(rating_test, rating_pred_rf, average='weighted')))

# #plot the confusion matrix
# #plot_confusion_matrix(cnf_matrix, classes=lab_enc.classes_, title='KNN Classifier')

# #Tuning of algorithm

# # specify the parameter grid
# parameters = {
#     'max_depth':[1,3,5,10]
# }


# # create the grid search instance
# grid_search_estimator = GridSearchCV(rf_reg, parameters, scoring='accuracy', cv=stratified_10_fold_cv, return_train_score=False)

# # run the grid search
# grid_search_estimator.fit(features_train, rating_train)

# # print the results of all hyper-parameter combinations
# results = pd.DataFrame(grid_search_estimator.cv_results_)
# #display(results)
    
# # print the best parameter setting
# print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))




# print()
