In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
import math

In [None]:
def prediccion(y_test,y_pred):
  """Print how many predictions match the true labels and how many do not.

  Each prediction is cast to ``int`` before it is compared with the value
  stored at the same position of ``y_test``.

  Args:
    y_test: true labels; must support positional access through ``.iloc``.
    y_pred: predicted labels; indexed by position, and every element must
      provide ``.astype`` (e.g. NumPy scalars).

  Returns:
    None. The summary line is written to standard output.
  """
  total = len(y_pred)
  aciertos = sum(
    1 for pos in range(total)
    if y_test.iloc[pos] == y_pred[pos].astype(int)
  )
  fallos = total - aciertos
  print("Validos: ", aciertos, " / No validos: ", fallos)
  return None

In [None]:
def mostrar_resultados(y_test, pred_y):
    """Show the confusion matrix as a heatmap and print the classification report.

    Args:
        y_test: true labels.
        pred_y: labels predicted by the model for the same samples.

    Returns:
        None. A 12x12 inch figure is shown, then a blank line and the
        classification report are printed.
    """
    matriz = confusion_matrix(y_test, pred_y)

    # Rows are the true classes, columns the predicted ones.
    _, ax = plt.subplots(figsize=(12, 12))
    sns.heatmap(matriz, annot=True, fmt="d", ax=ax)
    ax.set_title("Matriz de confusión")
    ax.set_ylabel('Realidad')
    ax.set_xlabel('Predicción')
    plt.show()

    print()
    print(classification_report(y_test, pred_y))

In [None]:
# Load the raw movie dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('movies_list_2012-2020.csv')
# Keep an independent snapshot of the raw data. A plain `df_original = df` only
# aliases the same object, so any in-place edit of `df` would change it as well.
df_original = df.copy()

FileNotFoundError: ignored

In [None]:
# Empty table that collects one row (model name, accuracy, target) per trained model.
df_models_score = pd.DataFrame()

# Preview the raw data; it must be the last expression of the cell to be displayed.
df.head()

In [None]:
# Top 50 movies ranked by `score`.
# NOTE(review): the variable name mentions the opening weekend, but the column
# that is sorted and plotted is `score`; the name is kept so other cells still work.
top_movies_opening_weekend = df[["movie_title", "score"]].sort_values(by="score", ascending=False)

plt.figure(figsize=(20, 15))
sns.barplot(
    y = "movie_title",
    x = "score",
    data = top_movies_opening_weekend[:50]
)
# Labels follow the plotted columns: titles on the y axis, score on the x axis.
plt.ylabel("Movie title")
plt.xlabel("Score")

plt.show()

# Pre-analysis

In [None]:
# Remove the two columns that are not used as model features.
df = df.drop(['release_date', 'movie_title'], axis="columns")

## Different values

In [None]:
# Summary statistics of the `max_theaters` column.
df.loc[:, 'max_theaters'].describe()

In [None]:
# Summary statistics of the `score` column.
df.loc[:, 'score'].describe()

In [None]:
# Summary statistics of the `running_time_min` column.
df.loc[:, 'running_time_min'].describe()

In [None]:
# For every `genre_*` column print how many rows hold the value 1, out of all rows.
for col in df.columns.to_list():
  if col.startswith('genre_'):
    # Counting the matches directly also works when no row has the value 1,
    # where `value_counts()[1]` would raise a KeyError.
    n_flagged = int((df[col] == 1).sum())
    print(f'{col}: {n_flagged}/{len(df)}')

## Outliers

In [None]:
# One box per column of `df`, to spot outliers at a glance.
plt.figure(figsize=(20, 10))
ax = sns.boxplot(data=df)
# Column names as vertical tick labels so they do not overlap.
_ = ax.set_xticklabels(df.columns, rotation=90)

## Correlation

In [None]:
# First rows of the table after dropping the non-feature columns.
df.head(n=5)

In [None]:
# Spearman rank correlation between all columns, rounded to two decimals.
plt.figure(figsize=(15, 8))
sns.heatmap(
    df.corr(method='spearman').round(2),
    annot=True,
    mask=None,
)
plt.show()

# Models

1. Predict Score (without total gross)
2. Total gross (without score)
3. Opening weekend gross (without total gross and score)

In [None]:
from sklearn.model_selection import train_test_split

## Predict Score


### Logistic Regression

In [None]:
# Distribution of the raw score, used to choose the class boundaries.
df.loc[:, "score"].describe()

In [None]:
# Turn the continuous score into four ordered classes (0-3).
# All masks are computed from the raw values first and only then applied.
# NOTE: this overwrites `score` in place, so the cell must run exactly once
# per load: a second run would see only values 0-3 and map them all to 0.
raw_score = df["score"].copy()
score_classes = {
    0: raw_score < 4.8,
    1: (raw_score >= 4.8) & (raw_score < 5.5),
    2: (raw_score >= 5.5) & (raw_score < 6.3),
    3: raw_score >= 6.3,
}
for label, mask in score_classes.items():
    df.loc[mask, "score"] = label

In [None]:
# Number of movies in each score class.
df.loc[:, "score"].value_counts()

In [None]:
# Human-readable names for classes 0-3. The ranges follow the thresholds used
# to bin `score` (4.8, 5.5 and 6.3).
notas=["Bad/<4.8","Good/4.8-5.5","Very Good/5.5-6.3","Excellent/>=6.3"]
# Unscaled snapshot of the table (binned score included). It is a real copy so
# that later in-place edits of `df` cannot change it.
data=df.copy()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Keep the class labels before scaling, because `score` is scaled together with
# every other column below.
# NOTE: run this cell once per load; on a second run `y` would hold the scaled score.
y=df["score"]
cols = list(df.columns)
scaler = MinMaxScaler()

# Scale every column to [0, 1] and rebuild the frame with the original column
# names AND the original index, so rows stay aligned with `y` by label too.
# NOTE(review): the scaler is fitted on the whole table, before any train/test
# split, so test rows influence the scaling; consider fitting on the training
# part only.
df = pd.DataFrame(scaler.fit_transform(df), columns=cols, index=df.index)

In [None]:
# First rows of the scaled table.
df.head(n=5)

In [None]:
# Features: every scaled column except the target and the total gross.
X = df_log = df.drop(['score', 'gross_total'], axis="columns")

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression with a higher iteration cap than the default.
clf = LogisticRegression(multi_class="multinomial", max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Accuracy of the logistic regression on the held-out set.
score = clf.score(X_test, y_test)
print(score)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version.
df_models_score = pd.concat(
    [
        df_models_score,
        pd.DataFrame([{'name_model': 'Logistic Regression', 'score': round(score*100,2), 'target': 'score'}]),
    ],
    ignore_index=True,
)

In [None]:
# Right/wrong prediction counts for the logistic regression.
prediccion(y_test=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix and classification report for the logistic regression.
mostrar_resultados(y_test=y_test, pred_y=y_pred)

### Random Forest Classifier


In [None]:
# Random forest with 2000 Gini trees. The seed is fixed (same value as the
# train/test split) so the reported accuracy is reproducible between runs.
rfc = RandomForestClassifier(n_estimators=2000, criterion="gini", random_state=42)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
score = rfc.score(X_test, y_test)
print(score)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version.
df_models_score = pd.concat(
    [
        df_models_score,
        pd.DataFrame([{'name_model': 'Random Forest Classifier', 'score': round(score*100,2), 'target': 'score'}]),
    ],
    ignore_index=True,
)

In [None]:
# scikit-learn metrics module, used for the accuracy calculation.
from sklearn import metrics

# Accuracy: fraction of test samples the random forest classified correctly.
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred_rfc),
)

In [None]:
# Right/wrong prediction counts for the random forest.
prediccion(y_test=y_test, y_pred=y_pred_rfc)

In [None]:
# Confusion matrix and classification report for the random forest.
mostrar_resultados(y_test=y_test, pred_y=y_pred_rfc)

### Decision Tree Classifier

In [None]:
from sklearn import tree

# Gini decision tree limited to depth 10, with a fixed seed.
dtc = tree.DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    max_depth=10,
    random_state=42,
)
# Train the model; `clf` is rebound to the fitted tree, as before.
dtc.fit(X_train, y_train)
clf = dtc

In [None]:
# Predictions of the decision tree on the held-out set.
y_pred_dtc = dtc.predict(X=X_test)

In [None]:
# Accuracy of the decision tree on the held-out set.
score=dtc.score(X_test,y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version.
df_models_score = pd.concat(
    [
        df_models_score,
        pd.DataFrame([{'name_model': 'Decision Tree Classifier', 'score': round(score*100,2), 'target': 'score'}]),
    ],
    ignore_index=True,
)

In [None]:
# Right/wrong prediction counts for the decision tree.
prediccion(y_test=y_test, y_pred=y_pred_dtc)

In [None]:
# Confusion matrix and classification report for the decision tree.
mostrar_resultados(y_test=y_test, pred_y=y_pred_dtc)


### PCA (DTC)

In [None]:
from sklearn.decomposition import PCA

# Project the feature table onto its first 12 principal components.
# NOTE(review): the projection is fitted on every row of `df_log`, i.e. before
# the data is split into train and test parts.
pca = PCA(n_components=12)
principalComponents = pca.fit_transform(X=df_log)

In [None]:
# 80/20 split of the PCA components. This overwrites the previous
# X_train/X_test/y_train/y_test variables.
X_train, X_test, y_train, y_test = train_test_split(
    principalComponents,
    y,
    random_state=42,
    test_size=0.20,
)

In [None]:
from sklearn import tree

# Same tree configuration as the plain decision tree, trained on the PCA components.
dtc_pca = tree.DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    max_depth=10,
    random_state=42,
)
# Train the model and predict the held-out set.
dtc_pca.fit(X_train, y_train)
y_pred_dtc_pca = dtc_pca.predict(X=X_test)

In [None]:
# Accuracy of the PCA-based decision tree on the held-out set.
score=dtc_pca.score(X_test,y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version.
df_models_score = pd.concat(
    [
        df_models_score,
        pd.DataFrame([{'name_model': 'Decision Tree Classifier with PCA', 'score': round(score*100,2), 'target': 'score'}]),
    ],
    ignore_index=True,
)

In [None]:
# Confusion matrix and classification report for the PCA-based decision tree.
mostrar_resultados(y_test=y_test, pred_y=y_pred_dtc_pca)

### Naive Bayes

In [None]:
# Naive Bayes features come from `data` (the snapshot taken before scaling),
# without the target, the total gross and the theater count.
df_nb = data.drop(['score', 'gross_total', 'max_theaters'], axis="columns")

In [None]:
# 80/20 split of the Naive Bayes feature table, with the same seed as the other splits.
X_train, X_test, y_train, y_test = train_test_split(
    df_nb,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: train, predict the held-out set and compute its accuracy.
gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred_gnb= gnb.predict(X_test)
score=gnb.score(X_test,y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version.
df_models_score = pd.concat(
    [
        df_models_score,
        pd.DataFrame([{'name_model': 'Naive Bayes', 'score': round(score*100,2), 'target': 'score'}]),
    ],
    ignore_index=True,
)

In [None]:
# Right/wrong prediction counts for Naive Bayes.
prediccion(y_test=y_test, y_pred=y_pred_gnb)

In [None]:
# Confusion matrix and classification report for Naive Bayes.
mostrar_resultados(y_test=y_test, pred_y=y_pred_gnb)

### KNeighbors classifier

In [None]:
# Back to the scaled feature table `X`: the previous section replaced the
# split variables with the Naive Bayes ones.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k=4: train, predict the held-out set and compute its accuracy.
knc = KNeighborsClassifier(n_neighbors=4)
knc.fit(X_train, y_train)
y_pred_knc = knc.predict(X_test)
score = knc.score(X_test, y_test)
print(score)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version.
df_models_score = pd.concat(
    [
        df_models_score,
        pd.DataFrame([{'name_model': 'KNeighbors Classifier', 'score': round(score*100,2), 'target': 'score'}]),
    ],
    ignore_index=True,
)

In [None]:
# Right/wrong prediction counts for k-nearest neighbours.
prediccion(y_test=y_test, y_pred=y_pred_knc)

In [None]:
# Confusion matrix and classification report for k-nearest neighbours.
mostrar_resultados(y_test=y_test, pred_y=y_pred_knc)

### Linear SVC

In [None]:
# 80/20 split of the scaled feature table `X`, with the same seed as the other splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn import svm

# This section, the variable name and the label stored in the results table all
# say "Linear SVC", but the cell used to build a polynomial-kernel SVC.
# Train the linear model so the reported name matches what is evaluated.
# NOTE(review): this changes the reported accuracy. If the polynomial kernel was
# intended, restore `svm.SVC(kernel='poly')` and rename the section and the
# recorded model name instead.
lsvc = svm.LinearSVC(random_state=42)

In [None]:
# Train the support vector classifier on the training split.
lsvc.fit(X=X_train, y=y_train)

In [None]:
# Predictions and accuracy of the support vector classifier on the held-out set.
y_pred_lsvc=lsvc.predict(X_test)
score=lsvc.score(X_test,y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version.
df_models_score = pd.concat(
    [
        df_models_score,
        pd.DataFrame([{'name_model': 'Linear SVC', 'score': round(score*100,2), 'target': 'score'}]),
    ],
    ignore_index=True,
)

In [None]:
# Right/wrong prediction counts for the support vector classifier.
prediccion(y_test=y_test, y_pred=y_pred_lsvc)

In [None]:
# Confusion matrix and classification report for the support vector classifier.
mostrar_resultados(y_test=y_test, pred_y=y_pred_lsvc)

### Neural network

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers


In [None]:
# One output unit per score class. Labels are the integers 0..k-1, so k = max + 1.
n_classes = int(y.max()) + 1

model = Sequential()
model.add(layers.Dense(30, input_dim = X_train.shape[1], activation = 'relu')) # input layer requires input_dim param
model.add(layers.Dense(15, activation = 'relu'))
# Softmax needs one unit per class: with a single unit it always outputs 1.0
# and the network cannot tell the classes apart.
model.add(layers.Dense(n_classes, activation='softmax'))

# The targets are integer class ids, not one-hot vectors, so the sparse variant
# of the cross-entropy loss is the matching one.
model.compile(loss="sparse_categorical_crossentropy", optimizer= "adam", metrics=['accuracy'])

In [None]:
# Train for 100 epochs in shuffled mini-batches of 128 samples; verbose=2 logs one line per epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=100,
    shuffle=True,
    verbose=2,
)

In [None]:
# evaluate() returns [loss, accuracy] for the compiled metrics; keep the accuracy.
score = model.evaluate(X_test, y_test)
score=score[1]
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version.
df_models_score = pd.concat(
    [
        df_models_score,
        pd.DataFrame([{'name_model': 'Neural network Classification', 'score': round(score*100,2), 'target': 'score'}]),
    ],
    ignore_index=True,
)

In [None]:
# Show the accuracy as a percentage string. Any existing trailing "%" is removed
# first, so re-running the cell does not produce "85.0%%".
df_models_score["score"] = df_models_score["score"].astype(str).str.rstrip("%") + "%"

### Results

In [None]:
# Save the comparison table next to the notebook. A relative path also works
# outside Colab.
# NOTE(review): in Colab this should still be /content/df_score.csv, assuming
# the working directory is the default /content.
df_models_score.to_csv("df_score.csv",index=False)

In [None]:
# Display the final comparison table (one row per trained model).
df_models_score