In [70]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np

with open("movies_dataset/transformed_training.csv", "r") as f:
    df = pd.read_csv(f)

In [71]:
##Prediccion de la media de votos

#Eliminacion de columnas no conocidas y separacion en entrenamiento y validacion
df_votes = df.drop(["vote_count", "revenue", "id"], axis=1)
train_votes, val_votes = train_test_split(df_votes, test_size=0.2, random_state=0)


#Obtencion de las columnas mas correlacionadas
mtx_corr = train_votes.corr()
minimum_correlation_votes = 0  #Todas las features
revenue_correlations_votes = mtx_corr["vote_average"][abs(mtx_corr["vote_average"]) > minimum_correlation_votes]

print(revenue_correlations_votes)

selected_columns_votes = list(revenue_correlations_votes.index)
selected_columns_votes.remove("vote_average")


#Seleccion de features y objetivos
features_train_votes = train_votes[selected_columns_votes]
goal_train_votes = train_votes[["vote_average"]]

features_val_votes = val_votes[selected_columns_votes]
goal_val_votes = val_votes[["vote_average"]]


#Seleccion del modelo y optimizacion de hiperparametros
model_votes = RandomForestRegressor(random_state=0)
model_votes.fit(features_train_votes, goal_train_votes)


#Prueba del modelo sobre ambos conjuntos para ver el error
predictions_train_votes = model_votes.predict(features_train_votes)
predictions_val_votes = model_votes.predict(features_val_votes)

MAE_train_votes = mean_absolute_error(goal_train_votes, predictions_train_votes)
MAE_val_votes = mean_absolute_error(goal_val_votes, predictions_val_votes)

print()
print("Training error:", MAE_train_votes)
print("Validation error:", MAE_val_votes)

belongs_to_collection   -0.029149
budget                   0.009831
original_language        0.100948
production_companies     0.028355
runtime                  0.302668
                           ...   
tagline_time            -0.001599
tagline_get             -0.032473
tagline_back            -0.034705
tagline_come             0.005656
tagline_he              -0.027471
Name: vote_average, Length: 103, dtype: float64


  model_votes.fit(features_train_votes, goal_train_votes)



Training error: 0.23897669671712868
Validation error: 0.6482625850340138


In [72]:
##Prediccion de la recaudacion

#Eliminacion de columnas no conocidas y separacion en entrenamiento y validacion
df_revenue = df.drop(["vote_count", "vote_average", "id"], axis=1)
train_revenue, val_revenue = train_test_split(df_revenue, test_size=0.2, random_state=0)


#Obtencion de las columnas mas correlacionadas
mtx_corr = train_revenue.corr()
minimum_correlation_revenue = 0.01 #Con correlacion mayor que 0.01
revenue_correlations_revenue = mtx_corr["revenue"][abs(mtx_corr["revenue"]) > minimum_correlation_revenue]

print(revenue_correlations_revenue)

selected_columns_revenue = list(revenue_correlations_revenue.index)
selected_columns_revenue.remove("revenue")


#Seleccion de features y objetivos
features_train_revenue = train_revenue[selected_columns_revenue]
goal_train_revenue = train_revenue[["revenue"]]

features_val_revenue = val_revenue[selected_columns_revenue]
goal_val_revenue = val_revenue[["revenue"]]


#Seleccion del modelo y optimizacion de hiperparametros
model_revenue = RandomForestRegressor(random_state=0)
model_revenue.fit(features_train_revenue, goal_train_revenue)


#Prueba del modelo sobre ambos conjuntos para ver el error
predictions_train_revenue = model_revenue.predict(features_train_revenue)
predictions_val_revenue = model_revenue.predict(features_val_revenue)

MAE_train_revenue = mean_absolute_error(goal_train_revenue, predictions_train_revenue)
MAE_val_revenue = mean_absolute_error(goal_val_revenue, predictions_val_revenue)

print()
print("Training error:", MAE_train_revenue)
print("Validation error:", MAE_val_revenue)

# 0.05 40M
# 0.01 39.5M
# 0.1  39.6M

belongs_to_collection    0.146377
budget                   0.740640
original_language       -0.113886
production_companies     0.235580
revenue                  1.000000
                           ...   
tagline_world            0.044192
tagline_time             0.024769
tagline_get              0.019880
tagline_back             0.024159
tagline_come             0.039175
Name: revenue, Length: 71, dtype: float64


  model_revenue.fit(features_train_revenue, goal_train_revenue)



Training error: 16139653.178137437
Validation error: 40569964.91020408


In [98]:
### SECCIÓN DE APLICACIÓN DEL MODELO

predecir_dict = {
    19995 : "Avatar",
    597 : "Titanic",
    109445 : "Frozen",
    862 : "Toy Story 1",
    863 : "Toy Story 2",
    27205 : "Inception"
}

predecir = df[df["id"].isin(predecir_dict)]

predecir_votes = predecir[selected_columns_votes]
predecir_revenue = predecir[selected_columns_revenue]

predictions_votes = model_votes.predict(predecir_votes)
predictions_revenue = model_revenue.predict(predecir_revenue)

results = pd.DataFrame({
    "Pelicula" : list(predecir_dict.values()),
    "Prediccion votacion" : predictions_votes,
    "Votacion real" :  predecir["vote_average"],
    "Prediccion recaudacion" : predictions_revenue,
    "Recaudacion real" : predecir["revenue"]
})

results

Unnamed: 0,Pelicula,Prediccion votacion,Votacion real,Prediccion recaudacion,Recaudacion real
0,Avatar,7.136,7.7,290068500.0,373554000.0
750,Titanic,7.336,7.5,1381986000.0,1845034000.0
1434,Frozen,6.945,7.3,407592300.0,497366900.0
4605,Toy Story 1,7.11,7.2,2227342000.0,2787965000.0
4746,Toy Story 2,7.791,8.1,684789700.0,825532800.0
5779,Inception,6.872,7.3,965839400.0,1274219000.0
