In [2]:
import pandas as pd
from apyori import apriori
from sklearn.compose import make_column_selector 
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer 
from sklearn.compose import ColumnTransformer 

# Read the movie dataset from a CSV file.
# NOTE(review): the rendered output contains sequences such as "ï¿½", which is
# what a UTF-8 replacement character looks like when decoded as latin-1 —
# confirm the file's real encoding before relying on non-ASCII text.
movies_data = pd.read_csv("movies/movies.csv", encoding="latin-1")

# Show the loaded frame as the cell's output.
movies_data

Unnamed: 0,id,budget,genres,homePage,productionCompany,productionCompanyCountry,productionCountry,revenue,runtime,video,...,popularity,releaseDate,voteAvg,voteCount,genresAmount,productionCoAmount,productionCountriesAmount,actorsAmount,castWomenAmount,castMenAmount
0,5,4000000,Crime|Comedy,https://www.miramax.com/movie/four-rooms/,Miramax|A Band Apart,US|US,United States of America,4257354.0,98,False,...,20.880,1995-12-09,5.7,2077,2,2,1,25,15,9
1,6,21000000,Action|Thriller|Crime,,Universal Pictures|Largo Entertainment|JVC,US|US|JP,Japan|United States of America,12136938.0,110,False,...,9.596,1993-10-15,6.5,223,3,3,2,15,3,9
2,11,11000000,Adventure|Action|Science Fiction,http://www.starwars.com/films/star-wars-episod...,Lucasfilm|20th Century Fox,US|US,United States of America,775398007.0,121,,...,100.003,1977-05-25,8.2,16598,3,2,1,105,5,62
3,12,94000000,Animation|Family,http://movies.disney.com/finding-nemo,Pixar,US,United States of America,940335536.0,100,,...,134.435,2003-05-30,7.8,15928,2,1,1,24,5,18
4,13,55000000,Comedy|Drama|Romance,,Paramount|The Steve Tisch Company,US|,United States of America,677387716.0,142,False,...,58.751,1994-07-06,8.5,22045,3,2,1,76,18,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,920081,0,Action|Horror,,,,,0.0,100,False,...,16.662,2021-11-26,6.8,108,2,1,1,10,2,4
9996,920143,0,Comedy,,Caracol Televisiï¿½n|Dago Garcï¿½a Producciones,CO|CO,Colombia,0.0,97,False,...,491.706,2021-12-25,1.5,2,1,2,1,8,1,1
9997,922017,0,Comedy,,,,Nigeria,0.0,112,False,...,565.658,2021-12-17,6.1,30,1,1,17,1,0,922017
9998,922162,0,,https://www.netflix.com/title/81425229,,,United States of America,0.0,59,False,...,9.664,2021-12-17,6.0,1,1,0,0,0,922162,The Witcher: Fireplace


# Transformación de datos

In [8]:
# Convert a numeric budget into a qualitative label
def categorize_budget(budget):
    """Map a numeric budget to a qualitative label.

    Parameters
    ----------
    budget : number
        Budget value; may be NaN/None when the raw entry could not be parsed.

    Returns
    -------
    str or None
        "Low" (< 10,000,000), "Medium" (< 50,000,000), "High" (< 150,000,000),
        "Very High" otherwise, or None when ``budget`` is missing.
    """
    # Every `<` comparison against NaN is False, so without this guard a
    # missing value would fall through to the last branch and be labelled
    # "Very High". Returning None keeps it missing so `dropna()` can remove it.
    if pd.isna(budget):
        return None
    if budget < 10_000_000:
        return "Low"
    elif budget < 50_000_000:
        return "Medium"
    elif budget < 150_000_000:
        return "High"
    else:
        return "Very High"

# Convert a numeric revenue into a qualitative label
def categorize_revenue(revenue):
    """Map a numeric revenue to a qualitative label.

    Parameters
    ----------
    revenue : number
        Revenue value; may be NaN/None when the raw entry could not be parsed.

    Returns
    -------
    str or None
        "Low" (< 10,000,000), "Medium" (< 100,000,000), "High" (< 500,000,000),
        "Very High" otherwise, or None when ``revenue`` is missing.
    """
    # Every `<` comparison against NaN is False, so without this guard a
    # missing value would fall through to the last branch and be labelled
    # "Very High". Returning None keeps it missing so `dropna()` can remove it.
    if pd.isna(revenue):
        return None
    if revenue < 10_000_000:
        return "Low"
    elif revenue < 100_000_000:
        return "Medium"
    elif revenue < 500_000_000:
        return "High"
    else:
        return "Very High"

# Convert a numeric runtime into a qualitative label
def categorize_runtime(runtime):
    """Map a numeric runtime to a qualitative label.

    Parameters
    ----------
    runtime : number
        Runtime value; may be NaN/None when the raw entry could not be parsed.

    Returns
    -------
    str or None
        "Short" (< 90), "Medium" (< 120), "Long" otherwise, or None when
        ``runtime`` is missing.
    """
    # Every `<` comparison against NaN is False, so without this guard a
    # missing value would fall through to the last branch and be labelled
    # "Long". Returning None keeps it missing so `dropna()` can remove it.
    if pd.isna(runtime):
        return None
    if runtime < 90:
        return "Short"
    elif runtime < 120:
        return "Medium"
    else:
        return "Long"

# Convert a numeric vote average into a qualitative label
def categorize_vote_avg(vote_avg):
    """Map a numeric vote average to a qualitative label.

    Parameters
    ----------
    vote_avg : number
        Vote average; may be NaN/None when the raw entry could not be parsed.

    Returns
    -------
    str or None
        "Poor" (< 5), "Average" (< 7), "Good" (< 8.5), "Excellent" otherwise,
        or None when ``vote_avg`` is missing.
    """
    # Every `<` comparison against NaN is False, so without this guard a
    # missing value would fall through to the last branch and be labelled
    # "Excellent". Returning None keeps it missing so `dropna()` can remove it.
    if pd.isna(vote_avg):
        return None
    if vote_avg < 5:
        return "Poor"
    elif vote_avg < 7:
        return "Average"
    elif vote_avg < 8.5:
        return "Good"
    else:
        return "Excellent"

# Convert a numeric vote count into a qualitative label
def categorize_vote_count(vote_count):
    """Map a numeric vote count to a qualitative label.

    Parameters
    ----------
    vote_count : number
        Vote count; may be NaN/None when the raw entry could not be parsed.

    Returns
    -------
    str or None
        "Low" (< 1000), "Medium" (< 10000), "High" otherwise, or None when
        ``vote_count`` is missing.
    """
    # Every `<` comparison against NaN is False, so without this guard a
    # missing value would fall through to the last branch and be labelled
    # "High". Returning None keeps it missing so `dropna()` can remove it.
    if pd.isna(vote_count):
        return None
    if vote_count < 1000:
        return "Low"
    elif vote_count < 10000:
        return "Medium"
    else:
        return "High"

# Convert a numeric popularity score into a qualitative label
def categorize_popularity(popularity):
    """Map a numeric popularity score to a qualitative label.

    Parameters
    ----------
    popularity : number
        Popularity score; may be NaN/None when the raw entry could not be
        parsed.

    Returns
    -------
    str or None
        "Low" (< 10), "Medium" (< 50), "High" (< 100), "Very High" otherwise,
        or None when ``popularity`` is missing.
    """
    # Every `<` comparison against NaN is False, so without this guard a
    # missing value would fall through to the last branch and be labelled
    # "Very High". Returning None keeps it missing so `dropna()` can remove it.
    if pd.isna(popularity):
        return None
    if popularity < 10:
        return "Low"
    elif popularity < 50:
        return "Medium"
    elif popularity < 100:
        return "High"
    else:
        return "Very High"

# Numeric columns to discretize, each paired with the function that turns a
# single numeric value into its qualitative label.
categorizers = {
    "budget": categorize_budget,
    "revenue": categorize_revenue,
    "runtime": categorize_runtime,
    "voteAvg": categorize_vote_avg,
    "voteCount": categorize_vote_count,
    "popularity": categorize_popularity,
}

# `assign` returns a new frame, so the raw `movies_data` is left untouched.
# Each column is coerced to numeric in one vectorized call (entries that cannot
# be parsed become NaN) and then labelled value by value.
df_transformed = movies_data.assign(
    **{
        column: pd.to_numeric(movies_data[column], errors="coerce").apply(categorize)
        for column, categorize in categorizers.items()
    }
)

# Keep only the columns of interest and drop every row that has a missing
# value in any of them.
movies_transformed = df_transformed[
    [
        "title",
        "genres",
        "productionCompany",
        "productionCountry",
        "budget",
        "revenue",
        "runtime",
        "voteAvg",
        "voteCount",
        "popularity",
    ]
].dropna()

movies_transformed

Unnamed: 0,title,genres,productionCompany,productionCountry,budget,revenue,runtime,voteAvg,voteCount,popularity
0,Four Rooms,Crime|Comedy,Miramax|A Band Apart,United States of America,Low,Low,Medium,Average,Medium,Medium
1,Judgment Night,Action|Thriller|Crime,Universal Pictures|Largo Entertainment|JVC,Japan|United States of America,Medium,Medium,Medium,Average,Low,Low
2,Star Wars,Adventure|Action|Science Fiction,Lucasfilm|20th Century Fox,United States of America,Medium,Very High,Long,Good,High,Very High
3,Finding Nemo,Animation|Family,Pixar,United States of America,High,Very High,Medium,Good,High,Very High
4,Forrest Gump,Comedy|Drama|Romance,Paramount|The Steve Tisch Company,United States of America,High,Very High,Long,Excellent,High,High
...,...,...,...,...,...,...,...,...,...,...
9988,Death to 2021,Comedy,Broke and Bones|Jesse Collins Entertainment|Ir...,United Kingdom|United States of America,Low,Low,Short,Average,Low,Medium
9991,Sexo explï¿½cito,Drama,Malvalanda,Spain,Low,Low,Short,Poor,Low,Medium
9992,Operation Merry Christmas: The Elf Con,Comedy|Crime,Elefantec Global|MarVista Entertainment|Partic...,United States of America|Mexico,Low,Low,Short,Good,Low,Very High
9996,El Paseo 6,Comedy,Caracol Televisiï¿½n|Dago Garcï¿½a Producciones,Colombia,Low,Low,Medium,Poor,Low,Very High
