In [284]:
import pandas as pd
import re
import numpy as np

In [94]:
# Load the raw movie table from a local pickle.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle("movies_data_new.pkl")

In [95]:
df.shape

(10115, 27)

In [96]:
df["content_rating"].value_counts()

R            3539
PG-13        1917
             1881
NOT RATED    1087
PG            867
UNRATED       273
G             259
TV-14          89
TV-MA          85
TV-PG          62
TV-G           28
NC-17          19
TV-Y7           4
TV-Y            3
APPROVED        1
E               1
Name: content_rating, dtype: int64

In [97]:
# Ratings to drop outright; any label containing "TV" is dropped as well.
discard = {"E", "APPROVED", "TV"}


def _keep_rating(label):
    """Return True for content ratings that should stay in the dataset."""
    return "TV" not in label and label not in discard


df = df[df["content_rating"].apply(_keep_rating)]
df["content_rating"].value_counts()

R            3539
PG-13        1917
             1881
NOT RATED    1087
PG            867
UNRATED       273
G             259
NC-17          19
Name: content_rating, dtype: int64

In [98]:
# Fold the "no rating" variants into a single bucket. The "UNKOWN" spelling is
# reproduced exactly because it is the value written into the data.
unrated_labels = {"NOT RATED", "UNRATED", ""}
df["content_rating"] = df["content_rating"].apply(
    lambda label: "UNKOWN" if label in unrated_labels else label
)
df["content_rating"].value_counts()

R         3539
UNKOWN    3241
PG-13     1917
PG         867
G          259
NC-17       19
Name: content_rating, dtype: int64

In [104]:
((df["release_date"]!="") & (df["year"]=="")).sum()
#df["year2"] = df["year"].apply(lambda x: (int(x)/5)*5 if x else 0)

18

In [110]:
def _release_year(year, date):
    """Return the release year as an int.

    Prefers the explicit `year` field; falls back to the leading component of
    `release_date` (split on "-"); returns 0 when both are empty.
    """
    if year:
        return int(year)
    if date:
        return int(date.split("-")[0])
    return 0


# Build the whole column in one positional pass instead of writing cell by cell
# through df.loc inside iterrows(): faster, and the column gets an integer dtype
# directly instead of being created by row-wise enlargement.
df["year2"] = [
    _release_year(year, date)
    for year, date in zip(df["year"], df["release_date"])
]


In [123]:
# Keep only movies released in 1990 or later; rows whose year could not be
# determined carry the placeholder 0 and are therefore dropped here as well.
released_since_1990 = df["year2"] >= 1990
df = df[released_since_1990]

In [124]:
def _five_year_bin(year):
    """Truncate a year to the start of its 5-year bucket (e.g. 1997 -> 1995)."""
    return int(year / 5) * 5


df["year_binned"] = df["year2"].apply(_five_year_bin)

In [129]:
df.columns

Index(['movieId', 'imdbId', 'tmdbId', 'data', 'title', 'content_rating',
       'original_title', 'metadata', 'release_date', 'director', 'url', 'year',
       'trailer', 'length', 'cast', 'imdb_id', 'rating', 'genre',
       'rating_count', 'storyline', 'description', 'writers', 'stars',
       'poster', 'budget', 'gross', 'languages', 'year2', 'year_binned'],
      dtype='object')

In [137]:
# Keep only rows that have both a storyline and a description (non-empty strings).
has_storyline = df["storyline"] != ""
has_description = df["description"] != ""
df = df[has_storyline & has_description]

In [138]:
df.shape

(9131, 29)

In [146]:
# Collect every distinct genre label that appears in the "genre" column.
all_genres = set()
# The comprehension runs only for its side effect: set.add() returns None, so the
# result is lists of None, and .head() merely keeps the echoed output short.
df["genre"].apply(lambda x: [all_genres.add(j) for j in x]).head()

0    [None, None, None]
1    [None, None, None]
2          [None, None]
3    [None, None, None]
4    [None, None, None]
Name: genre, dtype: object

In [221]:
# Collect every distinct language that appears in the "languages" column.
all_lang = set()
# As with genres, the comprehension is used only for its side effect (set.add()
# returns None); .head() keeps the echoed output short.
df["languages"].apply(lambda x: [all_lang.add(j) for j in x]).head()

0          [None]
1    [None, None]
2          [None]
3          [None]
4          [None]
Name: languages, dtype: object

In [224]:
len(all_lang)

174

In [222]:
# NOTE(review): the lambda ignores its argument, so every row receives the same
# full list of all languages (a fresh copy per row) — confirm this is intended.
df["all_languages"] = df["languages"].apply(lambda x: list(all_lang))

In [151]:
df.columns

Index(['movieId', 'imdbId', 'tmdbId', 'data', 'title', 'content_rating',
       'original_title', 'metadata', 'release_date', 'director', 'url', 'year',
       'trailer', 'length', 'cast', 'imdb_id', 'rating', 'genre',
       'rating_count', 'storyline', 'description', 'writers', 'stars',
       'poster', 'budget', 'gross', 'languages', 'year2', 'year_binned',
       'all_genres'],
      dtype='object')

In [159]:
# Ratings arrive as text; convert the whole column to floats.
df["rating"] = df["rating"].astype(float)

In [166]:
# Vote counts use "," as a thousands separator; strip it and convert to int.
df["rating_count"] = df["rating_count"].map(
    lambda count: int(count.replace(",", ""))
)

In [169]:
# Running time as an integer; an empty/falsy value becomes 0.
df["length"] = df["length"].map(lambda minutes: int(minutes or 0))

In [209]:
# Scratch check of the money-parsing pattern on a sample value.
# The leading "&#8364;" is an HTML entity whose digits also match the pattern,
# so the amount itself is the LAST match. The pattern is a raw string so that
# "\d" is not treated as an (invalid) string escape sequence.
x = '&#8364;90,000              (estimated)'
re.findall(r"([\d,]+)", x)[-1]

'90,000'

In [207]:
df["budget"].loc[27272]

'&#8364;90,000              (estimated)'

In [214]:
# A run of digits and thousands separators. Raw string: "\d" in a normal string
# literal is an invalid escape sequence.
_BUDGET_NUMBER = re.compile(r"[\d,]+")


def _parse_budget(raw):
    """Extract the budget amount from a raw text value as an int.

    The LAST digit run is used so that a leading HTML entity such as
    "&#8364;" (whose code also matches the pattern) is skipped.
    Returns 0 when the text contains no digits at all.
    """
    matches = _BUDGET_NUMBER.findall(raw)
    digits = matches[-1].replace(",", "") if matches else ""
    return int(digits) if digits else 0


df["budget"] = df["budget"].apply(_parse_budget)

In [217]:
# A run of digits and thousands separators. Raw string: "\d" in a normal string
# literal is an invalid escape sequence.
_GROSS_NUMBER = re.compile(r"[\d,]+")
# Numeric HTML character references such as "&#8364;" (seen in the budget
# column); their digits must not be mistaken for the amount.
_HTML_NUMERIC_ENTITY = re.compile(r"&#\d+;")


def _parse_gross(raw):
    """Extract the gross amount from a raw text value as an int.

    The FIRST digit run is used (anything after the amount is ignored), after
    removing numeric HTML entities so that a currency prefix like "&#8364;"
    is not parsed as the number 8364. Returns 0 when no digits are present.
    """
    matches = _GROSS_NUMBER.findall(_HTML_NUMERIC_ENTITY.sub("", raw))
    digits = matches[0].replace(",", "") if matches else ""
    return int(digits) if digits else 0


df["gross"] = df["gross"].apply(_parse_gross)

In [223]:
df.head().T

Unnamed: 0,0,1,2,3,4
movieId,1,2,3,4,5
imdbId,114709,113497,113228,114885,113041
tmdbId,862,8844,15602,31357,11862
data,"{'title': 'Toy Story', 'content_rating': 'G', ...","{'title': 'Jumanji', 'content_rating': 'PG', '...","{'title': 'Grumpier Old Men', 'content_rating'...","{'title': 'Waiting to Exhale', 'content_rating...","{'title': 'Father of the Bride Part II', 'cont..."
title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II
content_rating,G,PG,PG-13,R,PG
original_title,,,,,
metadata,"{'languages': ['English'], 'asp_retio': '1.85 ...","{'languages': ['English', 'French'], 'asp_reti...","{'languages': ['English'], 'asp_retio': '1.85 ...","{'languages': ['English'], 'asp_retio': '1.85 ...","{'languages': ['English'], 'asp_retio': '1.85 ..."
release_date,1995-11-22,1995-12-15,1995-12-22,1995-12-22,1995-12-08
director,John Lasseter,Joe Johnston,Howard Deutch,Forest Whitaker,Charles Shyer


In [252]:
# Drop movies whose genre list is empty.
has_genre = df["genre"].map(len) > 0
df = df[has_genre]

In [253]:
# Columns carried forward into the preprocessed dataset.
preprocessed_columns = [
    "movieId", "imdbId", "title", "storyline", "description", "genre",
    "year2", "year_binned", "rating", "rating_count", "length",
    "budget", "gross", "languages",
]
# Take an explicit copy: new columns are added to this frame further down, and
# assigning into a bare selection of another frame triggers pandas'
# SettingWithCopyWarning.
df_preprocessed = df[preprocessed_columns].copy()

In [254]:
# Persist the cleaned table: a CSV without the index column, plus a pickle.
df_preprocessed.to_csv("movies_data_preprocessed.csv", index=False)
df_preprocessed.to_pickle("movies_data_preprocessed.pkl")

In [255]:
df_preprocessed.head()

Unnamed: 0,movieId,imdbId,title,storyline,description,genre,year2,year_binned,rating,rating_count,length,budget,gross,languages
0,1,114709,Toy Story,A little boy named Andy loves to be in his roo...,A cowboy doll is profoundly threatened and jea...,"[Animation, Adventure, Comedy]",1995,1995,8.3,701041,81,30000000,191796233,[English]
1,2,113497,Jumanji,After being trapped in a jungle board game for...,When two kids find and play a magical board ga...,"[Action, Adventure, Family]",1995,1995,6.9,234752,104,50000000,100475249,"[English, French]"
2,3,113228,Grumpier Old Men,Things don't seem to change much in Wabasha Co...,John and Max resolve to save their beloved bai...,"[Comedy, Romance]",1995,1995,6.6,20854,101,25000000,69870000,[English]
3,4,114885,Waiting to Exhale,This story based on the best selling novel by ...,"Based on Terry McMillan's novel, this film fol...","[Comedy, Drama, Romance]",1995,1995,5.7,8025,124,16000000,67052156,[English]
4,5,113041,Father of the Bride Part II,"In this sequel to ""Father of the Bride"", Georg...",George Banks must deal not only with the pregn...,"[Comedy, Family, Romance]",1995,1995,5.9,28834,106,30000000,76594000,[English]


In [258]:
# Continue on the slimmed-down frame (df now names the same object as df_preprocessed).
df = df_preprocessed

# Distinct genre labels across all remaining movies.
all_genres = {label for labels in df["genre"] for label in labels}

df["genre"].shape

(9127,)

In [268]:
# Map each genre label to its slot in the multi-hot vector.
# Slots follow alphabetical order rather than set iteration order: iteration
# order of a set of strings can differ between kernel sessions, which would
# make previously saved encodings impossible to decode.
genres_map = {genre: slot for slot, genre in enumerate(sorted(all_genres))}
        

In [275]:
def map_genres(genres):
    """Multi-hot encode a list of genre labels.

    Returns a list of 0/1 ints with one slot per entry of `all_genres`; the
    slot for each given label is looked up in `genres_map`. An unknown label
    raises KeyError.
    """
    active_slots = {genres_map[label] for label in genres}
    return [int(slot in active_slots) for slot in range(len(all_genres))]

# Multi-hot genre vector for every movie.
df["genre_encoded"] = df["genre"].map(map_genres)

In [291]:
# Distinct 5-year bins present in the data.
all_years = set(df["year_binned"].unique())
# Map each bin to its slot in the one-hot vector. Sorting makes the slots
# chronological and independent of the set's internal iteration order.
year_map = {year: slot for slot, year in enumerate(sorted(all_years))}
        
def map_year(y):
    """One-hot encode a binned year.

    Returns a list of 0/1 ints with one slot per entry of `all_years`; the
    slot for `y` is looked up in `year_map`. An unknown year raises KeyError.
    """
    hot_slot = year_map[y]
    return [1 if slot == hot_slot else 0 for slot in range(len(all_years))]

In [292]:
# One-hot vector of the 5-year bin for every movie.
df["year_encoded"] = df["year_binned"].map(map_year)

In [295]:
# Label rows by movieId so a movie can be looked up with .loc[movieId].
df.index = df["movieId"]

In [308]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the movie descriptions: English stop words are removed and the
# vocabulary is capped at 200 terms (max_features).
tfidf_vect = TfidfVectorizer(stop_words = 'english',max_features=200)
# Fit on all descriptions and transform them in one step; one row per movie,
# in the same order as df.
text_features = tfidf_vect.fit_transform(df['description'].values)
# Convert the vectorizer output to a regular 2-D numpy array so rows can be indexed directly.
text_features_array = text_features.toarray()

In [309]:
def tf_idf_score(text):
    """Return the TF-IDF vector of `text` using the already fitted `tfidf_vect`.

    Stateless: the result depends only on `text`, not on how many times or in
    what order the function has been called.
    """
    return tfidf_vect.transform([text]).toarray()[0]


# Row k of text_features_array belongs to row k of df (the matrix was built
# from df['description'] in order), so attach the rows positionally instead of
# stepping a global counter from inside .apply().
df["description_encoded"] = pd.Series(list(text_features_array), index=df.index)

In [317]:
# Keep only the id and the three encoded feature columns.
feature_columns = ["movieId", "description_encoded", "genre_encoded", "year_encoded"]
df_features = df[feature_columns]

In [318]:
# Persist the feature table in two formats.
# NOTE(review): the index was set to movieId earlier and to_csv writes the index
# by default, so the CSV carries movieId twice; the list/array cells are written
# as text — presumably the pickle is the copy meant for reloading. Confirm.
df_features.to_csv("movie_features.csv")
df_features.to_pickle("movie_features.pkl")

In [314]:
def get_features(movieid):
    """Look up one movie in `df_features` by its index label.

    Returns a 3-tuple in the order (description_encoded, year_encoded,
    genre_encoded).
    """
    record = df_features.loc[movieid]
    wanted = ("description_encoded", "year_encoded", "genre_encoded")
    return tuple(record[column] for column in wanted)

get_features(1)


(array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.78649931,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 