<a href="https://colab.research.google.com/github/khamkarajinkya/Recommender-Practice/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from google.colab import drive 
import logging
import string as st
import re
import nltk
from nltk import PorterStemmer, WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
#mount drive
drive.mount('/content/gdrive')
base_path = '/content/gdrive/My Drive/Data/movie-lens/'

Mounted at /content/gdrive


In [None]:
'''
Count the number of unique tokens for each feature

'''

feature_count = {}

#ratings df
rating = pd.read_csv(base_path + 'full/rating.csv')
feature_count['userId']  = rating.userId.nunique()
feature_count['movieId'] = rating.movieId.nunique()

#genome tags
genome_tags = pd.read_csv(base_path + 'full/genome_tags.csv')
feature_count['tagId'] = genome_tags.tagId.nunique()

#genres
movie = pd.read_csv(base_path + 'full/movies.csv')
feature_count['genres'] = len(set([item for sublist in movie.genres.apply(lambda x : x.split('|')).tolist() for item in sublist]))

In [None]:
movies = pd.read_csv(base_path + 'full/genome_tags.csv')

In [None]:
movie = pd.read_csv(base_path + 'full/movie.csv')

In [None]:
movie[['title','year']] = movie.title.apply(lambda x : extract_date_from_title(x))

In [15]:
movie = pd.read_csv(base_path + 'filtered/title.principals.csv')

In [17]:
#movie = movie[movie['category'].notna()]
#movie.groupby('tconst')[['nconst','category']].agg(list)

movie.category.unique()

array(['self', 'director', 'editor', 'actor', 'writer', 'producer',
       'composer', 'cinematographer', 'actress', 'production_designer',
       'archive_footage', 'archive_sound'], dtype=object)

In [34]:
def extract_index 

Unnamed: 0,movieId,tagId,relevance,tag
0,1,1036,0.99925,[toy]
1,2,29,0.98100,[adventur]
2,3,451,0.97450,"[good, sequel]"
3,4,1116,0.97675,[woman]
4,5,451,0.96575,"[good, sequel]"
...,...,...,...,...
10376,130578,82,0.88325,[assassin]
10377,130840,863,0.96500,[romanc]
10378,131013,230,0.98425,[comedi]
10379,131168,128,0.97300,[betray]


In [58]:

#Build feature set 

'''
1. extract year from movie title
2. convert genres into a list, set missing values to `miss_genre`
'''
movie = pd.read_csv(base_path + 'full/movie.csv')
movie[['title','year']] = movie.title.apply(lambda x : extract_date_from_title(x))
movie['genres'] = movie.genres.apply(lambda x : 'miss_genre' if x == '(no genres listed)' else x)
movie['genres'] = movie.genres.apply(lambda x : x.split('|'))

'''
1. Identify relevant tags per movie
2. Normalize and standardize movie-tags
3. join movies with normalized tags
'''
tag_score = pd.read_csv(base_path + 'full/genome_scores.csv')
tag_score = tag_score.loc[tag_score.groupby("movieId")["relevance"].idxmax()]
tag = pd.read_csv(base_path + 'full/genome_tags.csv')
tag['tag'] = tag.tag.apply(lambda x : apply_text_filter(x))
tag_score = pd.merge(tag_score, tag, on = 'tagId', how = 'left')
tag_score = tag_score[['movieId','tag']]
'''
1. Identify the actor
2. Identify the actress
3. Identify the editor
4. Identify the producter
5. Identify the composer
6. Identify the cinematographer
'''
characters = pd.read_csv(base_path + 'filtered/title.principals.csv')

actor   = characters[characters['category'] == 'actor'].groupby('tconst')['nconst'].agg(list).reset_index().rename(columns = {'nconst':'actor'})
actress = characters[characters['category'] == 'actress'].groupby('tconst')['nconst'].agg(list).reset_index().rename(columns = {'nconst':'actress'})
editor  = characters[characters['category'] == 'editor'].groupby('tconst')['nconst'].agg(list).reset_index().rename(columns = {'nconst':'editor'})
producer= characters[characters['category'] == 'producer'].groupby('tconst')['nconst'].agg(list).reset_index().rename(columns = {'nconst':'producer'})
composer= characters[characters['category'] == 'composer'].groupby('tconst')['nconst'].agg(list).reset_index().rename(columns = {'nconst':'composer'})
cinematographer= characters[characters['category'] == 'cinematographer'].groupby('tconst')['nconst'].agg(list).reset_index().rename(columns = {'nconst':'cinematographer'})

'''
Get director and writer information
'''

movie = pd.merge(movie, tag_score, on = 'movieId', how = 'left')
movie = pd.merge(movie, actor,     left_on = 'movieId', right_on = 'tconst',how = 'left').drop(['tconst'],axis = 1)
movie = pd.merge(movie, actress,   left_on = 'movieId', right_on = 'tconst',how = 'left').drop(['tconst'],axis = 1)
movie = pd.merge(movie, producer,  left_on = 'movieId', right_on = 'tconst',how = 'left').drop(['tconst'],axis = 1)
movie = pd.merge(movie, editor,    left_on = 'movieId', right_on = 'tconst',how = 'left').drop(['tconst'],axis = 1)
movie = pd.merge(movie, composer,  left_on = 'movieId', right_on = 'tconst',how = 'left').drop(['tconst'],axis = 1)
movie = pd.merge(movie, cinematographer,  left_on = 'movieId', right_on = 'tconst',how = 'left').drop(['tconst'],axis = 1)

movie


Unnamed: 0,movieId,title,genres,year,tag,actor,actress,producer,editor,composer_x,composer_y
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,[toy],,,,,,
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,[adventur],,,,,,
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,"[good, sequel]",,,,,,
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,[woman],,,,,,
4,5,Father of the Bride Part II,[Comedy],1995,"[good, sequel]","[nm0443482, nm0653042]",,[nm0249379],,,
...,...,...,...,...,...,...,...,...,...,...,...
27273,131254,Kein Bund für's Leben,[Comedy],2007,,,,,,,
27274,131256,"Feuer, Eis & Dosenbier",[Comedy],2002,,,,,,,
27275,131258,The Pirates,[Adventure],2014,,,,,,,
27276,131260,Rentun Ruusu,[miss_genre],2001,,,,,,,


In [19]:
#creating manual token mapping 

def extract_date_from_title(text):
  grp = re.search(r'\((\d+)\)', text)
  if grp is not None:
    return pd.Series([re.sub(r'\((\d+)\)', '', text),int(grp.group(1))], index = ['title','year'])
  return pd.Series([text,1995], index = ['title','year'])

def drop_special_chars(text):
  text = re.sub('[^A-Za-z0-9]+', ' ', text)
  return ' '.join(text.split())

def remove_punct(text):
    return ("".join([x for x in text if x not in st.punctuation]))
  
def tokenize(text):
    text = re.split('\s+' ,text)
    return [x.lower() for x in text]

def remove_stopwords(text):
    return [x for x in text if x not in nltk.corpus.stopwords.words('english')]

def stemming(text):
    ps = PorterStemmer()
    return [ps.stem(x) for x in text]

def lemmatize(text):
    word_net = WordNetLemmatizer()
    return [word_net.lemmatize(x) for x in text]

def apply_text_filter(x):

  x = drop_special_chars (x)
  x = remove_punct(x)
  x = tokenize(x)
  x = remove_stopwords(x)
  x = stemming(x)
  x = lemmatize(x)

  return x

In [None]:
drop_special_chars('abc %bcsz')

'abc bcsz'

In [None]:
movies['clean_tag'] = movies.tag.apply(lambda x : apply_text_filter(x))

In [43]:
tt = pd.read_csv(base_path + 'full/title.akas.tsv', sep='\t')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [48]:
tt[0:100].titleId.apply(lambda x : int(x.replace('tt','')))

0      1
1      1
2      1
3      1
4      1
      ..
95    10
96    10
97    10
98    10
99    10
Name: titleId, Length: 100, dtype: int64

In [52]:
mlens = pd.read_csv(base_path + 'full/link.csv')
mlens['imdbId'] = mlens['imdbId'].apply(lambda x : 'tt'+str(x))
mlens_indx = mlens.set_index('imdbId').index
mlens.sort_values('imdbId')

Unnamed: 0,movieId,imdbId,tmdbId
25546,120869,tt10,774.0
10011,32943,tt100024,36843.0
8992,26696,tt100029,9399.0
21649,104849,tt100031,95730.0
3622,3713,tt100046,51763.0
...,...,...,...
8991,26695,tt99951,21344.0
6253,6352,tt99969,117269.0
26014,124531,tt99991,123770.0
17816,89490,tt999913,64639.0


In [None]:
token_map = {
    
  '007 (series)' : '007',
  '80s' : '1980s',
  'aardman studios':'aardman',
  'action packed' : 'action',
  'adapted from:book' : 'book',
  'adapted from:comic' : 'comic',
  'adapted from:game' : 'game',
  'afi 100 (laughs)' : 'afi 100',
  'afi 100 (movie quotes)' : 'afi 100',
  'alien invasion' : 'alien',
  'aliens' : 'alien',
  'animal movie' : 'animals',
  'animation' : 'animated',
  'art house' : 'art',
  'artistic' : 'art',
  'artsy' : 'art',
  'artist': 'art'
}