<a href="https://colab.research.google.com/github/khamkarajinkya/Recommender-Practice/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from google.colab import drive 
import logging
import string as st
import re
import nltk
from nltk import PorterStemmer, WordNetLemmatizer
# Fetch the NLTK corpora this notebook relies on: nltk.corpus.stopwords is read
# by remove_stopwords; 'wordnet' is presumably what WordNetLemmatizer needs
# (TODO confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('wordnet')


# Notebook-level logger set to INFO. No handler is attached in this cell.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Mount Google Drive into the Colab filesystem so the dataset files are readable.
drive.mount('/content/gdrive')
# Root folder of the dataset. Later cells build file paths by plain string
# concatenation, so the trailing slash is required.
base_path = '/content/gdrive/My Drive/Data/movie-lens/'

Mounted at /content/gdrive


In [None]:
# Count the number of unique tokens for each categorical feature.
# `feature_count` maps a feature name to its number of distinct values.

feature_count = {}

# ratings: distinct users and distinct rated movies
rating = pd.read_csv(base_path + 'full/rating.csv')
feature_count['userId'] = rating.userId.nunique()
feature_count['movieId'] = rating.movieId.nunique()

# genome tags: distinct tag ids
genome_tags = pd.read_csv(base_path + 'full/genome_tags.csv')
feature_count['tagId'] = genome_tags.tagId.nunique()

# genres: every movie carries a '|'-separated genre string, so split it,
# flatten to one genre per row and count the distinct labels. The placeholder
# label '(no genres listed)' is counted like any other genre.
# The file name matches the later cell that loads 'full/movie.csv'.
movie = pd.read_csv(base_path + 'full/movie.csv')
feature_count['genres'] = movie.genres.str.split('|').explode().nunique()

In [None]:
# Genome tag table (columns tagId, tag). Despite its name, `movies` holds tags,
# not movies; later cells add a 'clean_tag' column to it.
movies = pd.read_csv(base_path + 'full/genome_tags.csv')

In [None]:
# Movie metadata table (columns movieId, title, genres).
movie = pd.read_csv(base_path + 'full/movie.csv')

In [None]:
# Split every raw title into a cleaned 'title' and a 'year' column.
# extract_date_from_title returns a two-element Series per row, so apply()
# yields a two-column frame that is written back onto `movie`.
# NOTE: extract_date_from_title is defined in a later cell and must be run first.
# NOTE: this overwrites 'title'; re-running the cell on titles that no longer
# carry a year marker assigns the fallback year to them.
movie[['title', 'year']] = movie.title.apply(extract_date_from_title)

In [None]:
# Display the frame to check the result of the title/year split.
movie

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
27273,131254,Kein Bund für's Leben,Comedy,2007
27274,131256,"Feuer, Eis & Dosenbier",Comedy,2002
27275,131258,The Pirates,Adventure,2014
27276,131260,Rentun Ruusu,(no genres listed),2001


In [None]:
# Text-cleaning helpers used to normalise movie titles and tag strings.

def extract_date_from_title(text):
  """Split a title such as ``'Toy Story (1995)'`` into name and release year.

  The year is read from the LAST parenthesised four-digit number, and only
  that marker is removed. Parenthesised numbers that belong to the name
  itself (e.g. ``'(500) Days of Summer (2009)'``) are therefore preserved.

  Args:
    text: Raw title string.

  Returns:
    pd.Series indexed by ``['title', 'year']``. ``title`` is the input with
    the year marker removed and whitespace normalised; ``year`` is an int.
    When no year marker is present the title is returned unchanged and the
    year falls back to 1995.
  """
  year_markers = list(re.finditer(r'\((\d{4})\)', text))
  if not year_markers:
    # NOTE(review): 1995 is an arbitrary placeholder for "unknown year";
    # kept so existing callers see the same fallback value.
    return pd.Series([text, 1995], index = ['title','year'])

  marker = year_markers[-1]
  # Cut out only the year marker, then collapse the whitespace left behind.
  remainder = text[:marker.start()] + text[marker.end():]
  title = ' '.join(remainder.split())
  return pd.Series([title, int(marker.group(1))], index = ['title','year'])

def drop_special_chars(text):
  """Keep only ASCII letters and digits, joining the surviving runs with one space.

  Every run of characters outside ``A-Z``, ``a-z`` and ``0-9`` acts as a
  separator. The result has no leading, trailing or repeated spaces.
  """
  alnum_runs = [run for run in re.split('[^A-Za-z0-9]+', text) if run]
  return ' '.join(alnum_runs)

def remove_punct(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed."""
    kept = []
    for ch in text:
        if ch in st.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)
  
def tokenize(text):
    """Lower-case ``text`` and split it into whitespace-separated tokens.

    Leading, trailing and repeated whitespace never produces empty tokens,
    and an empty or all-whitespace string yields an empty list.

    Args:
        text: String to tokenise.

    Returns:
        List of lower-cased, non-empty token strings.
    """
    # str.split() with no separator splits on whitespace runs and drops empty
    # pieces, which also avoids the invalid '\s' escape of a non-raw regex.
    return text.lower().split()

def remove_stopwords(text):
    """Drop English stop words from a list of tokens.

    Args:
        text: Iterable of token strings. Membership is an exact string
            comparison against NLTK's English stop-word list, so tokens
            should already be lower-cased.

    Returns:
        List of the tokens that are not stop words, in their original order.
    """
    # Load the list once per call (not once per token) and use a set so each
    # membership test is constant-time.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return [x for x in text if x not in stop_words]

def stemming(text):
    """Apply the Porter stemmer to every token and return the stems as a list."""
    stem = PorterStemmer().stem
    return list(map(stem, text))

def lemmatize(text):
    """Run every token through the WordNet lemmatizer and return the results as a list."""
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, text))

def apply_text_filter(x):
  """Run a raw string through the full text-cleaning pipeline.

  The stages run in a fixed order, each consuming the previous stage's
  output: special-character removal, punctuation removal, tokenisation,
  stop-word removal, stemming and lemmatisation.

  Args:
    x: Raw text string.

  Returns:
    The value produced by the final (lemmatisation) stage.
  """
  stages = (
    drop_special_chars,
    remove_punct,
    tokenize,
    remove_stopwords,
    stemming,
    lemmatize,
  )
  for stage in stages:
    x = stage(x)
  return x

In [None]:
# Quick manual check: the space and '%' between the two words collapse into a single space.
drop_special_chars('abc %bcsz')

'abc bcsz'

In [None]:
# Run every tag string through the text-cleaning pipeline and store the
# per-row result in a new 'clean_tag' column.
movies['clean_tag'] = movies.tag.apply(apply_text_filter)

In [None]:
# Inspect a slice of the tag table (positions 250-299) to eyeball the cleaned tags.
movies[250:300]

Unnamed: 0,tagId,tag,clean_tag
250,251,conspiracy,[conspiraci]
251,252,conspiracy theory,"[conspiraci, theori]"
252,253,controversial,[controversi]
253,254,cooking,[cook]
254,255,cool,[cool]
255,256,corny,[corni]
256,257,corporate america,"[corpor, america]"
257,258,corruption,[corrupt]
258,259,costume drama,"[costum, drama]"
259,260,courage,[courag]


In [None]:
# Manual token mapping: hand-written rules that fold variant spellings and
# near-duplicate labels onto one canonical token.
# Keys presumably correspond to raw genome tag strings - confirm against the
# code that consumes this mapping.
token_map = {
    '007 (series)': '007',
    '80s': '1980s',
    'aardman studios': 'aardman',
    'action packed': 'action',
    'adapted from:book': 'book',
    'adapted from:comic': 'comic',
    'adapted from:game': 'game',
    'afi 100 (laughs)': 'afi 100',
    'afi 100 (movie quotes)': 'afi 100',
    'alien invasion': 'alien',
    'aliens': 'alien',
    'animal movie': 'animals',
    'animation': 'animated',
    'art house': 'art',
    'artistic': 'art',
    'artsy': 'art',
    'artist': 'art',
}