# scratch work for building cage recommendation model

### Spellchecker using NLTK SpellChecker

In [7]:
# spell correction
import numpy as np
from spellchecker import SpellChecker
%config IPCompleter.greedy=True

spell = SpellChecker()

misspelled = spell.unknown(['something', 'is', 'happennning', 'here'])

for word in misspelled:
    print(spell.correction(word))

happening


### Synonym generator using NLTK Corpus

```python
# install NLTK modules
import nltk
nltk.download()
```

In [8]:
# identifying synonyms
from nltk.corpus import wordnet as wn

for syn in wn.synsets('mad'):
    print("Definition: " + syn.definition())
    for l in syn.lemmas():
        print(l.name())
    print('\n')

Definition: roused to anger; - Mark Twain
huffy
mad
sore


Definition: affected with madness or insanity
brainsick
crazy
demented
disturbed
mad
sick
unbalanced
unhinged


Definition: marked by uncontrolled excitement or emotion
delirious
excited
frantic
mad
unrestrained


Definition: very foolish
harebrained
insane
mad




### Begin wrangling data and combining the datasets

In [10]:
# import data files
import pandas as pd

### don't need to import anymore after creating cage_movies filtered csv
# genome_tags = pd.read_csv("data/genome-tags.csv")
# genome_scores = pd.read_csv("data/genome-scores.csv")
# cage_movies = pd.read_csv("data/nic-cage.csv")
# movies = pd.read_csv("data/movies.csv")

tags = pd.read_csv("data/tags.csv")
ratings = pd.read_csv("data/ratings.csv")

In [11]:
# get rid of the year information from the title because already contained in cage_movies.csv
res = movies['title'].str.replace(' \([0-9]*\)$', '', regex = True)
movies['title_trimmed'] = res

# remove the trailing space for the movie title
movies = movies.applymap(lambda x: str(x).rstrip())

# get the ", The" at the end of movies[title] and append to front
movies.loc[movies['title_trimmed'].str.contains(', The$'),'title_trimmed'] = 'The ' + movies['title_trimmed']
movies['title_trimmed'] = movies['title_trimmed'].str.replace(', The$', '', regex = True)
movies['title_trimmed'] = movies['title_trimmed'].str.replace(',The$', '', regex = True)

# remove the trailing space for the movie title
movies = movies.applymap(lambda x: str(x).rstrip())

# "Sorcerer's Apprentice" is misspelled 'Sorceror'
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Sorceror', 'Sorcerer')

# Remove period from "Adaption."
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Adaptation.', 'Adaptation', regex = False)

# "Gone in 60 Seconds" is spelled out 'Sixty' in cage_movies
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Sixty', '60')

# "Amos & Andrew" is spelled "Amos and Andrew" in cage_movies
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Amos and', 'Amos &')

# "Time to Kill" needs the Italian suffix in cage_movies
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Time to Kill', 'Time to Kill (Tempo di uccidere)')

# "Best of Times" needs 'The ' prefixed in cage_movies
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Best of Times', 'The Best of Times')

In [12]:
# left join the movie information to the cage_movies dataset using title as key
cage_movies.Movie = cage_movies.Movie.astype(str)
movies.title = movies.title.astype(str)
cage = pd.merge(cage_movies, movies, left_on = 'Movie', right_on = 'title_trimmed', how = 'left')

In [13]:
# check for NaN values
null_columns = cage.columns[cage.isnull().any()]
print(cage[cage.isnull().any(axis=1)])

# Never on Tuesday is NA but it isn't included in the movies dataset so delete it from entire dataset
cage = cage.dropna()
cage = cage.drop(columns = ['title', 'title_trimmed'])

               Movie Rating              Character  Voice  Year  \
99  Never on Tuesday      R  Man in Red Sports Car      0  1989   

   RottenTomatoes movieId title genres title_trimmed  
99              X     NaN   NaN    NaN           NaN  


In [14]:
# filter ratings to only cage movies by merging with cage df
ratings.shape #(25,000,095, 4)
keys = list(cage['movieId'].values) # create list of Cage movieId
ratings_cage = ratings[ratings.movieId.isin(keys)] # filter to only cage movies
ratings_cage.shape #(262,762, 4)

(262762, 4)

In [16]:
# case movieId to int
cage.movieId = cage.movieId.astype(int)
ratings_cage.movieId = ratings_cage.movieId.astype(int)

# import the names of movies to cage_ratings
ratings_cage = pd.merge(ratings_cage, cage[['movieId','Movie', 'Year', 'RottenTomatoes']], on = 'movieId', how = 'left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [22]:
# set types for ratings_cage
ratings_cage.Movie = ratings_cage.Movie.astype(str)

# export cage ratings to csv
path = '/Users/Kevin/cage-search/data/cage_ratings.csv'
ratings_cage.to_csv(path, index = False)

In [23]:
# import cage_ratings
cage_ratings = pd.read_csv('data/cage_ratings.csv')

In [24]:
# merge cage with genome_scores to get associated genome_tags
genome_scores.movieId = genome_scores.movieId.astype(int)
cage_merge = pd.merge(cage, genome_scores, on = 'movieId', how = 'inner')

# get rid of suffix for the join columns
cage_merge.rename(columns={"tagId_x": "tagId", "relevance_x": "relevance"})

# merge with genome_tags to get descriptors
cage_merge.tagId = cage_merge.tagId.astype(int)
genome_tags.tagId = genome_tags.tagId.astype(int)

# merge
cage = pd.merge(cage_merge, genome_tags, on = 'tagId', how = 'inner')

# rename the tags column as genome_tags not to get mixed up 
# with tags column from tags.csv
cage = cage.rename(columns ={"tag": "genome_tag"})

# cast tag as string
cage.genome_tag = cage.genome_tag.astype(str)

In [25]:
# check the largest values for relevance to see if they match up well with the movies
cage.nlargest(10, 'relevance')

Unnamed: 0,Movie,Rating,Character,Voice,Year,RottenTomatoes,movieId,genres,tagId,relevance,genome_tag
35587,Drive Angry,R,Milton,0,2011,47,84942,Action|Fantasy|Thriller,457,1.0,goretastic
81393,National Treasure,PG,Benjamin Franklin Gates,0,2004,45,8972,Action|Adventure|Drama|Mystery|Thriller,1044,0.99975,treasure hunt
17464,Raising Arizona,PG-13,H.I. McDunnough,0,1987,91,1394,Comedy,224,0.9995,coen bros
8378,Ghost Rider,PG-13,Johnny Blaze and Ghost Rider,0,2007,27,51077,Action|Fantasy|Thriller,108,0.99925,based on a comic
81383,National Treasure: Book of Secrets,PG,Benjamin Franklin Gates,0,2007,35,56775,Action|Adventure,1044,0.99925,treasure hunt
69870,Guarding Tess,PG-13,Doug Chesnic,0,1994,56,6992,Comedy|Drama,896,0.999,secret service
76940,Ghost Rider,PG-13,Johnny Blaze and Ghost Rider,0,2007,27,51077,Action|Fantasy|Thriller,987,0.9985,super hero
14398,Gone in 60 Seconds,PG-13,Memphis Raines,0,2000,25,3717,Action|Crime,185,0.99825,cars
48115,Wild at Heart,R,Sailor Ripley,0,1990,65,7044,Crime|Drama|Mystery|Romance|Thriller,617,0.99825,lynch
77086,Kick-Ass,R,Damon Macready and Big Daddy,0,2010,75,76251,Action|Comedy,989,0.998,superhero


In [26]:
# export dataset to csv file in the /data/ directory
# import os
# import sys

# # hoops to jump through to get relative working directory
# abs_dir = sys.path[0]
# rel_dir = os.path.join(abs_dir, '../data')
# data = ''.join([rel_dir, '/cage_movies.csv'])

# # export to csv
# cage.to_csv(data, index = False)

path = '/Users/Kevin/cage-search/data/cage_movies.csv'
cage.to_csv(path, index = False)

In [27]:
# import cage_movies as movs so as to not have to work with huge files
movs = pd.read_csv('data/cage_movies.csv')

Unnamed: 0,userId,movieId,rating,timestamp,Movie,Year,RottenTomatoes
0,2,733,4.5,1141415905,The Rock,1996,66
1,2,4023,3.0,1141417651,The Family Man,2000,53
2,3,4023,4.0,1439473162,The Family Man,2000,53
3,3,6708,4.0,1566090171,Matchstick Men,2003,82
4,3,36529,4.0,1453904094,Lord of War,2005,62
