# scratch work for building cage recommendation model

### Spellchecker using NLTK SpellChecker

In [32]:
# spell correction
import numpy as np
from spellchecker import SpellChecker
%config IPCompleter.greedy=True

spell = SpellChecker()

misspelled = spell.unknown(['something', 'is', 'happennning', 'here'])

for word in misspelled:
    print(spell.correction(word))

happening


### Synonym generator using NLTK Corpus

In [2]:
# identifying synonyms
from nltk.corpus import wordnet as wn

for syn in wn.synsets('mad'):
    print("Definition: " + syn.definition())
    for l in syn.lemmas():
        print(l.name())
    print('\n')

Definition: roused to anger; - Mark Twain
huffy
mad
sore


Definition: affected with madness or insanity
brainsick
crazy
demented
disturbed
mad
sick
unbalanced
unhinged


Definition: marked by uncontrolled excitement or emotion
delirious
excited
frantic
mad
unrestrained


Definition: very foolish
harebrained
insane
mad




### Begin wrangling data and combining the datasets

In [1]:
# import data files
import pandas as pd
import re

tags = pd.read_csv("data/tags.csv")
genome_tags = pd.read_csv("data/genome-tags.csv")
genome_scores = pd.read_csv("data/genome-scores.csv")
cage_movies = pd.read_csv("data/nic-cage.csv")
movies = pd.read_csv("data/movies.csv")

In [2]:
# get rid of the year information from the title because already contained in cage_movies.csv
res = movies['title'].str.replace(' \([0-9]*\)$', '', regex = True)
movies['title_trimmed'] = res

# remove the trailing space for the movie title
movies = movies.applymap(lambda x: str(x).rstrip())

# get the ", The" at the end of movies[title] and append to front
movies.loc[movies['title_trimmed'].str.contains(', The$'),'title_trimmed'] = 'The ' + movies['title_trimmed']
movies['title_trimmed'] = movies['title_trimmed'].str.replace(', The$', '', regex = True)
movies['title_trimmed'] = movies['title_trimmed'].str.replace(',The$', '', regex = True)

# remove the trailing space for the movie title
movies = movies.applymap(lambda x: str(x).rstrip())

# "Sorcerer's Apprentice" is misspelled 'Sorceror'
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Sorceror', 'Sorcerer')

# Remove period from "Adaption."
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Adaptation.', 'Adaptation', regex = False)

# "Gone in 60 Seconds" is spelled out 'Sixty' in cage_movies
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Sixty', '60')

# "Amos & Andrew" is spelled "Amos and Andrew" in cage_movies
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Amos and', 'Amos &')

# "Time to Kill" needs the Italian suffix in cage_movies
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Time to Kill', 'Time to Kill (Tempo di uccidere)')

# "Best of Times" needs 'The ' prefixed in cage_movies
cage_movies['Movie'] = cage_movies['Movie'].str.replace('Best of Times', 'The Best of Times')

In [3]:
# left join the movie information to the cage_movies dataset using title as key
cage_movies.Movie = cage_movies.Movie.astype(str)
movies.title = movies.title.astype(str)
cage = pd.merge(cage_movies, movies, left_on = 'Movie', right_on = 'title_trimmed', how = 'left')

In [4]:
# check for NaN values
null_columns = cage.columns[cage.isnull().any()]
print(cage[cage.isnull().any(axis=1)])

               Movie Rating              Character  Voice  Year  \
99  Never on Tuesday      R  Man in Red Sports Car      0  1989   

   RottenTomatoes movieId title genres title_trimmed  
99              X     NaN   NaN    NaN           NaN  


Unnamed: 0,Movie,Rating,Character,Voice,Year,RottenTomatoes,movieId,title,genres,title_trimmed
0,A Score to Settle,NR,Frank Pierce,0,2019,13,204656,A Score to Settle (2019),Action|Drama|Thriller,A Score to Settle
1,Spider-Man: Into the Spider-Verse,PG,Spider-Man Noir,1,2018,97,195159,Spider-Man: Into the Spider-Verse (2018),Action|Adventure|Animation|Sci-Fi,Spider-Man: Into the Spider-Verse
2,Between Worlds,R,Joe,0,2018,X,194991,Between Worlds (2018),Action|Mystery|Thriller,Between Worlds
3,Teen Titans Go! To the Movies,PG,Superman,1,2018,91,191687,Teen Titans Go! To the Movies (2018),Animation|Children,Teen Titans Go! To the Movies
4,211,R,Mike Chandler,0,2018,5,187713,211 (2018),(no genres listed),211
...,...,...,...,...,...,...,...,...,...,...
107,Racing with the Moon,PG,Nicky,0,1984,60,26521,Racing with the Moon (1984),Comedy|Drama|Romance,Racing with the Moon
108,Rumble Fish,R,Smokey,0,1983,70,26485,Rumble Fish (1983),Drama,Rumble Fish
109,Valley Girl,R,Randy,0,1983,82,6638,Valley Girl (1983),Comedy|Romance,Valley Girl
110,Fast Times at Ridgemont High,R,Brad's Bud,0,1982,78,3210,Fast Times at Ridgemont High (1982),Comedy|Drama|Romance,Fast Times at Ridgemont High


In [6]:
# merge with genome_scores to get associated genome_tags
cage.movieId = cage.movieId.astype(int)
genome_scores.movieId = genome_scores.movieId.astype(int)
cage_merge = pd.merge(cage, genome_scores, on = 'movieId', how = 'inner')
# get rid of suffix for the join columns
cage_merge.rename(columns={"tagId_x": "tagId", "relevance_x": "relevance"})

# merge with genome_tags to get descriptors
cage_merge.tagId = cage_merge.tagId.astype(int)
genome_tags.tagId = genome_tags.tagId.astype(int)
cage = pd.merge(cage_merge, genome_tags, on = 'tagId', how = 'inner')

# Never on Tuesday is NA but it isn't included in the movies dataset so delete it from entire dataset
cage = cage.dropna()
cage = cage.drop(columns = ['title', 'title_trimmed'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [18]:
# check the largest values for relevance to see if they match up well with the movies
cage.nlargest(1000, 'relevance')

Unnamed: 0,Movie,Rating,Character,Voice,Year,RottenTomatoes,movieId,genres,tagId,relevance,tag
35587,Drive Angry,R,Milton,0,2011,47,84942,Action|Fantasy|Thriller,457,1.00000,goretastic
81393,National Treasure,PG,Benjamin Franklin Gates,0,2004,45,8972,Action|Adventure|Drama|Mystery|Thriller,1044,0.99975,treasure hunt
17464,Raising Arizona,PG-13,H.I. McDunnough,0,1987,91,1394,Comedy,224,0.99950,coen bros
8378,Ghost Rider,PG-13,Johnny Blaze and Ghost Rider,0,2007,27,51077,Action|Fantasy|Thriller,108,0.99925,based on a comic
81383,National Treasure: Book of Secrets,PG,Benjamin Franklin Gates,0,2007,35,56775,Action|Adventure,1044,0.99925,treasure hunt
...,...,...,...,...,...,...,...,...,...,...,...
47477,Con Air,R,Cameron Poe,0,1997,55,1552,Action|Adventure|Thriller,609,0.77275,lone hero
77602,Raising Arizona,PG-13,H.I. McDunnough,0,1987,91,1394,Comedy,995,0.77275,surreal
539,Peggy Sue Got Married,PG-13,Charlie Bodell,0,1986,85,2469,Comedy|Drama,7,0.77225,1960s
36379,Grindhouse,R,Fu Manchu,0,2007,84,52281,Action|Crime|Horror|Sci-Fi|Thriller,467,0.77225,great dialogue


In [20]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455
