In [2]:
import numpy as np
from numpy import zeros
import pandas as pd
import pickle

from random import sample
from random import random
from collections import defaultdict

from sklearn.externals import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
from sklearn.decomposition import RandomizedPCA

import nltk
from nltk.tag.perceptron import PerceptronTagger
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer


import re
import os
import codecs
from sklearn import feature_extraction
import matplotlib as mpl
import matplotlib.pyplot as plt
import mpld3


%matplotlib inline

  inline backend."""
  'retina', 'jpeg', 'svg', 'pdf'.""")
  use `figure_formats` instead)""")
  """
  """)
  def _config_changed(self, name, old, new):
  def _formatters_default(self):
  def _deferred_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _type_printers_default(self):
  def _deferred_printers_default(self):


In [3]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a tableaucolors.pkl that you generated yourself.
with open('tableaucolors.pkl', 'rb') as file:
    tbl_colors = pickle.load(file)

# Build the tagger once up front: pos_tag() is very slow because it unpickles
# its model on every call.
tagger = PerceptronTagger()

full_data = pd.read_csv('full_anime_data_set.csv')
full_data.dropna(subset=['synopsis'], inplace=True)
full_data['titlelower'] = full_data['title'].str.lower()
synops = list(full_data['synopsis'])

# Remove short synopses (25 whitespace-separated words or fewer).
# A single boolean pass replaces the former `i not in <list>` test, which was
# quadratic in the number of synopses.
is_short = [len(synop.split()) <= 25 for synop in synops]
short_indices = [i for i, short in enumerate(is_short) if short]

# Drop the same positions from both containers so `synops` and `full_data`
# stay row-aligned.
synops = [synop for synop, short in zip(synops, is_short) if not short]
full_data = full_data.drop(full_data.index[short_indices])

## Preprocessing

In [4]:
# In the visible code the stemmer is only referenced by a commented-out line in
# tokenize_and_stem; the lemmatizer is what tokenize_and_stem actually applies.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

# Stop words handed to TfidfVectorizer (stop_words=stops). The first rows are
# domain terms about anime releases ("episode", "ova", "manga", ...); the rest
# are general English function words. Repeated entries ("tv", "ova",
# "releases") are harmless because the collection is a set.
stops = set([
    "episode", "dvd", "special", "short", "movie", "included", "recap", "season", 
    "second", "film", "volume", "bundled", "tv", "aired", "version", "ova", 
    "tv", "animation", "animated", "releases", "based", "novel", "ova", "released",
    "manga", "edition", "featuring", "features", "created", "called",
    "main", "lot", "named", "feature", "anime", "adaptation", "releases",
    "series", "information", "original", "characters", "new", "story",
    "ovum", "character", "produced", "final", "meet", "scenes", "various", "plot",
    "video", "meeting", "people", "release", "end", "japanese", "japan", "young",
    "episodes", "include", "specials", "animate", "base", "game", "later", "contain",
    "set", "volumes", "bundle", "air", "different", "limit",
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"])

def tokenize_and_stem(text):
    """Tokenize `text` and return the verb-lemmatized form of each word token.

    Apostrophes are stripped first, then the text is split into sentences and
    each sentence into words, so punctuation ends up as its own token. Tokens
    without at least one ASCII letter (numbers, bare punctuation) are dropped.
    Despite the name, the surviving tokens are lemmatized (pos='v'), not
    stemmed, and their original casing is kept.
    """
    without_apostrophes = text.replace("'", '')
    lemmas = []
    for sentence in nltk.sent_tokenize(without_apostrophes):
        for word in nltk.word_tokenize(sentence):
            # skip numeric tokens and raw punctuation
            if not re.search('[a-zA-Z]', word):
                continue
            lemmas.append(lemmatizer.lemmatize(word, pos='v'))
    return lemmas

def tokenize_only(text):
    """Tokenize `text` into lower-cased word tokens without lemmatizing.

    Apostrophes are stripped, the text is split into sentences and then into
    words (so punctuation becomes its own token), every token is lower-cased,
    and tokens without at least one ASCII letter are discarded.
    """
    without_apostrophes = text.replace("'", '')
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(without_apostrophes)
        for word in nltk.word_tokenize(sentence)
    )
    # the letter test runs on the already lower-cased token
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

def clean_synops(synopses, nouns_only = False):
    """Return one cleaned, space-joined string per synopsis.

    Each synopsis goes through tokenize_and_stem and is lower-cased. With
    nouns_only=True the tokens are additionally POS-tagged with the shared
    `tagger` and only those tagged exactly "NN" are kept.
    """
    def _clean_one(synopsis):
        words = [token.lower() for token in tokenize_and_stem(synopsis)]
        if nouns_only:
            words = [word for word, tag in tagger.tag(words) if tag == "NN"]
        return " ".join(words)

    return [_clean_one(synopsis) for synopsis in synopses]

# Build two parallel vocabularies over every synopsis: the lemmatized tokens
# and the plain lower-cased tokens.
totalvocab_stemmed = []
totalvocab_tokenized = []
for synopsis in synops:
    totalvocab_stemmed += tokenize_and_stem(synopsis)
    totalvocab_tokenized += tokenize_only(synopsis)

# Lookup table from lemmatized token (index) to the surface word it came from.
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
#print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

cleaned_synops = clean_synops(synops)


## Fitting

In [5]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.2, max_features=200000,
                                   min_df=0.01, stop_words=stops,
                                   use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_synops)
tfidf_matrix = tfidf_matrix.toarray()

terms = tfidf_vectorizer.get_feature_names()


##### DO EVERYTHING ABOVE WHEN THE PAGE LOADS #####

num_clusters = 9

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)


##### CONSIDER PICKELING THE ABOVE AND THEN UNPICKELING THAT #####

clusters = km.labels_.tolist()

full_data['clusters'] = clusters
full_data.set_index('clusters', inplace=True)
full_data.head()
cluster_names = defaultdict(str)

CPU times: user 15.6 s, sys: 512 ms, total: 16.1 s
Wall time: 17.9 s
CPU times: user 14.2 s, sys: 270 ms, total: 14.5 s
Wall time: 10 s


In [6]:
# Persist the dense TF-IDF matrix, the fitted vectorizer and the fitted KMeans
# model to disk. The last call is left as a bare expression so the notebook
# displays the list of files joblib wrote for it.
joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(km, 'km.pkl')

  def _ipython_display_formatter_default(self):
  def _singleton_printers_default(self):


['km.pkl', 'km.pkl_01.npy', 'km.pkl_02.npy']

In [7]:
# Rebuild the label list straight from the fitted model (elements stay NumPy integers).
clusters = [label for label in km.labels_]

## Name Clusters

In [8]:
# For every centroid, feature indices sorted by descending centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):
    temp_list = []
    for ind in order_centroids[i, :6]:  # replace 6 with n words per cluster
        try:
            temp_list.append(terms[ind])
        except IndexError:
            # Best effort: skip an index with no matching term. Only
            # IndexError is caught so that real problems (e.g. `terms` being
            # undefined) are no longer hidden behind empty cluster names.
            continue
    # the graph label for a cluster is built from its first five top terms
    cluster_names[i] = ', '.join(temp_list[:5])

In [9]:
cluster_names

defaultdict(str,
            {0: 'earth, space, planet, alien, pilot',
             1: 'father, family, mother, live, life',
             2: 'world, mysterious, girl, know, power',
             3: 'school, high, high school, student, girls',
             4: 'love, girl, fall love, fall, day',
             5: 'make, girl, work, come, time',
             6: 'music, song, single, band, produce',
             7: 'power, war, fight, city, battle',
             8: 'team, match, play, girls, join'})

### The stuff below is for the Heroku app

In [None]:
# Persist what the app needs: the data frame (at this point indexed by cluster
# label), the cleaned synopses and the cluster-name mapping.
joblib.dump(full_data, 'full_data.pkl')
joblib.dump(cleaned_synops, 'cleaned_synops.pkl')
joblib.dump(cluster_names, 'cluster_names.pkl')

## Plotting

#### Take a movie, compute pairwise distances to the other movies, take the n smallest values, and plot those

In [10]:
# Titles in frame row order: lower-cased for lookups, original casing for display.
title_list_lower = full_data['titlelower'].tolist()
title_list = full_data['title'].tolist()

In [11]:
test_movie = "sen to chihiro no kamikakushi"

# Look the title up case-insensitively (title_list_lower is lower-cased), the
# same way the cluster lookup cell does. list.index raises ValueError when the
# title is not in the data set.
query_index = title_list_lower.index(test_movie.lower())

# Cosine distance from every synopsis to the query synopsis, as one column.
pwd = pairwise_distances(tfidf_matrix, tfidf_matrix[query_index].reshape(1, -1), metric='cosine')

In [28]:
# Row position of the query title, then every row position assigned to the same cluster.
titles_index = title_list_lower.index(test_movie.lower())
good_indices = [row for row, label in enumerate(clusters) if label == clusters[titles_index]]

In [12]:
# Flatten the single-column distance matrix into a plain list of floats.
# ravel() also accepts an already-flat sequence, so re-running this cell is a
# no-op instead of failing when it tries to index into floats.
pwd = np.asarray(pwd).ravel().tolist()

In [13]:
# Row positions of the 17 smallest distances (the query itself comes first at distance ~0).
closest = np.argsort(np.array(pwd))[:17]

In [14]:
# TF-IDF row vectors of the nearest neighbours, in nearest-first order.
closest_loc = [tfidf_matrix[row] for row in closest]

In [15]:
# Pairwise cosine distance among the neighbours (1 - cosine similarity).
similarity = cosine_similarity(closest_loc)
dist = 1 - similarity

In [16]:
dist

array([[  2.22044605e-16,   4.45342541e-01,   4.91693320e-01,
          5.10855238e-01,   5.54206520e-01,   5.54752175e-01,
          5.62455364e-01,   5.89639439e-01,   5.92917796e-01,
          6.18977954e-01,   6.22394402e-01,   6.27247235e-01,
          6.27295836e-01,   6.27925177e-01,   6.40015624e-01,
          6.49289348e-01,   6.62943406e-01],
       [  4.45342541e-01,   1.11022302e-16,   1.90089809e-01,
          2.60965523e-01,   5.02966940e-01,   5.60720339e-01,
          3.89455037e-01,   4.40823848e-01,   4.74866859e-01,
          3.93275191e-01,   4.05595661e-01,   6.92207764e-01,
          4.59097208e-01,   6.10838056e-01,   5.33342017e-01,
          5.68709562e-01,   7.57024150e-01],
       [  4.91693320e-01,   1.90089809e-01,  -2.22044605e-16,
          2.70458391e-01,   5.19249091e-01,   5.53524593e-01,
          3.97297462e-01,   4.48006443e-01,   4.60856742e-01,
          4.01068546e-01,   4.47723045e-01,   7.69611869e-01,
          4.50147843e-01,   6.05269314e-01

In [17]:
closest

array([ 174, 3020, 5864, 4211, 6289, 4743, 5123, 4109, 3101, 5503, 2049,
       5887,  775, 1679, 5171, 4148, 4060])

In [18]:
# Cluster label of each neighbour, in nearest-first order.
subset_clusters = [clusters[row] for row in closest]

In [19]:
subset_clusters

[1, 8, 5, 5, 5, 5, 3, 7, 5, 3, 3, 1, 2, 7, 4, 7, 3]

In [20]:
# Display titles of the neighbours, in nearest-first order.
titles = [title_list[row] for row in closest]
titles

['Sen to Chihiro no Kamikakushi',
 'Ugokie Kori no Tatehiki',
 'Seirei Tsukai no Blade Dance',
 'Otome Youkai Zakuro',
 'Echigo no Mukashibanashi: Attaten Ganoo',
 'Natsume Yuujinchou Shi',
 'Haitai Nanafa',
 'Yoru no Okite',
 'Natsume Yuujinchou',
 'Date A Live II',
 'Yakusai Kochou',
 'Ponsuke no Haru',
 'Yuu☆Yuu☆Hakusho (Movie)',
 'Nakoruru: Ano Hito kara no Okurimono',
 'Date A Live',
 'Keroro Gunsou Movie 5: Tanjou! Kyuukyoku Keroro, Kiseki no Jikuu-jima, de arimasu!!',
 'Nurarihyon no Mago']

In [21]:
# Embed the precomputed cosine-distance matrix in two dimensions for plotting.
mds = MDS(
    n_components=2,
    dissimilarity="precomputed",
    random_state=1,
    n_init=3,
    max_iter=100,
    n_jobs=-2,
)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

# one coordinate array per plot axis
xs, ys = pos.T

class TopToolbar(mpld3.plugins.PluginBase):
    """mpld3 plugin that repositions the figure toolbar.

    The JavaScript below registers a "toptoolbar" plugin whose draw step first
    draws the toolbar, then sets its position attributes to x=600, y=0, and
    finally replaces the toolbar's own draw function with a no-op so it is not
    drawn again.
    """

    # Client-side implementation of the plugin; must stay in sync with the
    # "toptoolbar" type name used in __init__.
    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 600);
      this.fig.toolbar.toolbar.attr("y", 0);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        """Set the plugin descriptor; "type" is the name registered in JAVASCRIPT."""
        self.dict_ = {"type": "toptoolbar"}
        
# One row per neighbour: MDS coordinates, cluster label and display title.
df = pd.DataFrame({
    "x": xs,
    "y": ys,
    "label": subset_clusters,
    "title": titles,
})

# Split the rows by cluster so each cluster can be drawn as its own layer.
groups = df.groupby('label')

# Custom CSS passed to every tooltip: sets the font for mpld3 text and
# tooltips, hides the x/y axis groups, and shifts the figure 200px to the left.
css = """
text.mpld3-text, div.mpld3-tooltip {
  font-family:Arial, Helvetica, sans-serif;
}

g.mpld3-xaxis, g.mpld3-yaxis {
display: none; }

svg.mpld3-figure {
margin-left: -200px;}
"""

# Plot
fig, ax = plt.subplots(figsize=(15,10)) #set plot size
ax.margins(0.03)  # optional: 3% padding for autoscaling
ax.set_xlim([-1,1])
ax.set_ylim([-1,1])

# Draw one layer per cluster. The legend label comes from cluster_names via
# the cluster id; the colour is taken from tbl_colors by the group's position
# in iteration order (not by cluster id).
for color_index, (name, group) in enumerate(groups):
    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=15,
                     label=cluster_names[name], mec='none',
                     color=tbl_colors[color_index]
                    )

    # one tooltip label (the title) per plotted point
    labels = list(group.title)

    #set tooltip using points, labels and the already defined 'css'
    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
                                       voffset=10, hoffset=10, css=css)
    #connect tooltip to fig
    mpld3.plugins.connect(fig, tooltip)

# Figure-wide settings are applied once, after all layers are drawn, instead
# of being repeated on every loop iteration.
ax.set_aspect('equal')

#set tick marks as blank
ax.axes.get_xaxis().set_ticks([])
ax.axes.get_yaxis().set_ticks([])

#set axis as blank
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)

# The toolbar only needs to be repositioned once per figure.
mpld3.plugins.connect(fig, TopToolbar())

ax.legend(loc='best', title='', fancybox=True, numpoints=1) #show legend with only one dot

#uncomment the below to export to html
#html = mpld3.fig_to_html(fig)
#print(html)

mpld3.display() #show the plot



In [23]:
# Positional lookup of the neighbour rows. `closest` holds row positions in
# tfidf_matrix, so this relies on full_data still having its rows in the same
# order as the matrix rows.
subset_data = full_data.iloc[closest]

In [24]:
subset_data

Unnamed: 0_level_0,title,english,id,image,episodes,score,rank,popularity,members,favorites,start_date,end_date,status,type,synopsis,titlelower
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,Sen to Chihiro no Kamikakushi,Spirited Away,199,http://cdn.myanimelist.net/images/anime/10/420...,1,8.93,11,30,259659,6130,2001-07-20,2001-07-20,Finished Airing,Movie,"On the way to their new home, 10-year-old Chih...",sen to chihiro no kamikakushi
8,Ugokie Kori no Tatehiki,,3929,http://cdn.myanimelist.net/images/anime/4/7077...,1,5.84,6440,6642,374,0,1933-12-31,1933-12-31,Finished Airing,Movie,Movie from 1933. Fox spirit turns into a samur...,ugokie kori no tatehiki
5,Seirei Tsukai no Blade Dance,Blade Dance of the Elementalers,22877,http://cdn.myanimelist.net/images/anime/7/6303...,12,7.2,2389,712,41930,213,2014-07-14,2014-09-29,Finished Airing,TV,Only a pure maiden can have the privilege to c...,seirei tsukai no blade dance
5,Otome Youkai Zakuro,Zakuro,8476,http://cdn.myanimelist.net/images/anime/13/752...,13,7.63,1035,709,41963,202,2010-10-05,2010-12-28,Finished Airing,TV,"Westernization, a time when humans and spirits...",otome youkai zakuro
5,Echigo no Mukashibanashi: Attaten Ganoo,Tales from the Snow Country,28117,http://cdn.myanimelist.net/images/anime/11/680...,1,0.0,7411,9786,12,0,2000-05-00,2000-05-00,Finished Airing,OVA,"A collection of four folk tales from Koshiji ,...",echigo no mukashibanashi: attaten ganoo
5,Natsume Yuujinchou Shi,Natsume's Book of Friends Four,11665,http://cdn.myanimelist.net/images/anime/3/3744...,13,8.75,28,550,52845,613,2012-01-03,2012-03-27,Finished Airing,TV,Takashi Natsume continues to return the names ...,natsume yuujinchou shi
3,Haitai Nanafa,,15043,http://cdn.myanimelist.net/images/anime/12/459...,13,6.22,5671,3049,4182,3,2012-10-06,2012-12-29,Finished Airing,TV,Nanafa Kyan lives in Okinawa with her grandmot...,haitai nanafa
7,Yoru no Okite,A Rule Of Dreams,7811,http://cdn.myanimelist.net/images/anime/9/1895...,1,4.6,7313,6334,477,1,1995-00-00,1995-00-00,Finished Airing,Movie,"""Yoru no Okite"" takes us to the sky to accompa...",yoru no okite
5,Natsume Yuujinchou,Natsume's Book of Friends,4081,http://cdn.myanimelist.net/images/anime/7/2885...,13,8.43,120,173,118146,4658,2008-07-08,2008-09-30,Finished Airing,TV,"While most fifteen-year-old boys, in one way o...",natsume yuujinchou
3,Date A Live II,,19163,http://cdn.myanimelist.net/images/anime/5/7600...,10,7.55,1258,387,68666,607,2014-04-12,2014-06-14,Finished Airing,TV,"Shido Itsuka, who used to be a normal high sch...",date a live ii
