## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import nltk
# nltk.download('stopwords')

In [3]:
import string
from nltk.corpus import stopwords

## Data Cleaning & Preprocessing

In [4]:
df = pd.read_csv('ranked_anime.csv')
df.drop(['Unnamed: 0'], inplace=True, axis=1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               300 non-null    object 
 1   Rank                300 non-null    int64  
 2   Score               300 non-null    float64
 3   Medium              300 non-null    object 
 4   Number of Episodes  300 non-null    int64  
 5   Episode Length      300 non-null    object 
 6   Start Date          300 non-null    object 
 7   End Date            296 non-null    object 
 8   Premier Season      300 non-null    object 
 9   Source Material     300 non-null    object 
 10  Age Rating          300 non-null    object 
 11  Number of Members   300 non-null    int64  
 12  URLS                300 non-null    object 
 13  Synopses            300 non-null    object 
 14  Genre 1             300 non-null    object 
 15  Genre 2             236 non-null    object 
 16  Genre 3 

In [6]:
df['Synopses']

0      "In order for something to be obtained, someth...
1      The self-proclaimed mad scientist Rintarou Oka...
2      Gintoki, Shinpachi, and Kagura return as the f...
3      Hunter x Hunter is set in a world where Hunter...
4      Seeking to restore humanity’s diminishing hope...
                             ...                        
295    The human eye, a well-known motif in psychedel...
296    An animated film series based on the Ao Oni ga...
297    Pet shop owner's daughter Chika Tokorozawa spe...
298    A surrealistic short from minimalist cartoonis...
299    "Yoru no Okite" takes us to the sky (or to hel...
Name: Synopses, Length: 300, dtype: object

### Punctuation Removal, Word Lowercasing, & Stopword Removal

In [7]:
def text_process(text):
    # Remove punctuations and capitalization with below command
    simple_words = [word.lower() for word in text if word not in string.punctuation]
    for num,letter in enumerate(simple_words):
        if letter == '—':
            simple_words[num] = ' '
            
    # Rejoin priorly formed list in below command to pass to following command
    simple_words = ''.join(simple_words)

    # Remove stopwords with below command
    clean_text = [words for words in simple_words.split() if words.lower() not in stopwords.words('english')]
#     return simple_words
    return clean_text

In [8]:
df['Clean Synopses'] = df['Synopses'].apply(text_process)

In [9]:
df['Clean Synopses']

0      [order, something, obtained, something, equal,...
1      [selfproclaimed, mad, scientist, rintarou, oka...
2      [gintoki, shinpachi, kagura, return, funloving...
3      [hunter, x, hunter, set, world, hunters, exist...
4      [seeking, restore, humanity’s, diminishing, ho...
                             ...                        
295    [human, eye, wellknown, motif, psychedelic, cu...
296    [animated, film, series, based, ao, oni, game,...
297    [pet, shop, owners, daughter, chika, tokorozaw...
298    [surrealistic, short, minimalist, cartoonist, ...
299    [yoru, okite, takes, us, sky, hell, accompany,...
Name: Clean Synopses, Length: 300, dtype: object

## Vectorization

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(df['Synopses'])

In [12]:
print(len(bow_transformer.vocabulary_))

5870


In [16]:
synopses_bow = bow_transformer.transform([df['Synopses']])

In [17]:
print(synopses_bow)

  (0, 1)	3
  (0, 2)	4
  (0, 3)	2
  (0, 6)	2
  (0, 8)	1
  (0, 10)	1
  (0, 11)	1
  (0, 13)	1
  (0, 14)	1
  (0, 15)	2
  (0, 16)	2
  (0, 19)	1
  (0, 21)	1
  (0, 22)	1
  (0, 25)	1
  (0, 32)	1
  (0, 38)	3
  (0, 39)	1
  (0, 40)	1
  (0, 43)	1
  (0, 45)	1
  (0, 49)	1
  (0, 54)	1
  (0, 59)	4
  (0, 60)	1
  :	:
  (0, 5835)	2
  (0, 5836)	2
  (0, 5837)	1
  (0, 5838)	2
  (0, 5839)	1
  (0, 5840)	1
  (0, 5841)	1
  (0, 5842)	1
  (0, 5843)	1
  (0, 5844)	1
  (0, 5845)	3
  (0, 5847)	1
  (0, 5848)	2
  (0, 5849)	1
  (0, 5850)	1
  (0, 5852)	1
  (0, 5854)	2
  (0, 5857)	3
  (0, 5860)	1
  (0, 5861)	1
  (0, 5863)	1
  (0, 5864)	4
  (0, 5865)	1
  (0, 5867)	1
  (0, 5868)	4


In [18]:
print('Shape of Sparse Matrix: ', synopses_bow.shape)
print('Amount of Non-Zero occurences: ', synopses_bow.nnz)

Shape of Sparse Matrix:  (1, 5870)
Amount of Non-Zero occurences:  4622


In [20]:
sparsity = (100.0 * synopses_bow.nnz / (synopses_bow.shape[0] * synopses_bow.shape[1]))
print(f'sparsity: {round(sparsity,2)}%')

sparsity: 78.74%


## Model Creation & Evaluation

In [15]:
from sklearn.model_selection import train_test_split

In [21]:
X = df['Synopses']
y = df['Genre 1']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=101)

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [24]:
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x7fb5a3d99050>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [25]:
predictions = pipeline.predict(X_test)

In [26]:
from sklearn.metrics import classification_report

In [27]:
print(classification_report(predictions, y_test))

               precision    recall  f1-score   support

       Action       0.76      0.40      0.52        40
    Adventure       0.00      0.00      0.00         2
       Comedy       0.69      0.48      0.56        23
     Dementia       0.56      0.71      0.63         7
        Drama       0.17      1.00      0.29         1
        Ecchi       0.00      0.00      0.00         0
        Music       0.00      0.00      0.00         0
      Mystery       0.00      0.00      0.00         0
Psychological       0.00      0.00      0.00         0
       Sci-Fi       0.25      1.00      0.40         1
Slice of Life       0.33      1.00      0.50         1
       Sports       0.00      0.00      0.00         0
  Super Power       0.00      0.00      0.00         0

     accuracy                           0.47        75
    macro avg       0.21      0.35      0.22        75
 weighted avg       0.68      0.47      0.53        75



  _warn_prf(average, modifier, msg_start, len(result))
