## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import nltk
# nltk.download('stopwords')

In [3]:
import string
from nltk.corpus import stopwords

## Data Cleaning & Preprocessing

In [4]:
df = pd.read_csv('ranked_anime.csv')
df.drop(['Unnamed: 0'], inplace=True, axis=1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               300 non-null    object 
 1   Rank                300 non-null    int64  
 2   Score               300 non-null    float64
 3   Medium              300 non-null    object 
 4   Number of Episodes  300 non-null    int64  
 5   Episode Length      300 non-null    object 
 6   Start Date          300 non-null    object 
 7   End Date            296 non-null    object 
 8   Premier Season      300 non-null    object 
 9   Source Material     300 non-null    object 
 10  Age Rating          300 non-null    object 
 11  Number of Members   300 non-null    int64  
 12  URLS                300 non-null    object 
 13  Synopses            300 non-null    object 
 14  Genre 1             300 non-null    object 
 15  Genre 2             236 non-null    object 
 16  Genre 3 

In [6]:
df['Synopses']

0      "In order for something to be obtained, someth...
1      The self-proclaimed mad scientist Rintarou Oka...
2      Gintoki, Shinpachi, and Kagura return as the f...
3      Hunter x Hunter is set in a world where Hunter...
4      Seeking to restore humanity’s diminishing hope...
                             ...                        
295    The human eye, a well-known motif in psychedel...
296    An animated film series based on the Ao Oni ga...
297    Pet shop owner's daughter Chika Tokorozawa spe...
298    A surrealistic short from minimalist cartoonis...
299    "Yoru no Okite" takes us to the sky (or to hel...
Name: Synopses, Length: 300, dtype: object

In [26]:
df[(df['Genre 1'] == 'Action') | (df['Genre 1'] == 'Dementia')]

Unnamed: 0,Title,Rank,Score,Medium,Number of Episodes,Episode Length,Start Date,End Date,Premier Season,Source Material,...,Genre 1,Genre 2,Genre 3,Genre 4,Genre 5,Genre 6,Genre 7,Genre 8,Total Minutes,Synopsis Length
0,Fullmetal Alchemist: Brotherhood,1,9.21,TV,64,24 min. per ep.,Apr 2009,Jul 2010,Spring,Manga,...,Action,Military,Adventure,Comedy,Drama,Magic,Fantasy,Shounen,1536,223
2,Gintama°,3,9.11,TV,51,24 min. per ep.,Apr 2015,Mar 2016,Spring,Manga,...,Action,Comedy,Historical,Parody,Samurai,Sci-Fi,Shounen,,1224,185
3,Hunter x Hunter (2011),4,9.11,TV,148,23 min. per ep.,Oct 2011,Sep 2014,Fall,Manga,...,Action,Adventure,Fantasy,Shounen,Super Power,,,,3404,164
4,Shingeki no Kyojin Season 3 Part 2,5,9.09,TV,10,23 min. per ep.,Apr 2019,Jul 2019,Spring,Manga,...,Action,Military,Mystery,Super Power,Drama,Fantasy,Shounen,,230,130
6,Gintama',7,9.08,TV,51,24 min. per ep.,Apr 2011,Mar 2012,Spring,Manga,...,Action,Sci-Fi,Comedy,Historical,Parody,Samurai,Shounen,,1224,161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,Idol Fight Suchie-Pai 2,10790,4.29,OVA,1,30 min.\n,Apr 1996,Apr 1996,Unavailable,Game,...,Action,Ecchi,,,,,,,30,85
290,Ai,10791,4.28,Movie,1,4 min.\n,1963,1963,Unavailable,Original,...,Dementia,Psychological,Romance,,,,,,4,19
293,Docchi mo Maid,10794,4.27,ONA,1,13 min.\n,2004,2004,Unavailable,Original,...,Action,Comedy,Magic,Ecchi,Shoujo Ai,,,,13,28
295,4.Eyes,10796,4.24,Movie,1,9 min.\n,1975,1975,Unavailable,Original,...,Dementia,,,,,,,,9,57


In [8]:
truncdf = df[(df['Genre 1'] == 'Action') | (df['Genre 1'] == 'Dementia')]

### Punctuation Removal, Word Lowercasing, & Stopword Removal

In [9]:
def text_process(text):
    # Remove punctuations and capitalization with below command
    simple_words = [word.lower() for word in text if word not in string.punctuation]
    for num,letter in enumerate(simple_words):
        if letter == '—':
            simple_words[num] = ' '
            
    # Rejoin priorly formed list in below command to pass to following command
    simple_words = ''.join(simple_words)

    # Remove stopwords with below command
    clean_text = [words for words in simple_words.split() if words.lower() not in stopwords.words('english')]
#     return simple_words
    return clean_text

In [10]:
truncdf['Clean Synopses'] = truncdf['Synopses'].apply(text_process)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [11]:
truncdf['Clean Synopses']

0      [order, something, obtained, something, equal,...
2      [gintoki, shinpachi, kagura, return, funloving...
3      [hunter, x, hunter, set, world, hunters, exist...
4      [seeking, restore, humanity’s, diminishing, ho...
6      [oneyear, hiatus, shinpachi, shimura, returns,...
                             ...                        
289    [every, century, legendary, mahjong, dials, ca...
290    [short, experimental, anime, woman, desperatel...
293    [izumi, 12yearold, schoolgirl, suddenly, gets,...
295    [human, eye, wellknown, motif, psychedelic, cu...
299    [yoru, okite, takes, us, sky, hell, accompany,...
Name: Clean Synopses, Length: 117, dtype: object

## Vectorization

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(truncdf['Synopses'])

In [28]:
print(len(bow_transformer.vocabulary_))

2972


In [15]:
synopses_bow = bow_transformer.transform([truncdf['Synopses']])

In [16]:
print(synopses_bow)

  (0, 0)	4
  (0, 1)	1
  (0, 2)	2
  (0, 3)	1
  (0, 6)	1
  (0, 7)	1
  (0, 8)	1
  (0, 11)	1
  (0, 19)	1
  (0, 20)	1
  (0, 22)	1
  (0, 30)	1
  (0, 31)	1
  (0, 32)	1
  (0, 35)	1
  (0, 36)	1
  (0, 37)	2
  (0, 38)	1
  (0, 39)	1
  (0, 40)	4
  (0, 41)	6
  (0, 42)	1
  (0, 43)	1
  (0, 45)	1
  (0, 47)	1
  :	:
  (0, 2937)	1
  (0, 2938)	1
  (0, 2939)	2
  (0, 2940)	8
  (0, 2941)	11
  (0, 2942)	1
  (0, 2945)	1
  (0, 2946)	2
  (0, 2947)	16
  (0, 2949)	1
  (0, 2950)	5
  (0, 2952)	1
  (0, 2954)	1
  (0, 2955)	15
  (0, 2956)	1
  (0, 2958)	2
  (0, 2959)	1
  (0, 2960)	1
  (0, 2961)	2
  (0, 2964)	1
  (0, 2966)	1
  (0, 2968)	1
  (0, 2969)	1
  (0, 2970)	1
  (0, 2971)	1


In [17]:
print('Shape of Sparse Matrix: ', synopses_bow.shape)
print('Amount of Non-Zero occurences: ', synopses_bow.nnz)

Shape of Sparse Matrix:  (1, 2972)
Amount of Non-Zero occurences:  2374


In [18]:
sparsity = (100.0 * synopses_bow.nnz / (synopses_bow.shape[0] * synopses_bow.shape[1]))
print(f'sparsity: {round(sparsity,2)}%')

sparsity: 79.88%


## Model Creation & Evaluation

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X = truncdf['Synopses']
y = truncdf['Genre 1']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=101)

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [22]:
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x7fb4b8a44950>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [23]:
predictions = pipeline.predict(X_test)

In [24]:
from sklearn.metrics import classification_report

In [25]:
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

      Action       0.91      0.87      0.89        23
    Dementia       0.62      0.71      0.67         7

    accuracy                           0.83        30
   macro avg       0.77      0.79      0.78        30
weighted avg       0.84      0.83      0.84        30

