# ANIME RECOMMENDER SYSTEM - PREPROCESSING - ANIME - STEP 2

In [17]:
# basic library
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import itertools
import collections
import pickle

In [18]:
# load data
# Read the anime table saved as 'anime_preproc_s1.csv'
# (presumably the output of the step-1 preprocessing notebook -- confirm).
anime_df = pd.read_csv('dataset/processed_dataset/preprocessing/anime_preproc_s1.csv')
# Show the first rows as a quick sanity check of the loaded columns.
anime_df.head()

Unnamed: 0,anime_id,title,score,rating_count,ranked,popularity,members,type,studio,synopsis,...,Super Power,Supernatural,Survival,Suspense,Team Sports,Time Travel,Vampire,Video Game,Visual Arts,Workplace
0,1,Cowboy Bebop,0.960483,0.341453,0.001762,0.001828,0.472131,4,743,Crime is timeless. By the year 2071 humanity h...,...,0,0,0,0,0,0,0,0,0,0
1,5,Cowboy Bebop: Tengoku no Tobira,0.919868,0.077942,0.008438,0.024833,0.096935,0,103,Another day another bounty�such is the life of...,...,0,0,0,0,0,0,0,0,0,0
2,6,Trigun,0.902305,0.132747,0.014975,0.010461,0.191726,4,399,Vash the Stampede is the man with a $$60000000...,...,0,0,0,0,0,0,0,0,0,0
3,7,Witch Hunter Robin,0.795829,0.016483,0.126895,0.073309,0.030338,4,743,Witches are individuals with special powers li...,...,0,1,0,0,0,0,0,0,0,0
4,8,Bouken Ou Beet,0.762898,0.002469,0.193472,0.211081,0.004085,4,789,It is the dark century and the people are suff...,...,0,1,0,0,0,0,0,0,0,0


## Preprocessing step:

1. scaling (numerical features)
2. encoding (categorical features)
3. word embedding
4. image embedding

### Word Embedding

step:
1. word cleaning / General Text Cleaning (NLP)
2. embed word

### word cleaning / General Text Cleaning (NLP)

In [19]:
import re, string
import emoji
import nltk

Remove symbols and other things that won't be used for word analysis (NLP concept), such as:
- emoji
- hashtags
- symbols/special character
- extra spaces

In [20]:
def remove_hashtags(tweet):
    """Strip hashtag markup from a piece of text.

    Works in two passes:

    1. Whole hashtags that trail the text (a run of ``#tag`` tokens at the
       very end) are removed. The literal token ``#hashtag`` is exempt from
       this pass.
    2. The text is split on every remaining ``#`` and ``_``; each piece is
       stripped of surrounding whitespace and the pieces are re-joined with
       single spaces (so ``snake_case`` becomes ``snake case``).

    Parameters
    ----------
    tweet : str
        Text to clean.

    Returns
    -------
    str
        The text without ``#`` / ``_`` characters and without trailing hashtags.
    """
    # Raw string is required here: in a plain literal "\b" is a backspace
    # character, not a regex word boundary, which silently disabled the
    # "#hashtag" exemption; "\w" and "\s" were invalid escape sequences.
    trailing_tags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    tweet = " ".join(part.strip() for part in re.split(trailing_tags, tweet))
    # Any '#' or '_' left in the middle of the text becomes a word break.
    tweet = " ".join(part.strip() for part in re.split(r'#|_', tweet))
    return tweet

In [21]:
def remove_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (including a lone tab or newline) is left
    untouched, and leading/trailing runs are collapsed rather than stripped.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        The text with each whitespace run of length >= 2 replaced by ' '.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    return re.sub(r"\s{2,}", " ", text)

In [22]:
# blank out tokens that contain $ or &
def remove_special_ch(sentences):
    """Replace every space-separated token containing '$' or '&' with ''.

    The offending token is blanked rather than dropped, so the separators
    around it are preserved and the output can contain consecutive spaces.

    Parameters
    ----------
    sentences : str
        Text to filter; tokens are delimited by single space characters.

    Returns
    -------
    str
        The tokens re-joined with single spaces, flagged tokens emptied.
    """
    tokens = sentences.split(' ')
    kept = ['' if ('$' in token or '&' in token) else token for token in tokens]
    return ' '.join(kept)

Also lowercase the text and remove line breaks (NLP concept).

In [23]:
# Deletion table for ASCII punctuation, built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_general(text):
    """Apply general-purpose text normalisation.

    Steps, in order:

    1. delete carriage returns, turn newlines into spaces, lowercase;
    2. delete every character outside the ASCII range;
    3. delete ``@mentions`` and ``http://`` / ``https://`` URLs (the marker
       plus everything up to the next whitespace);
    4. delete all ASCII punctuation (``string.punctuation``).

    Characters are deleted, not replaced by a space, so words joined by a
    removed character are merged (``well-known`` -> ``wellknown``).

    Parameters
    ----------
    text : str or None or NaN
        Raw text. ``None`` and float NaN are treated as missing text.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing input.
    """
    # A missing value has no string methods; treat it as empty text.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercasing happens before the non-ASCII filter, as in the original order.
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = re.sub(r'[^\x00-\x7f]', '', text)  # drop non-ASCII characters
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)  # drop @mentions and URLs
    # Only ASCII punctuation needs deleting: no non-ASCII character can
    # survive the filter above.
    return text.translate(_PUNCTUATION_TABLE)

In [24]:
def clean_synopsis(text):
    """Run one synopsis through the full cleaning pipeline, in order:
    general cleaning, hashtag removal, special-character removal and
    whitespace collapsing."""
    text = clean_general(text)
    text = remove_hashtags(text)
    text = remove_special_ch(text)
    return remove_spaces(text)


# Clean the whole column in one pass. Series.map touches only the 'synopsis'
# values, whereas iterrows() builds a full row Series for every row.
anime_df['synopsis'] = anime_df['synopsis'].map(clean_synopsis)

In [25]:
# Preview the frame again to inspect the cleaned 'synopsis' column.
anime_df.head()

Unnamed: 0,anime_id,title,score,rating_count,ranked,popularity,members,type,studio,synopsis,...,Super Power,Supernatural,Survival,Suspense,Team Sports,Time Travel,Vampire,Video Game,Visual Arts,Workplace
0,1,Cowboy Bebop,0.960483,0.341453,0.001762,0.001828,0.472131,4,743,crime is timeless by the year 2071 humanity ha...,...,0,0,0,0,0,0,0,0,0,0
1,5,Cowboy Bebop: Tengoku no Tobira,0.919868,0.077942,0.008438,0.024833,0.096935,0,103,another day another bountysuch is the life of ...,...,0,0,0,0,0,0,0,0,0,0
2,6,Trigun,0.902305,0.132747,0.014975,0.010461,0.191726,4,399,vash the stampede is the man with a 6000000000...,...,0,0,0,0,0,0,0,0,0,0
3,7,Witch Hunter Robin,0.795829,0.016483,0.126895,0.073309,0.030338,4,743,witches are individuals with special powers li...,...,0,1,0,0,0,0,0,0,0,0
4,8,Bouken Ou Beet,0.762898,0.002469,0.193472,0.211081,0.004085,4,789,it is the dark century and the people are suff...,...,0,1,0,0,0,0,0,0,0,0


### embed word (TF IDF Vectorizer)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

In [27]:
def get_tf_matrix(col, max_features=100, stop_words='english'):
    """Fit a TF-IDF vectorizer on one text column of the global ``anime_df``.

    Side effect: missing values in ``anime_df[col]`` are replaced by empty
    strings and written back to the global frame.

    Parameters
    ----------
    col : str
        Name of the text column in ``anime_df`` to vectorize.
    max_features : int, default 100
        Passed to ``TfidfVectorizer(max_features=...)``.
    stop_words : default 'english'
        Passed to ``TfidfVectorizer(stop_words=...)``.

    Returns
    -------
    The matrix returned by ``TfidfVectorizer.fit_transform`` for the column
    (a sparse matrix with one row per row of ``anime_df``).
    """
    # Blank out missing text so the vectorizer only ever sees strings.
    anime_df[col] = anime_df[col].fillna('')

    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
    return vectorizer.fit_transform(anime_df[col])

In [28]:
# Build the TF-IDF matrix for the cleaned synopsis text.
tf_matrix_sypnopsis = get_tf_matrix('synopsis')
# Display the matrix summary (shape, dtype, number of stored elements).
tf_matrix_sypnopsis

<23765x100 sparse matrix of type '<class 'numpy.float64'>'
	with 126567 stored elements in Compressed Sparse Row format>

In [29]:
# Densify the sparse TF-IDF matrix. toarray() yields a plain numpy.ndarray,
# whereas todense() yields the legacy numpy.matrix type that NumPy no longer
# recommends using.
tf_matrix_sypnopsis_dense = tf_matrix_sypnopsis.toarray()
tf_matrix_sypnopsis_dense.shape

(23765, 100)

In [30]:
# Wrap the dense TF-IDF values in a DataFrame that carries anime_df's index,
# so an index-based join pairs each TF-IDF row with its own anime row even
# if anime_df's index is not the default 0..n-1 range.
df = pd.DataFrame(tf_matrix_sypnopsis_dense, index=anime_df.index)

In [31]:
# Append the TF-IDF columns (integer labels 0..99) to the anime table.
# DataFrame.join matches rows on the index.
anime_df = anime_df.join(df)
# Preview the widened table.
anime_df.head()

Unnamed: 0,anime_id,title,score,rating_count,ranked,popularity,members,type,studio,synopsis,...,90,91,92,93,94,95,96,97,98,99
0,1,Cowboy Bebop,0.960483,0.341453,0.001762,0.001828,0.472131,4,743,crime is timeless by the year 2071 humanity ha...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.270343,0.0,0.0
1,5,Cowboy Bebop: Tengoku no Tobira,0.919868,0.077942,0.008438,0.024833,0.096935,0,103,another day another bountysuch is the life of ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,Trigun,0.902305,0.132747,0.014975,0.010461,0.191726,4,399,vash the stampede is the man with a 6000000000...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,Witch Hunter Robin,0.795829,0.016483,0.126895,0.073309,0.030338,4,743,witches are individuals with special powers li...,...,0.0,0.0,0.0,0.0,0.0,0.321865,0.0,0.0,0.0,0.0
4,8,Bouken Ou Beet,0.762898,0.002469,0.193472,0.211081,0.004085,4,789,it is the dark century and the people are suff...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226734,0.432943


Image embedding is done in a separate notebook.

In [32]:
# Drop the raw text columns now that the synopsis is represented by TF-IDF features.
anime_df = anime_df.drop(columns=['title', 'synopsis', 'genre'])
anime_df.head()

Unnamed: 0,anime_id,score,rating_count,ranked,popularity,members,type,studio,episode_count,Action,...,90,91,92,93,94,95,96,97,98,99
0,1,0.960483,0.341453,0.001762,0.001828,0.472131,4,743,0.008505,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.270343,0.0,0.0
1,5,0.919868,0.077942,0.008438,0.024833,0.096935,0,103,0.000327,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,0.902305,0.132747,0.014975,0.010461,0.191726,4,399,0.008505,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,0.795829,0.016483,0.126895,0.073309,0.030338,4,743,0.008505,1,...,0.0,0.0,0.0,0.0,0.0,0.321865,0.0,0.0,0.0,0.0
4,8,0.762898,0.002469,0.193472,0.211081,0.004085,4,789,0.01701,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226734,0.432943


In [33]:
# Summarise the final frame: row count, column count, dtypes and memory usage.
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23765 entries, 0 to 23764
Columns: 185 entries, anime_id to 99
dtypes: float64(106), int64(79)
memory usage: 33.5 MB


## Export the DataFrame to be continued in another notebook

In [34]:
# Save the result as 'anime_preproc_s2.csv'; index=False leaves the row index out of the file.
anime_df.to_csv('dataset/processed_dataset/preprocessing/anime_preproc_s2.csv', index=False)