# 1. Data Loading and Exploration

## 1.1 Load the dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

dataset = pd.read_csv('anime_list_encoded.csv')

print(dataset)

## 1.2 Basic statistics and data overview

In [None]:
print(dataset.describe())

## 1.3 Data cleaning (handle missing values, outliers, etc.)

In [None]:
dataset['url'].fillna('URL_NOT_AVAILABLE', inplace=True)

numerical_columns = ['score']
for col in numerical_columns:
    dataset[col].fillna(dataset[col].mean(), inplace=True)

categorical_columns = ['jpName', 'lastUpdate', 'engName', 'source', 'status', 'aired', 'duration', 'rating', 'studios', 'genres', 'producer', 'licensors']
for col in categorical_columns:
    dataset[col].fillna('NOT_AVAILABLE', inplace=True)

dataset['missing_data'] = dataset.isnull().any(axis=1).astype(int)

# print count of datapoints with value nan
print(dataset.isnull().sum())

# 2. Content-based Filtering

## 2.1 Feature extraction and preprocessing

In [6]:
from sklearn.preprocessing import LabelEncoder


labelencoder = LabelEncoder()
dataset['rating'] = labelencoder.fit_transform(dataset['rating'])
dataset['status'] = labelencoder.fit_transform(dataset['status'])
dataset['themes'] = labelencoder.fit_transform(dataset['themes'])

print(dataset['rating'].describe())
print(dataset['status'].describe())
print(dataset['themes'].describe())


# create dataframe of the features used in training the model ( like genres, rating, duration, score, themes)




count    24594.000000
mean         2.190656
std          1.778201
min          0.000000
25%          0.000000
50%          3.000000
75%          3.000000
max          6.000000
Name: rating, dtype: float64
count    24594.000000
mean         1.004229
std          0.178497
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          2.000000
Name: status, dtype: float64


## 2.2 Compute similarity scores between shows

## 2.3 Generate recommendations based on similarity

# 3. Collaborative Filtering

## 3.1 Prepare user-item interaction matrix/data

## 3.2 Implement User-User Collaborative Filtering

## 3.3 Implement Item-Item Collaborative Filtering

# 4. Evaluation

## 4.1 Split data into training and test sets

## 4.2 Define evaluation metrics

## 4.3 Evaluate the performance of recommendation models

# 5. Iteration and Improvement

## 5.1 Analyze evaluation results

## 5.2 Refine models or features based on feedback

## 5.3 Experiment with hybrid methods or other algorithms

# 6. Scalability Considerations

## 6.1 Optimize data structures for faster computation

## 6.2 Consider using specialized libraries or tools

## 6.3 Think about deployment and serving recommendations in real-time

# 7. User Feedback Loop

## 7.1 Implement user feedback mechanisms (like/dislike, ratings)

## 7.2 Adjust recommendation logic based on user feedback

## 7.3 Monitor and continuously improve recommendation quality