In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the anime dataset from the CSV file in the sibling data directory.
df = pd.read_csv('../data/anime.csv')

In [3]:
# Remove the columns that the rest of the notebook does not use.
df = df.drop(
    columns=[
        'Name', "English name", "Japanese name", 'Aired', 'Producers',
        'Licensors', 'Duration', 'Members', 'Watching', 'On-Hold',
    ]
)

In [4]:
# Show column dtypes and non-null counts after the column drop.
df.info()

This section computes the baseline used for the weights calculation.

In [8]:
# Peek at the raw 'Score-1' column.
df['Score-1']

In [5]:
# Show summary statistics for the frame.
df.describe()

In [11]:
# The ten 'Score-N' columns, ordered from 'Score-10' down to 'Score-1'.
score_columns = [f'Score-{points}' for points in range(10, 0, -1)]

In [12]:
# Keep only rows that have a value in every 'Score-N' column.
df = df.dropna(subset=score_columns)

In [19]:
# Treat the literal string 'Unknown' in the 'Score-N' columns as a missing value.
df[score_columns] = df[score_columns].replace('Unknown', np.nan)

In [27]:
# Drop the rows whose 'Score-N' values are missing after the 'Unknown' replacement.
df = df.dropna(subset=score_columns)

In [28]:
# Convert to int by way of float.
# NOTE(review): presumably the values are stored as strings such as '123.0',
# which cannot be cast to int directly — TODO confirm.
df[score_columns] = df[score_columns].astype(float).astype(int)

In [29]:
# Row-wise sum over the ten 'Score-N' columns
# (presumably the total number of users who scored the title — confirm).
df['num_scored'] =  df[score_columns].sum(axis=1)

In [33]:
# Print summary statistics of num_scored, followed by a selection of its quantiles.
print(df['num_scored'].describe())
print("\nPercentiles:")
print(df['num_scored'].quantile([0.25, 0.3,  0.5, 0.55, 0.6,0.75, 0.8,  0.9]))

These results (the percentiles) will be used as the baseline.

In [191]:
# Columns that must hold numbers: the target, two metadata columns and the
# ten 'Score-N' columns ('Score-10' down to 'Score-1').
numeric_cols = ['Score', 'Episodes', 'Ranked'] + [
    f'Score-{points}' for points in range(10, 0, -1)
]

In [192]:
# Coerce every listed column to a numeric dtype; entries that cannot be parsed become NaN.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [193]:
# Count the missing values in each numeric column.
print(df[numeric_cols].isna().sum())

In [194]:
# Missing episode counts become 0, so 0 acts as the sentinel for "unknown"
# from here on (no NaN is left in 'Episodes' after this line).
df["Episodes"] =df['Episodes'].fillna(0)

In [195]:
# Same ten 'Score-N' columns as above; any missing value in them becomes 0.
score_cols = [f'Score-{points}' for points in range(10, 0, -1)]
df[score_cols] = df[score_cols].fillna(0)

In [None]:
# Re-check the missing-value counts after the fills above.
print(df[numeric_cols].isna().sum())

In [197]:
# Names of the columns that have object dtype.
cat = df.select_dtypes(include='object').columns

cat

In [198]:
def preprocess(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps:
      * rows without a ``Score`` are removed;
      * in ``Studios``, ``Genres``, ``Rating``, ``Source`` and ``Type`` both
        missing values and empty strings become ``'Unknown'``;
      * missing ``Episodes`` / ``Ranked`` values are replaced by the median
        of the respective column (computed after the row removal).
    """
    cleaned = df.copy().dropna(subset=['Score'])

    for name in ('Studios', 'Genres', 'Rating', 'Source', 'Type'):
        cleaned[name] = cleaned[name].fillna('Unknown').replace('', 'Unknown')

    for name in ('Episodes', 'Ranked'):
        column_median = cleaned[name].median()
        cleaned[name] = cleaned[name].fillna(column_median)

    return cleaned

In [199]:
# Run the cleaning function; df is rebound to the cleaned copy it returns.
df = preprocess(df)

In [200]:
# 'Genres' holds a ', '-separated list, so build one indicator column per
# distinct genre (a row can set several of them).
genre_dummies = df['Genres'].str.get_dummies(sep=', ')

# Append the indicator columns to the right of the existing columns.
df = pd.concat(objs=[df, genre_dummies], axis='columns')

In [None]:
# Inspect the frame after the genre indicator columns were added.
df.info()

In [202]:
# 'Premiered' is split on the first space into a season part and a year part;
# only the year part is kept. Taking element 1 of the split (instead of
# expanding into two columns) avoids the throwaway 'Season' column and does not
# fail when the split yields a number of pieces other than two. Values with no
# second part (e.g. 'Unknown') and non-numeric years become NaN.
df['Year'] = pd.to_numeric(
    df['Premiered'].str.split(' ', n=1).str[1],
    errors='coerce',
)

In [203]:
# Replace the premiere year with an age in years relative to the hard-coded
# reference year 2025 (the column keeps the name 'Year').
# NOTE(review): not idempotent — running this cell a second time turns the
# ages back into years.
if 'Year' in df.columns:
    df['Year'] = 2025 - df['Year']

In [204]:
if 'Year' in df.columns:
    # 'Year' holds an age in years at this point (2025 minus the premiere year).
    # Right-closed bins: (-1, 2] new, (2, 5] recent, (5, 10] modern,
    # (10, 20] old, (20, inf) classic.
    df['age_category'] = pd.cut(
        df['Year'],
        bins=[-1, 2, 5, 10, 20, np.inf],
        labels=['new', 'recent', 'modern', 'old', 'classic'],
    )
    # Rows with an unknown age (NaN) fall into no bin. Without dummy_na they
    # would get all-zero indicators and, because drop_first removes the 'new'
    # column, be indistinguishable from 'new' titles. dummy_na=True gives
    # them their own 'age_category_nan' indicator instead.
    df = pd.get_dummies(
        df,
        columns=['age_category'],
        drop_first=True,
        dummy_na=True,
    )
    df = df.drop(columns=['Year'])

In [205]:
if 'Episodes' in df.columns:
    # Missing episode counts were filled with 0 earlier in the notebook. With a
    # first bin of (0, 1] those zeros fell outside every bin, became NaN and,
    # after drop_first removed the 'single' column, were encoded exactly like
    # single-episode titles. An explicit 'unknown' bin keeps them apart; as the
    # first category it is the one drop_first removes, so all-zero indicators
    # now mean "unknown" and 'single' gets its own column.
    # Right-closed bins: 0 (unknown), 1, 2-10, 11-18, 19-26, 27-57, 58+.
    df['episode_cat'] = pd.cut(
        df['Episodes'],
        bins=[-1, 0, 1, 10, 18, 26, 57, np.inf],
        labels=['unknown', 'single', 'short', 'one_season',
                'two_season', 'long', 'very_long'],
    )
    df = pd.get_dummies(df, columns=['episode_cat'], drop_first=True)
    df = df.drop(columns=['Episodes'])

In [None]:
# Inspect the frame after the age and episode-count binning.
df.info()

In [207]:
# Number of rows per distinct 'Type' value, most frequent first.
freq = df["Type"].value_counts()
print("Printing the frequency")
display(freq)

In [208]:
# Number of rows per distinct 'Rating' value (freq is rebound to the new counts).
freq = df['Rating'].value_counts()
display(freq)

In [209]:
from sklearn.preprocessing import OneHotEncoder

In [210]:
# Columns passed to the one-hot encoder below.
categorical_columns = ['Type', 'Rating']

In [211]:
# One-hot encoder; categories='auto' lets it determine the categories from the data it is fitted on.
encoder = OneHotEncoder(categories='auto')

In [212]:
# Fit the encoder on the two categorical columns and convert the result to a dense array.
onehot_encoded = encoder.fit_transform(df[categorical_columns]).toarray()

In [213]:
# Wrap the encoded array in a frame labelled with the encoder's feature names.
# The array carries no row labels, so df's index is passed explicitly: df has
# had rows removed, and a default 0..n-1 index would not line up with it if the
# two frames are ever combined.
one_hot_df = pd.DataFrame(
    onehot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

In [None]:
# Show the one-hot encoded frame.
display(one_hot_df)

In [215]:
# One 'Type_<value>' indicator column per distinct 'Type', appended to df.
type_dummies = pd.get_dummies(df['Type'], prefix='Type')
df = pd.concat(objs=[df, type_dummies], axis='columns')

In [216]:
# Keep only the part of 'Rating' that precedes the first ' - ' separator;
# values without the separator (such as 'Unknown') are kept whole.
# Taking element 0 of the split (instead of expanding into two columns) avoids
# the throwaway 'Rating_info' column and does not fail when the split yields
# a number of pieces other than two.
df['Rating_index'] = df['Rating'].str.split(' - ', n=1).str[0]

# One 'Rating_<code>' indicator column per distinct code, appended to df.
rating_dummies = pd.get_dummies(df['Rating_index'], prefix='Rating')
df = pd.concat([df, rating_dummies], axis=1)

In [217]:
# Preview the first rows after the type and rating encoding.
df.head()

In [218]:
# Remove the text columns that have been encoded, the helper column
# 'Rating_index', and one indicator from each of the Type and Rating groups
# ('Type_TV', 'Rating_PG-13') plus the 'Rating_Unknown' indicator.
df = df.drop(
    columns=[
        'Genres', 'Type', 'Type_TV', 'Rating', 'Premiered',
        'Rating_index', 'Rating_PG-13', 'Rating_Unknown',
    ]
)

In [221]:
# df.drop(labels=['Rating_Unknown'], axis='columns', inplace=True)

In [222]:
# Keep the ten most frequent 'Studios' labels and fold every other label into 'Other'.
top_studios = df['Studios'].value_counts().nlargest(10).index
df['Studios'] = df['Studios'].where(df['Studios'].isin(top_studios), 'Other')

In [None]:
# Show the ten most frequent studio labels after the grouping.
display(df['Studios'].value_counts().head(10))

In [224]:
# One 'Studio_<label>' indicator column per remaining studio label, appended to df.
studio_dummies = pd.get_dummies(df['Studios'], prefix='Studio')
df = pd.concat(objs=[df, studio_dummies], axis='columns')

In [None]:
# Inspect the frame after the studio indicator columns were added.
df.info()

In [226]:
# Remove the raw 'Studios' text column together with the 'Studio_Unknown'
# and 'Unknown' indicator columns.
df = df.drop(columns=['Studios', 'Studio_Unknown', 'Unknown'])

In [227]:
# 'Source' is not encoded in this notebook, so it is removed.
df = df.drop(columns=['Source'])

In [228]:
# Inspect the frame after the remaining text columns were dropped.
df.info()

In [229]:
# Cast every boolean column to 0/1 integers; all other columns are left as they are.
bool_cols = df.select_dtypes('bool').columns
df = df.astype({name: int for name in bool_cols})

In [230]:
# The ten 'Score-N' columns and 'Ranked' are kept out of the feature matrix
# (presumably because they are derived from the target — confirm).
# NOTE(review): 'num_scored', built earlier as the sum of the 'Score-N'
# columns, is not in this list and therefore stays in X — confirm that is intended.
score_related_cols = [
    'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
    'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1',
    'Ranked',
]

# Features: every column except the target, the identifier and the columns listed above.
X = df.drop(columns=['Score', 'MAL_ID', *score_related_cols])
# Target: the 'Score' column.
y = df['Score']

In [231]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Inspect the training feature frame.
X_train.info()

In [233]:
# Fit the min-max scaler on the training split only and rescale the listed
# columns there.
scaler = MinMaxScaler()

# NOTE: numeric_cols is rebound here to the columns to scale; earlier in the
# notebook the same name held a different list.
numeric_cols = ['Popularity', 'Favorites', 'Completed', 'Dropped', 'Plan to Watch']
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])

In [234]:
# Apply the scaler fitted on the training split; it is not refitted on the test rows.
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [235]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Random forest with 200 trees, a fixed seed and all CPU cores.
model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE:", mse)
print("R2:", r2)


In [None]:
# Inspect the full feature frame.
X.info()

In [236]:
import matplotlib.pyplot as plt
import pandas as pd

# Pair each importance of the fitted forest with its feature name.
importances = pd.Series(model.feature_importances_, index=X.columns)

# Horizontal bar chart of the 20 largest importances.
ax = importances.nlargest(20).plot.barh(figsize=(8, 6))
ax.set_title("Top-20 Feature Importances")
plt.show()

In [237]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validated R2 of the same estimator on the full X / y.
# NOTE(review): X is the feature frame as built before the train/test split;
# the min-max scaling above was applied to X_train / X_test only.
scores = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=5, n_jobs=-1)
print("R2:", scores)
print("Mean R2:", scores.mean())