In [None]:
import pandas as pd

In [None]:
# Load the anime dataset from the CSV file under ../data into a DataFrame.
df = pd.read_csv('../data/anime.csv')

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Remove, in place, the columns that take no part in the later processing.
unused_columns = [
    'Name', 'English name', 'Japanese name', 'Aired', 'Producers',
    'Licensors', 'Duration', 'Members', 'Watching', 'On-Hold',
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Re-check the remaining columns and dtypes after the column drop.
df.info()

In [None]:
# Print the Score column to inspect its values before numeric conversion.
print(df["Score"])

In [None]:
# Print the Episodes column to inspect its values before numeric conversion.
print(df["Episodes"])

In [None]:
# Columns that are converted to numeric dtype: the overall score, the episode
# count, the rank, and the ten per-score columns 'Score-10' down to 'Score-1'.
numeric_cols = ['Score', 'Episodes', 'Ranked'] + [
    f'Score-{points}' for points in range(10, 0, -1)
]

In [None]:
# Convert every column in numeric_cols to a numeric dtype in one pass;
# values that cannot be parsed become NaN (errors='coerce').
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Count the NaN entries per numeric column (includes values that failed the numeric coercion).
print(df[numeric_cols].isna().sum())

In [None]:
# Replace missing episode counts with 0.
episodes_filled = df['Episodes'].fillna(0)
df['Episodes'] = episodes_filled

In [None]:
# The ten per-score columns, 'Score-10' down to 'Score-1'; missing entries become 0.
score_cols = [f'Score-{points}' for points in range(10, 0, -1)]
df[score_cols] = df[score_cols].fillna(0)

In [None]:
# Re-count the NaN entries per numeric column after the zero-fills.
print(df[numeric_cols].isna().sum())

In [None]:
# Remove, in place, every row whose Score is missing.
df.dropna(axis='index', subset=['Score'], inplace=True)

In [None]:
# Re-count the NaN entries per numeric column after removing rows without a Score.
print(df[numeric_cols].isna().sum())

In [None]:
# value_counts() yields one entry per distinct non-null Studios value, so
# counting those entries gives the number of distinct studio labels.
studio_counts = df["Studios"].value_counts()
freq = studio_counts.count()
print("Printing the frequency")
display(freq)

In [None]:
# Names of the columns that are still stored with object dtype.
cat = df.select_dtypes(include=object).columns

cat

In [None]:
# Split the ', '-separated Genres string into one 0/1 indicator column per genre.
genre_dummies = df['Genres'].str.get_dummies(', ')

# Append the indicator columns to the right of the existing frame.
df = pd.concat((df, genre_dummies), axis='columns')

In [None]:
# Inspect the columns and dtypes after appending the genre indicator columns.
df.info()

In [None]:
# Derive a numeric Year from Premiered by taking the text after the first space.
# Splitting at most once and picking the second piece with .str[1] avoids the
# two-column assignment, which raises when the split does not yield exactly two
# columns, and avoids creating a Season column only to drop it again.
# Values without a second piece, or with a non-numeric one, become NaN.
premiered_year = df['Premiered'].str.split(' ', n=1).str[1]
df['Year'] = pd.to_numeric(premiered_year, errors='coerce')

In [None]:
# Inspect the columns and dtypes after deriving the Year column.
df.info()

In [None]:
# Replace missing years with 0. Assign the result back instead of calling
# fillna(inplace=True) on the selected column: the in-place call on a column
# selection is a chained assignment that pandas warns about and that does not
# update df under Copy-on-Write.
df['Year'] = df['Year'].fillna(0)

In [None]:
# Tally how many rows carry each Type value.
type_column = df["Type"]
freq = type_column.value_counts()
print("Printing the frequency")
display(freq)

In [None]:
# Tally how many rows carry each Rating value.
rating_column = df['Rating']
freq = rating_column.value_counts()
display(freq)

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Columns passed to the one-hot encoder below.
categorical_columns = ['Type', 'Rating']

In [None]:
# One-hot encoder; categories='auto' lets it determine the categories from the data it is fitted on.
encoder = OneHotEncoder(categories='auto')

In [None]:
# Fit the encoder on the categorical columns and convert its result to a dense array.
categorical_frame = df[categorical_columns]
sparse_encoded = encoder.fit_transform(categorical_frame)
onehot_encoded = sparse_encoded.toarray()

In [None]:
# Wrap the encoded matrix in a DataFrame with one labelled column per category.
# Reuse df's index: rows may have been dropped from df earlier without an index
# reset, so the default 0..n-1 index would not line up with df in a later
# join or concat.
one_hot_df = pd.DataFrame(
    onehot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

In [None]:
# Show the one-hot encoded Type/Rating frame.
display(one_hot_df)

In [None]:
# Remove the raw Genres column in place.
df.drop(columns=['Genres'], inplace=True)

Object-dtype columns (excerpt from `df.info()` output):

 3   Type           12421 non-null  object
 6   Studios        12421 non-null  object
 8   Rating         12421 non-null  object




In [None]:
# Count how many rows carry each Rating value.
freq = df['Rating'].value_counts()

In [None]:
# Show the counts stored in freq.
display(freq)

In [None]:
# One indicator column per Type value, each named 'Type_<value>',
# appended to the right of the existing frame.
type_dummies = pd.get_dummies(data=df['Type'], prefix='Type')
df = pd.concat((df, type_dummies), axis='columns')

In [None]:
# Inspect the columns and dtypes after appending the Type indicator columns.
df.info()

In [None]:
# Keep only the part of Rating that precedes the first ' - ' separator; the
# text after it is discarded. Splitting at most once and taking the first
# piece avoids the two-column assignment, which raises when the split does not
# yield exactly two columns, and needs no temporary Rating_info column.
# Values without the separator are kept whole.
df['Rating_index'] = df['Rating'].str.split(' - ', n=1).str[0]

# One indicator column per rating code, each named 'Rating_<code>'.
rating_dummies = pd.get_dummies(df['Rating_index'], prefix='Rating')
df = pd.concat([df, rating_dummies], axis=1)

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Remove the helper Rating_index column in place, then list the resulting columns.
df.drop(columns=['Rating_index'], inplace=True)
df.info()

In [None]:
# Remove the Rating_Unknown indicator column in place.
df.drop(columns=['Rating_Unknown'], inplace=True)

In [None]:
# Keep the ten most frequent Studios values; every other value becomes 'Other'.
top_studios = df['Studios'].value_counts().nlargest(10).index
is_top_studio = df['Studios'].isin(top_studios)
df['Studios'] = df['Studios'].where(is_top_studio, 'Other')


In [None]:
# Show the row counts of the ten most frequent Studios values.
display(df['Studios'].value_counts().head(10))

In [None]:
# One indicator column per Studios value, each named 'Studio_<value>',
# appended to the right of the existing frame.
studio_dummies = pd.get_dummies(data=df['Studios'], prefix='Studio')
df = pd.concat((df, studio_dummies), axis='columns')

In [None]:
# Inspect the columns and dtypes after appending the Studio indicator columns.
df.info()

In [None]:
# Remove the raw categorical columns; a missing one is still an error.
df.drop(columns=['Studios', 'Type', 'Rating'], inplace=True)
# 'Studio_Unknown' exists only when 'Unknown' is among the ten most frequent
# studios (otherwise those rows are bucketed as 'Other'), so drop it only if
# present instead of raising KeyError.
df.drop(columns=['Studio_Unknown'], errors='ignore', inplace=True)

In [None]:
# Show summary statistics for the frame's columns.
df.describe()

In [None]:
# Convert every boolean column to integers.
bool_cols = df.select_dtypes(include='bool').columns

df = df.astype({column: int for column in bool_cols})

In [None]:
# Columns excluded from the features together with the target and the id:
# the per-score columns and the rank.
score_related_cols = ['Score-10','Score-9','Score-8','Score-7',
                      'Score-6','Score-5','Score-4','Score-3','Score-2','Score-1', 'Ranked']

# Feature matrix: everything except the target, the id and the score-related columns.
X = df.drop(columns=['Score', 'MAL_ID'] + score_related_cols)
# Keep only numeric/bool columns. Raw text columns are still present in df at
# this point (e.g. 'Premiered' is never dropped) and the estimator cannot be
# fitted on string values.
X = X.select_dtypes(include=['number', 'bool'])

# Target: the Score column.
y = df['Score']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 200-tree random forest, seeded for repeatability, using all available cores.
forest_params = {'n_estimators': 200, 'random_state': 42, 'n_jobs': -1}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

# Score the held-out test rows.
y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print("MSE:", test_mse)
print("R2:", test_r2)



In [None]:
# List the columns and dtypes of the feature matrix.
X.info()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Pair each fitted importance with its feature name, then chart the 20 largest
# as horizontal bars.
importances = pd.Series(model.feature_importances_, index=X.columns)
top_importances = importances.nlargest(20)

fig, ax = plt.subplots(figsize=(8, 6))
top_importances.plot(kind='barh', ax=ax)
ax.set_title("Top-20 Feature Importances")
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validated R2 of the model on the full feature matrix,
# with the folds evaluated in parallel.
scores = cross_val_score(
    estimator=model, X=X, y=y, scoring='r2', cv=5, n_jobs=-1
)
print("R2:", scores)
print("Mean R2:", scores.mean())