In [23]:
import pandas as pd
import numpy as np
import ast
from datetime import datetime

# Load the raw profiles CSV and preview its first ten rows
raw_profiles_path = '../../data/raw/profiles.csv'
df_profiles = pd.read_csv(raw_profiles_path)
df_profiles.head(10)

Unnamed: 0,profile,gender,birthday,favorites_anime,link
0,DesolatePsyche,Male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche
1,baekbeans,Female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans
2,skrn,,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn
3,edgewalker00,Male,Sep 5,"['5680', '849', '2904', '3588', '37349']",https://myanimelist.net/profile/edgewalker00
4,aManOfCulture99,Male,"Oct 30, 1999","['4181', '7791', '9617', '5680', '2167', '4382...",https://myanimelist.net/profile/aManOfCulture99
5,eneri,,,"['5114', '4898', '2904', '1575', '1482']",https://myanimelist.net/profile/eneri
6,Waffle_Empress,,"May 29, 1996","['338', '322', '440', '199', '28223', '12815',...",https://myanimelist.net/profile/Waffle_Empress
7,NIGGER_BONER,Male,"Jan 1, 1985","['11061', '30', '6594', '28701', '10087', '674...",https://myanimelist.net/profile/NIGGER_BONER
8,jchang,Male,"Jul 29, 1992","['846', '2904', '5114', '2924', '72']",https://myanimelist.net/profile/jchang
9,shadowsplat,,,[],https://myanimelist.net/profile/shadowsplat


In [24]:
# Remove the profile URL column; it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because 'link' has already been dropped from df_profiles.
df_profiles = df_profiles.drop(columns=['link'], errors='ignore')
print(f'shape of the dataset: {df_profiles.shape}')
df_profiles.head(10)

shape of the dataset: (81727, 4)


Unnamed: 0,profile,gender,birthday,favorites_anime
0,DesolatePsyche,Male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2..."
1,baekbeans,Female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925..."
2,skrn,,,"['918', '2904', '11741', '17074', '23273', '32..."
3,edgewalker00,Male,Sep 5,"['5680', '849', '2904', '3588', '37349']"
4,aManOfCulture99,Male,"Oct 30, 1999","['4181', '7791', '9617', '5680', '2167', '4382..."
5,eneri,,,"['5114', '4898', '2904', '1575', '1482']"
6,Waffle_Empress,,"May 29, 1996","['338', '322', '440', '199', '28223', '12815',..."
7,NIGGER_BONER,Male,"Jan 1, 1985","['11061', '30', '6594', '28701', '10087', '674..."
8,jchang,Male,"Jul 29, 1992","['846', '2904', '5114', '2924', '72']"
9,shadowsplat,,,[]


In [25]:
# Remove rows that are exact duplicates across every column.
# NOTE(review): rows are compared in full, so profiles sharing a name but differing in
# another column are kept (the describe() output below reports 47885 unique profile
# names across 47902 rows) — confirm whether de-duplicating on 'profile' is intended.
df_profiles = df_profiles.drop_duplicates()

In [26]:
# (rows, columns) of df_profiles after the exact-duplicate rows were removed
df_profiles.shape

(47902, 4)

In [27]:
# Count missing values per column
df_profiles.isna().sum()

profile                0
gender             17012
birthday           21043
favorites_anime        0
dtype: int64

In [28]:
# Summary statistics, transposed so each original column becomes a row
df_profiles.describe().transpose()

Unnamed: 0,count,unique,top,freq
profile,47902,47885,Wumbo_Combo,2
gender,30890,3,Male,21197
birthday,26859,7708,1993,116
favorites_anime,47902,35395,[],10424


In [29]:
# Python type of the first row's birthday value (the column at position 2)
type(df_profiles['birthday'].iloc[0])

str

In [30]:
# Gender features: normalise the raw label, then expose it as 0/1 indicator columns

# Missing values become the literal label 'unknown'; case and surrounding whitespace are ignored
gender_normalised = (
    df_profiles['gender']
    .fillna('unknown')
    .astype(str)
    .str.lower()
    .str.strip()
)
df_profiles['gender'] = gender_normalised

# Explicit indicator columns for the model (any other label leaves all three at 0)
df_profiles['gender_male'] = gender_normalised.eq('male').astype(int)
df_profiles['gender_female'] = gender_normalised.eq('female').astype(int)
df_profiles['gender_unknown'] = gender_normalised.eq('unknown').astype(int)


In [31]:
# Birthday processing: extract birth year when full date is provided, compute age and missing flag
from datetime import datetime

def _extract_birth_year(x):
    if pd.isna(x):
        return np.nan
    s = str(x).strip()
    if s == '':
        return np.nan
    # Expect full format like: 'Jan 21, 1999'
    try:
        dt = datetime.strptime(s, "%b %d, %Y")
        return dt.year
    except Exception:
        # any other format (e.g. 'Jan 21' or malformed) -> treat as missing
        return np.nan

# Reference year for age computation is taken from the clock at execution time;
# birth_year is derived element-wise from the raw birthday strings
current_year = pd.Timestamp.now().year
df_profiles['birth_year'] = df_profiles['birthday'].map(_extract_birth_year)

def _compute_age(y):
    if pd.isna(y):
        return np.nan
    try:
        return int(current_year - int(y))
    except Exception:
        return np.nan

# Age per profile (NaN where no birth year is available) and a 0/1 flag marking those gaps
birth_years = df_profiles['birth_year']
df_profiles['age'] = birth_years.map(_compute_age)
df_profiles['age_missing'] = birth_years.isnull().astype(int)


In [32]:
# favorites_anime processing: convert string representations to lists, handle malformed/missing
import ast

def _parse_favorites(x):
    # Missing or NaN
    if pd.isna(x):
        return []
    # If it's already a list
    if isinstance(x, list):
        return [str(i) for i in x]
    s = str(x).strip()
    if s == '':
        return []
    try:
        val = ast.literal_eval(s)
        if isinstance(val, list):
            return [str(i) for i in val]
        else:
            return []
    except Exception:
        # Fallback: try to parse simple comma separated strings (without Python list syntax)
        try:
            parts = [p.strip().strip("'\"") for p in s.strip('[]').split(',') if p.strip() != '']
            return parts if parts != [''] else []
        except Exception:
            return []

# Parse the raw favourites once, then derive a count and a 0/1 presence flag from it
favorites_parsed = df_profiles['favorites_anime'].map(_parse_favorites)
df_profiles['favorites_list'] = favorites_parsed
df_profiles['num_favorites'] = favorites_parsed.map(len)
df_profiles['has_favorites'] = df_profiles['num_favorites'].gt(0).astype(int)


In [33]:
# Build the model-facing frame: a separate object from df_profiles without the raw text columns
raw_columns = ['gender', 'birthday', 'favorites_anime']
profiles_preprocessed = df_profiles.drop(columns=raw_columns)

# Collect the target dtype of every feature column that is present, then cast in one pass
int_cols = ['gender_male', 'gender_female', 'gender_unknown', 'age_missing', 'num_favorites', 'has_favorites']
dtype_by_column = {col: int for col in int_cols if col in profiles_preprocessed.columns}
# age stays float so that NaN can represent an unknown age
if 'age' in profiles_preprocessed.columns:
    dtype_by_column['age'] = float
profiles_preprocessed = profiles_preprocessed.astype(dtype_by_column)

# Sanity check of the result
print('profiles_preprocessed shape:', profiles_preprocessed.shape)
print(profiles_preprocessed.dtypes)
profiles_preprocessed.head()


profiles_preprocessed shape: (47902, 10)
profile            object
gender_male         int64
gender_female       int64
gender_unknown      int64
birth_year        float64
age               float64
age_missing         int64
favorites_list     object
num_favorites       int64
has_favorites       int64
dtype: object


Unnamed: 0,profile,gender_male,gender_female,gender_unknown,birth_year,age,age_missing,favorites_list,num_favorites,has_favorites
0,DesolatePsyche,1,0,0,1994.0,31.0,0,"[33352, 25013, 5530, 33674, 1482, 269, 18245, ...",20,1
1,baekbeans,0,1,0,2000.0,25.0,0,"[11061, 31964, 853, 20583, 918, 9253, 34599, 3...",10,1
2,skrn,0,0,1,,,1,"[918, 2904, 11741, 17074, 23273, 32281, 9989, ...",9,1
3,edgewalker00,1,0,0,,,1,"[5680, 849, 2904, 3588, 37349]",5,1
4,aManOfCulture99,1,0,0,1999.0,26.0,0,"[4181, 7791, 9617, 5680, 2167, 4382, 849, 235,...",10,1


In [35]:
# Missing values per column once the derived feature columns have been added
df_profiles.isna().sum()

profile                0
gender                 0
birthday           21043
favorites_anime        0
gender_male            0
gender_female          0
gender_unknown         0
birth_year         26926
age                26926
age_missing            0
favorites_list         0
num_favorites          0
has_favorites          0
dtype: int64

In [36]:
# Preview the first ten rows of df_profiles, raw and derived columns side by side
df_profiles.head(10)

Unnamed: 0,profile,gender,birthday,favorites_anime,gender_male,gender_female,gender_unknown,birth_year,age,age_missing,favorites_list,num_favorites,has_favorites
0,DesolatePsyche,male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2...",1,0,0,1994.0,31.0,0,"[33352, 25013, 5530, 33674, 1482, 269, 18245, ...",20,1
1,baekbeans,female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925...",0,1,0,2000.0,25.0,0,"[11061, 31964, 853, 20583, 918, 9253, 34599, 3...",10,1
2,skrn,unknown,,"['918', '2904', '11741', '17074', '23273', '32...",0,0,1,,,1,"[918, 2904, 11741, 17074, 23273, 32281, 9989, ...",9,1
3,edgewalker00,male,Sep 5,"['5680', '849', '2904', '3588', '37349']",1,0,0,,,1,"[5680, 849, 2904, 3588, 37349]",5,1
4,aManOfCulture99,male,"Oct 30, 1999","['4181', '7791', '9617', '5680', '2167', '4382...",1,0,0,1999.0,26.0,0,"[4181, 7791, 9617, 5680, 2167, 4382, 849, 235,...",10,1
5,eneri,unknown,,"['5114', '4898', '2904', '1575', '1482']",0,0,1,,,1,"[5114, 4898, 2904, 1575, 1482]",5,1
6,Waffle_Empress,unknown,"May 29, 1996","['338', '322', '440', '199', '28223', '12815',...",0,0,1,1996.0,29.0,0,"[338, 322, 440, 199, 28223, 12815, 2800, 18679...",10,1
7,NIGGER_BONER,male,"Jan 1, 1985","['11061', '30', '6594', '28701', '10087', '674...",1,0,0,1985.0,40.0,0,"[11061, 30, 6594, 28701, 10087, 6746, 918, 153...",10,1
8,jchang,male,"Jul 29, 1992","['846', '2904', '5114', '2924', '72']",1,0,0,1992.0,33.0,0,"[846, 2904, 5114, 2924, 72]",5,1
9,shadowsplat,unknown,,[],0,0,1,,,1,[],0,0


In [38]:
# Persist the working frame as Parquet without writing the index.
# NOTE(review): this saves df_profiles, which still carries the raw 'gender', 'birthday'
# and 'favorites_anime' columns; the profiles_preprocessed frame assembled earlier is not
# written anywhere in the visible cells — confirm which frame downstream code expects.
df_profiles.to_parquet("../../data/processed/profiles_processed.parquet", index=False)