In [1]:
# Cargar ARFF y convertir a DataFrame
from scipy.io import arff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import os

# Path to the raw dataset, relative to the project root.
arff_file = 'data/raw/CEE_DATA.arff'

# The notebook is launched from /notebooks, so step up to the project root.
# Guarding on the data file keeps the cell re-runnable: an unconditional
# chdir("..") climbs one more directory on every execution and breaks the
# relative paths used below.
if not os.path.exists(arff_file):
    os.chdir("..")
print("Directorio actual:", os.getcwd())

# Load the ARFF file; only the first element of the result (the records) is used.
data = arff.loadarff(arff_file)
df = pd.DataFrame(data[0])

# Decode bytes values to str in every object-typed column; non-bytes values are
# left untouched.
for col in df.select_dtypes([object]).columns:
    df[col] = df[col].apply(lambda x: x.decode() if isinstance(x, bytes) else x)

# Categorical columns
categorical_cols = [
    'Gender',
    'Caste',
    'coaching',
    'time',
    'Class_ten_education',
    'twelve_education',
    'medium',
    'Father_occupation',
    'Mother_occupation',
]

# Basic EDA: relative frequency of every category, rounded to two decimals.
for col in categorical_cols:
    print(df[col].value_counts(normalize=True).round(2))

# Drop duplicate rows, then report the number of nulls per column.
df = df.drop_duplicates()
print(df.isnull().sum())

# Group the target: 'Average' and 'Good' are merged into a single class, while
# 'Vg' and 'Excellent' are kept as they are.
df['Performance_grouped'] = df['Performance'].replace({
    'Average': 'Average/Good',
    'Good': 'Average/Good',
    'Vg': 'Vg',
    'Excellent': 'Excellent',
})
# Integer-encode the grouped target.
y_grouped = LabelEncoder().fit_transform(df['Performance_grouped'])
# Combinar categorías raras
def combine_rare(df, col, threshold=0.2):
    """Collapse infrequent categories of ``df[col]`` into ``'OTHERS'``.

    The column is overwritten on the DataFrame that was passed in; nothing is
    returned.

    Args:
        df: DataFrame holding the column to clean up (modified in place).
        col: Name of the column whose rare values are merged.
        threshold: Categories whose relative frequency is strictly below this
            value are replaced.
    """
    # Relative frequency of each distinct value in the column.
    shares = df[col].value_counts(normalize=True)
    is_rare = shares.lt(threshold)
    rare_labels = shares.index[is_rare.to_numpy()]
    df[col] = df[col].replace(to_replace=rare_labels, value='OTHERS')
# Merge rare categories of every categorical column into 'OTHERS'.
for col in categorical_cols:
    combine_rare(df, col)

# Ordinal encoding of the two percentage-band columns; both share the same
# ordered scale, so each level maps to its position in ord_map.
# NOTE(review): the space in "Class_ X_Percentage" is kept exactly as written;
# presumably it matches the ARFF header - confirm against the dataset.
ordinal_cols = ["Class_ X_Percentage", "Class_XII_Percentage"]
ord_map = ["Poor", "Average", "Good", "Vg", "Excellent"]
ord_enc = OrdinalEncoder(categories=[ord_map, ord_map])
df[['Class_X_num', 'Class_XII_num']] = ord_enc.fit_transform(df[ordinal_cols])
# Academic score: row-wise mean of the two encoded levels.
df['Academic_Score'] = df[['Class_X_num', 'Class_XII_num']].mean(axis=1)

# Frequency encoding: each category is mapped to its relative frequency,
# computed after the rare categories were merged.
for col in categorical_cols:
    df[col + '_freq'] = df[col].map(df[col].value_counts(normalize=True))

# Save the intermediate dataset. The target directory is created first so the
# cell also works on a fresh checkout where data/intermediate does not exist.
output_path = "data/intermediate/df.pkl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_pickle(output_path)


Directorio actual: C:\Users\Jorge Moya\Documents\Academic\I.T.E.S.M\M.E. Inteligencia Artificial Aplicada\4to Trimestre\Operaciones de aprendizaje automático\Tareas o Trabajos\ml-CEE_DATA-project
Gender
male      0.53
female    0.47
Name: proportion, dtype: float64
Caste
General    0.49
OBC        0.24
ST         0.16
SC         0.10
Name: proportion, dtype: float64
coaching
WA    0.67
NO    0.23
OA    0.10
Name: proportion, dtype: float64
time
TWO      0.55
ONE      0.30
THREE    0.13
FOUR     0.02
FIVE     0.00
SEVEN    0.00
Name: proportion, dtype: float64
Class_ten_education
SEBA      0.59
CBSE      0.37
OTHERS    0.03
Name: proportion, dtype: float64
twelve_education
AHSEC     0.55
CBSE      0.44
OTHERS    0.01
Name: proportion, dtype: float64
medium
ENGLISH     0.80
OTHERS      0.11
ASSAMESE    0.08
Name: proportion, dtype: float64
Father_occupation
OTHERS             0.42
SCHOOL_TEACHER     0.16
BUSINESS           0.15
DOCTOR             0.08
ENGINEER           0.07
COLLEGE_TEAC