In [28]:
""" Importing Libraries """

import pandas as pd
import joblib
import os
import pyreadstat

In [29]:
""" Print local path """

def print_local_path():
    """Prints the current working directory."""
    local_path = os.getcwd()
    print(f"Current working directory: {local_path}")

def print_notebook_directory():
    """Prints the directory where this notebook is located."""
    # Para notebooks, podemos usar uma abordagem alternativa
    import inspect
    import pathlib
    
    # Obtém o diretório atual de trabalho
    current_dir = pathlib.Path.cwd()
    print(f"Current working directory: {current_dir}")
    
    # Para o diretório específico do notebook, usamos o diretório atual
    notebook_dir = current_dir / "Descriptive_Learning"
    print(f"Notebook directory: {notebook_dir}")

if __name__ == "__main__":
    print_local_path()
    print_notebook_directory()
    # Example usage

Current working directory: b:\GitHub\UFMG\FUNA_EMM\Descriptive_Learning
Current working directory: b:\GitHub\UFMG\FUNA_EMM\Descriptive_Learning
Notebook directory: b:\GitHub\UFMG\FUNA_EMM\Descriptive_Learning\Descriptive_Learning


In [34]:
""" Loading Data """

def load_data(path):
    # Método 1: Tentar com pandas (funciona para alguns arquivos SPSS)
    try:
        data = pd.read_spss(path)
        print(f"Dados carregados com pandas! Shape: {data.shape}")
        print("Primeiras linhas:")
        print(data.head())
        return data
    except Exception as e:
        print(f"Pandas falhou: {e}")

    # Método 2: Tentar com pyreadstat
    try:
        data, meta = pyreadstat.read_sav(path)
        print(f"Dados carregados com pyreadstat! Shape: {data.shape}")
        print("Primeiras linhas:")
        print(data.head())
        return data
    except ImportError:
        print(
            "pyreadstat não está instalado. Você pode instalar com: pip install pyreadstat")
    except Exception as e:
        print(f"pyreadstat falhou: {e}")

    # Método 3: Verificar se é realmente um arquivo pickle/joblib
    try:
        with open(path, 'rb') as f:
            # Verificar os primeiros bytes para determinar o tipo
            first_bytes = f.read(10)
            print(f"Primeiros bytes do arquivo: {first_bytes}")

        # Se chegou até aqui, tentar joblib mesmo assim
        data = joblib.load(path)
        print(f"Dados carregados com joblib! Shape: {data.shape}")
        print(data.head())
        return data
    except Exception as e:
        print(f"Joblib também falhou: {e}")

    print("Todos os métodos falharam. Este pode ser um arquivo SPSS que precisa de tratamento especial.")
    return None

In [36]:
""" Importing FUNA """

funa_path = '../data_input/FUNA/slice/'

path_targ = funa_path + 'target.pq'
path_desc = funa_path + 'desc.pq'

df_targ = pd.read_parquet(path_targ)
df_desc = pd.read_parquet(path_desc)

In [56]:
""" Carregando dados Curran - Versão Corrigida """

CURRAN_DATA_PATH = '../data_input/Curran/'

def load_curran_sav_data(file_name):
    """Load Curran .sav data file with correct path and filename."""
    sav_path = CURRAN_DATA_PATH + file_name + '.sav'
    
    if not os.path.exists(sav_path):
        print(f"Arquivo não encontrado: {sav_path}")
        return None
    
    print(f"Tentando carregar: {sav_path}")
    loaded_data = load_data(sav_path)
    return loaded_data

# Tentar carregar os arquivos
print("=== Carregando CurranData.sav ===")
curran_data = load_curran_sav_data('CurranData')

print("\n=== Carregando CurranLong.sav ===")
curran_long = load_curran_sav_data('CurranLong')

=== Carregando CurranData.sav ===
Tentando carregar: ../data_input/Curran/CurranData.sav
Dados carregados com pandas! Shape: (405, 15)
Primeiras linhas:
      id  anti1  anti2  anti3  anti4  read1  read2  read3  read4 kidgen  \
0   22.0    1.0    2.0    NaN    NaN    2.1    3.9    NaN    NaN   girl   
1   34.0    3.0    6.0    4.0    5.0    2.1    2.9    4.5    4.5    boy   
2   58.0    0.0    2.0    0.0    1.0    2.3    4.5    4.2    4.6   girl   
3  122.0    0.0    3.0    1.0    1.0    3.7    8.0    NaN    NaN    boy   
4  125.0    1.0    1.0    2.0    1.0    2.3    3.8    4.3    6.2   girl   

   momage  kidage  homecog  homeemo  nmis  
0    28.0    6.08     13.0     10.0   4.0  
1    28.0    6.83      9.0      9.0   0.0  
2    28.0    6.50      9.0      6.0   0.0  
3    28.0    7.83     13.0     10.0   2.0  
4    29.0    7.42     10.0      8.0   0.0  

=== Carregando CurranLong.sav ===
Tentando carregar: ../data_input/Curran/CurranLong.sav
Dados carregados com pandas! Shape: (1393,

In [58]:
""" Loading Curran parquet files """

curran_parquet = {
    'desc': pd.read_parquet(CURRAN_DATA_PATH + 'data/desc.pq'),
    'long': pd.read_parquet(CURRAN_DATA_PATH + 'data/long.pq'),
    'target': pd.read_parquet(CURRAN_DATA_PATH + 'data/target.pq'),
    'wide': pd.read_parquet(CURRAN_DATA_PATH + 'data/wide.pq')
}

In [70]:
""" Showing data info """

def show_data_info(df):
    """Prints the shape and data types of the DataFrame."""
    print(f"DataFrame shape: {df.shape}")
    # Group data types and list the columns for each type
    grouped_types = df.dtypes.value_counts()
    print("Grouped data types:")
    for dtype, count in grouped_types.items():
        cols = df.columns[df.dtypes == dtype].tolist()
        sorted_cols = sorted(cols)
        string_cols = ', '.join(sorted_cols)
        print(f"\t{dtype}: {count} columns -> [{string_cols}]")

In [64]:
""" Exploring FUNA data """

if df_targ is not None:
    print("\nTarget DataFrame Info:")
    show_data_info(df_targ)
if df_desc is not None:
    print("\nDescription DataFrame Info:")
    show_data_info(df_desc)


Target DataFrame Info:
DataFrame shape: (13135, 4)

Grouped data types:
int64: 3 columns -> [DMStimL, DMTime, PreOrd]
object: 1 columns -> [IDCode]

Description DataFrame Info:
DataFrame shape: (774, 32)

Grouped data types:
float64: 28 columns -> [CAAnsCprop, CAAnsCsum, CAIES, CAPreOrdmax, CAtimeCmean, CAtimeCmedian, NCAnsCprop, NCAnsCsum, NCIES, NCPreOrdmax, NCRTinterceptNumDis, NCRTinterceptNumRatio, NCRTslopeNumDis, NCRTslopeNumRatio, NCtimeCmean, NCtimeCmedian, SAAnsCprop, SAAnsCsum, SAIES, SAPreOrdmax, SAtimeCmean, SAtimeCmedian, SSAnsCprop, SSAnsCsum, SSIES, SSPreOrdmax, SStimeCmean, SStimeCmedian]
object: 3 columns -> [IDCode, language, sex]
category: 1 columns -> [grade]


In [74]:
""" Exploring curran data """

# Exibir informações dos DataFrames carregados
if curran_data is not None:
    print("\nCurranData Info:")
    show_data_info(curran_data)

if curran_long is not None:
    print("\nCurranLong Info:")
    show_data_info(curran_long)


CurranData Info:
DataFrame shape: (405, 15)
Grouped data types:
	float64: 14 columns -> [anti1, anti2, anti3, anti4, homecog, homeemo, id, kidage, momage, nmis, read1, read2, read3, read4]
	category: 1 columns -> [kidgen]

CurranLong Info:
DataFrame shape: (1393, 14)
Grouped data types:
	float64: 13 columns -> [anti, homecog, homeemo, id, kidage, kidage6, kidagec, kidagesq, kidagetv, momage, occasion, occasion2, read]
	category: 1 columns -> [kidgen]


## Showing Heads

In [51]:
""" Checking heads of DataFrames """

def print_dataframe_head(df, num_rows=5):
    """Prints the first few rows of the DataFrame."""
    data_frame_rows = df.shape[0]
    print(f"\nFirst {num_rows} rows of the {data_frame_rows} rows DataFrame:")
    print(df.head(num_rows))


### Curran


In [66]:
""" Curran Data """

if curran_data is not None:
    print_dataframe_head(curran_data)


First 5 rows of the 405 rows DataFrame:
      id  anti1  anti2  anti3  anti4  read1  read2  read3  read4 kidgen  \
0   22.0    1.0    2.0    NaN    NaN    2.1    3.9    NaN    NaN   girl   
1   34.0    3.0    6.0    4.0    5.0    2.1    2.9    4.5    4.5    boy   
2   58.0    0.0    2.0    0.0    1.0    2.3    4.5    4.2    4.6   girl   
3  122.0    0.0    3.0    1.0    1.0    3.7    8.0    NaN    NaN    boy   
4  125.0    1.0    1.0    2.0    1.0    2.3    3.8    4.3    6.2   girl   

   momage  kidage  homecog  homeemo  nmis  
0    28.0    6.08     13.0     10.0   4.0  
1    28.0    6.83      9.0      9.0   0.0  
2    28.0    6.50      9.0      6.0   0.0  
3    28.0    7.83     13.0     10.0   2.0  
4    29.0    7.42     10.0      8.0   0.0  


In [67]:
""" Curran Long """

if curran_long is not None:
    print_dataframe_head(curran_long)


First 5 rows of the 1393 rows DataFrame:
     id kidgen  momage  kidage  homecog  homeemo  occasion  anti  read  \
0  22.0   girl    28.0    6.08     13.0     10.0       0.0   1.0   2.1   
1  22.0   girl    28.0    6.08     13.0     10.0       1.0   2.0   3.9   
2  34.0    boy    28.0    6.83      9.0      9.0       0.0   3.0   2.1   
3  34.0    boy    28.0    6.83      9.0      9.0       1.0   6.0   2.9   
4  34.0    boy    28.0    6.83      9.0      9.0       2.0   4.0   4.5   

   kidagetv  kidage6  kidagesq     kidagec  occasion2  
0      6.08     0.08    0.0064    0.000512        0.0  
1      8.08     2.08    4.3264    8.998912        1.0  
2      6.83     0.83    0.6889    0.571787        0.0  
3      8.83     2.83    8.0089   22.665187        1.0  
4     10.83     4.83   23.3289  112.678587        4.0  


### Curran Parquets


In [71]:
""" Curran Parquets """

for key, df in curran_parquet.items():
    print(f"\n=== {key.capitalize()} DataFrame ===")
    show_data_info(df)
    # print_dataframe_head(df)


=== Desc DataFrame ===
DataFrame shape: (405, 17)
Grouped data types:
	float64: 16 columns -> [homecog, homeemo, kidgen, max_age, max_cog, max_emo, min_age, min_cog, min_emo, momage, slope_age, slope_cog, slope_emo, sum_age, sum_cog, sum_emo]
	object: 1 columns -> [id]

=== Long DataFrame ===
DataFrame shape: (1325, 8)
Grouped data types:
	float64: 7 columns -> [homecog, homeemo, intcog, kidagetv, kidgen, momage, occasion]
	object: 1 columns -> [id]

=== Target DataFrame ===
DataFrame shape: (1325, 4)
Grouped data types:
	float64: 3 columns -> [kidagetv, occasion, read]
	object: 1 columns -> [id]

=== Wide DataFrame ===
DataFrame shape: (405, 18)
Grouped data types:
	float64: 17 columns -> [homecog, homeemo, intage_0, intage_1, intage_2, intage_3, intcog_0, intcog_1, intcog_2, intcog_3, intemo_0, intemo_1, intemo_2, intemo_3, kidage, kidgen, momage]
	object: 1 columns -> [id]


In [73]:
""" Heads of Curran Parquets: desc """

print_dataframe_head(curran_parquet['wide'])


First 5 rows of the 405 rows DataFrame:
      id  kidgen  momage  kidage  homecog  homeemo  intage_0  intage_1  \
0   id22     1.0    28.0    6.08     13.0     10.0    170.24    226.24   
1   id34     0.0    28.0    6.83      9.0      9.0    191.24    247.24   
2   id58     1.0    28.0    6.50      9.0      6.0    182.00    238.00   
3  id122     0.0    28.0    7.83     13.0     10.0    219.24    275.24   
4  id125     1.0    29.0    7.42     10.0      8.0    215.18    273.18   

   intage_2  intage_3  intemo_0  intemo_1  intemo_2  intemo_3  intcog_0  \
0       NaN       NaN     60.80     80.80       NaN       NaN     79.04   
1    303.24    359.24     61.47     79.47     97.47    115.47     61.47   
2    294.00    350.00     39.00     51.00     63.00     75.00     58.50   
3       NaN       NaN     78.30     98.30       NaN       NaN    101.79   
4    331.18    389.18     59.36     75.36     91.36    107.36     74.20   

   intcog_1  intcog_2  intcog_3  
0    105.04       NaN       N

### FUNA

In [54]:
""" FUNA DESC """

if df_desc is not None:
    print_dataframe_head(df_desc)


First 5 rows of the 774 rows DataFrame:
         IDCode  NCAnsCsum  NCPreOrdmax  NCAnsCprop  NCtimeCmean  \
0  f3fa-bako357   0.916667         52.0    0.916667     0.124587   
1  f3fa-bane815   0.937500         52.0    0.937500     0.167398   
2   f3fa-baqy06   0.979167         52.0    0.979167     0.095500   
3   f3fa-boky29   0.979167         52.0    0.979167     0.133777   
4  f3fa-bolu326   0.979167         52.0    0.979167     0.139183   

   NCtimeCmedian     NCIES  NCRTslopeNumDis  NCRTinterceptNumDis  \
0       0.170089  0.028049         0.394925             0.351265   
1       0.257034  0.043245         0.399024             0.379670   
2       0.168568  0.025484         0.390103             0.333689   
3       0.214449  0.033593         0.382825             0.369720   
4       0.221039  0.034758         0.384475             0.372100   

   NCRTslopeNumRatio  ...     SSIES  CAAnsCsum  CAPreOrdmax  CAAnsCprop  \
0           0.426656  ...  0.164063   0.125000     0.103093    0.7

In [55]:
""" FUNA TARGET """

# Exibir as primeiras linhas dos DataFrames
if df_targ is not None:
    print_dataframe_head(df_targ)


First 5 rows of the 13135 rows DataFrame:
         IDCode  PreOrd  DMStimL  DMTime
0  f3fa-bako357       2        4     199
1  f3fa-bako357       4        8    1105
2  f3fa-bako357       6        3    2552
3  f3fa-bako357       9        1    1085
4  f3fa-bako357      15        1    1245


In [78]:
""" Explorando a coluna categórica 'kidgen' """

print("=== Análise da coluna 'kidgen' ===")

# Verificar nos dados originais (SPSS)
# if curran_data is not None:
#     print("\nCurranData - kidgen:")
#     print(f"Tipo: {curran_data['kidgen'].dtype}")
#     print(f"Valores únicos: {curran_data['kidgen'].unique()}")
#     print(f"Contagem de valores:")
#     print(curran_data['kidgen'].value_counts())
#     print(f"Categorias: {curran_data['kidgen'].cat.categories if hasattr(curran_data['kidgen'], 'cat') else 'N/A'}")

# if curran_long is not None:
#     print("\nCurranLong - kidgen:")
#     print(f"Tipo: {curran_long['kidgen'].dtype}")
#     print(f"Valores únicos: {curran_long['kidgen'].unique()}")
#     print(f"Contagem de valores:")
#     print(curran_long['kidgen'].value_counts())
#     print(f"Categorias: {curran_long['kidgen'].cat.categories if hasattr(curran_long['kidgen'], 'cat') else 'N/A'}")

# FUNA DESC
if df_desc is not None:
    print("\nFUNA DESC - grade:")
    print(f"Tipo: {df_desc['grade'].dtype}")
    print(f"Valores únicos: {df_desc['grade'].unique()}")
    print(f"Contagem de valores:")
    print(df_desc['grade'].value_counts())
    print(f"Categorias: {df_desc['grade'].cat.categories if hasattr(df_desc['grade'], 'cat') else 'N/A'}")


=== Análise da coluna 'kidgen' ===

FUNA DESC - grade:
Tipo: category
Valores únicos: ['3', '4', '5', '6', '7', '8', '9']
Categories (7, object): ['3', '4', '5', '6', '7', '8', '9']
Contagem de valores:
grade
3    241
4    181
7    164
5     69
6     65
8     40
9     14
Name: count, dtype: int64
Categorias: Index(['3', '4', '5', '6', '7', '8', '9'], dtype='object')
