# Regressão logística

In [None]:
!pip install kagglehub[pandas-datasets]



In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
import numpy as np

In [None]:
# Título do arquivo a ser descarregado
file_path = "Student_Performance.csv"

In [None]:
# Coleta de dados e armazenamento em dataframe Pandas
df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS,"nikhil7280/student-performance-multiple-linear-regression",file_path)

  df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS,"nikhil7280/student-performance-multiple-linear-regression",file_path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/nikhil7280/student-performance-multiple-linear-regression?dataset_version_number=1&file_name=Student_Performance.csv...


100%|██████████| 171k/171k [00:00<00:00, 328kB/s]


In [None]:
df.shape

(10000, 6)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [21]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [25]:
# Converte a variável alvo para tipo numérico
le = LabelEncoder()
y = le.fit_transform(y)
class_names = le.classes_
print(f"Mapeamento: {dict(zip(class_names, range(len(class_names))))}")

Mapeamento: {np.int64(0): 0, np.int64(1): 1}


In [26]:
# Identificar colunas numéricas para regressão
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

In [27]:
target_col = 'Extracurricular Activities'

In [29]:
# Para análise de correlação das features numéricas com a variável alvo (codificada) em regressão logística.
# A variável alvo 'Extracurricular Activities' precisa ser codificada para calcular a correlação numérica.
le_corr = LabelEncoder()
df_for_corr = df.copy()
df_for_corr[target_col + '_encoded'] = le_corr.fit_transform(df_for_corr[target_col])

# Adicionar a coluna codificada à lista de colunas numéricas temporariamente para cálculo de correlação
# Certifique-se de que a coluna target original não está em numeric_columns.
# Se target_col for 'Extracurricular Activities', ela não estará em numeric_columns.
all_numeric_for_corr = numeric_columns + [target_col + '_encoded']

# Calcular correlações e filtrar pela coluna alvo codificada
# Acessar a coluna alvo codificada do resultado da correlação
correlations = df_for_corr[all_numeric_for_corr].corr()[target_col + '_encoded'].abs().sort_values(ascending=False)

print("Correlações absolutas das features numéricas com a variável alvo 'Extracurricular Activities' (codificada):")
print(correlations)

Correlações absolutas das features numéricas com a variável alvo 'Extracurricular Activities' (codificada):
Extracurricular Activities_encoded    1.000000
Performance Index                     0.024525
Sleep Hours                           0.023284
Sample Question Papers Practiced      0.013103
Previous Scores                       0.008369
Hours Studied                         0.003873
Name: Extracurricular Activities_encoded, dtype: float64


Ao estabelecer a correlação entre as demais colunas com o Extracurricular Activities vemos que está é muito baixa o que indica pouca chance de sucesso.

In [None]:
y = df[target_col]

In [None]:
# Preparar features (X)
X = df.drop(target_col, axis=1)

In [None]:
# Dividir dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
print(f"Treino: {X_train.shape[0]} amostras")
print(f"Teste: {X_test.shape[0]} amostras")

Treino: 8000 amostras
Teste: 2000 amostras


In [None]:
# Padronizar features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Treinar modelo de regressão logística
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train)

In [None]:
# Fazer predições
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

In [None]:
# Fazer predições de probabilidade
y_test_proba = model.predict_proba(X_test_scaled)

In [None]:
# Avaliar modelo
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

In [None]:
# Resultados
print(f"Acurácia (treino): {train_accuracy:.4f} ({train_accuracy:.2%})")
print(f"Acurácia (teste):  {test_accuracy:.4f} ({test_accuracy:.2%})")

Acurácia (treino): 0.5590 (55.90%)
Acurácia (teste):  0.5805 (58.05%)


In [None]:
# Relatório de classificação detalhado
if 'class_names' in locals():
    target_names = [str(name) for name in class_names]
else:
    target_names = [f"Classe {i}" for i in sorted(np.unique(y))]

print(classification_report(y_test, y_test_pred, target_names=target_names))

              precision    recall  f1-score   support

          No       0.58      0.61      0.60      1010
         Yes       0.58      0.55      0.56       990

    accuracy                           0.58      2000
   macro avg       0.58      0.58      0.58      2000
weighted avg       0.58      0.58      0.58      2000

