In [17]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# Load the data
data= pd.read_csv('student-data4.csv')

# Explore the data
print("Data Head:\n", data.head())
print("\nData Info:\n", data.info())
print("\nData Description:\n", data.describe())

# Define features and target
X = data.drop('passed', axis=1)
y = data['passed']

# Preprocess the data
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Create a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', LogisticRegression())])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)



# Save the model
joblib.dump(pipeline, 'student_pass_predictor.pkl')

sample_data = pd.DataFrame({
    'sex': ['F'],
    'age': [18],
    'address': ['U'],
    'studytime': [2],
    'schoolsup': ['yes'],
    'famsup': ['no'],
    'internet': ['no']
})

print("\nSample Data:\n", sample_data)

# Load the model
model = joblib.load('student_pass_predictor.pkl')

# Predict the outcome for the sample data
sample_prediction = model.predict(sample_data)
print(sample_prediction)
print(f'\nPredicted outcome: {sample_prediction[0]}')

Data Head:
   sex  age address  studytime schoolsup famsup internet passed
0   F   18       U          2       yes     no       no     no
1   F   17       U          2        no    yes      yes     no
2   F   15       U          2       yes     no      yes    yes
3   F   15       U          3        no    yes      yes    yes
4   F   16       U          2        no    yes       no    yes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sex        395 non-null    object
 1   age        395 non-null    int64 
 2   address    395 non-null    object
 3   studytime  395 non-null    int64 
 4   schoolsup  395 non-null    object
 5   famsup     395 non-null    object
 6   internet   395 non-null    object
 7   passed     395 non-null    object
dtypes: int64(2), object(6)
memory usage: 24.8+ KB

Data Info:
 None

Data Description:
               age   studyti