In [20]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC 

In [21]:
csv_path = Path("data.csv")
data = pd.read_csv(csv_path)

In [22]:
data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [23]:
data['Target'].value_counts()

Target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64

In [24]:
X = data.copy()
X = data.drop(columns = 'Target')

In [25]:
y = data['Target']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [28]:
scaler = StandardScaler()

In [29]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [30]:
logistic_regression_model = LogisticRegression()

In [31]:
logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
train_score = logistic_regression_model.score(X_train_scaled, y_train)
test_score = logistic_regression_model.score(X_test_scaled, y_test)



In [33]:
print(f"Training Accuracy: {train_score:.4f}")
print(f"Testing Accuracy: {test_score:.4f}")

Training Accuracy: 0.7131
Testing Accuracy: 0.6980


In [34]:
y_prediction = logistic_regression_model.predict(X_test_scaled)

results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_prediction})
print(results_df.head(10))

        Actual Predicted
1255   Dropout   Dropout
3458  Graduate  Graduate
3390  Graduate  Graduate
1497  Graduate  Graduate
1536   Dropout   Dropout
287   Graduate   Dropout
3416  Enrolled  Graduate
1366   Dropout  Graduate
3926  Graduate  Graduate
1055  Graduate  Graduate




In [35]:
acc = accuracy_score(y_test, y_prediction)
print(f'Model Accuracy: {acc:.4f}')

Model Accuracy: 0.6980


In [None]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the data
csv_path = Path("data.csv")
df = pd.read_csv(csv_path)

# Rename the "Daytime/evening attendance" column to remove the space
df.rename(columns={'Daytime/evening attendance ': 'Daytime/evening attendance'}, inplace=True)

# Data Cleaning and Preprocessing
# Check for missing values
print("Missing values before imputation:\n", df.isnull().sum())

# Impute missing values using the mean for numeric columns
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].mean())

# Impute missing values using the mode for object columns
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].fillna(df[col].mode()[0])

print("Missing values after imputation:\n", df.isnull().sum())

# Check data types of each column
print("Data types of each column:\n", df.dtypes)

# Convert columns to appropriate data types (if needed)
for col in df.columns:
    if df[col].dtype == 'object':
        try:
            df[col] = pd.to_numeric(df[col])
        except ValueError:
            print(f"Could not convert {col} to numeric: {col}")

# Encode the target variable
encoder = LabelEncoder()
df['Target'] = encoder.fit_transform(df['Target'])

# Convert categorical features to dummy variables
categorical_cols = ['Marital status', 'Application mode', 'Course', 'Daytime/evening attendance\t', 'Previous qualification', "Nacionality", "Mother's qualification", "Father's qualification", "Gender", "Scholarship holder", "International"]
df = pd.get_dummies(df, columns=categorical_cols, dummy_na=False)

# Separate features (X) and target (y)
X = df.drop("Target", axis=1)
y = df["Target"]

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt']
}

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Perform hyperparameter tuning using RandomizedSearchCV
rand_search = RandomizedSearchCV(estimator=rf,
                                 param_distributions=param_grid,
                                 cv=5,
                                 n_iter=10,
                                 random_state=42,
                                 error_score='raise')

# Perform the search on the training data
rand_search.fit(X_train, y_train)

# Get the best model and its parameters
best_rf = rand_search.best_estimator_
best_params = rand_search.best_params_

# Evaluate the best model on the test set
y_pred = best_rf.predict(X_test)

# Print the results
print("Best Parameters:", best_params)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Missing values before imputation:
 Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance\t                      0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship holder           