In [None]:
!pip install scikit-learn

In [None]:
!pip install keras
!pip install --upgrade tensorflow

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam

In [None]:
# Gradebook CSV files to load, one DataFrame per file (paths are relative to
# the notebook's working directory).
file_paths = [
    '../gradebook_data/1.csv',
    '../gradebook_data/2.csv',
    '../gradebook_data/3.csv',
    '../gradebook_data/4.csv',
]
gradebooks = [pd.read_csv(file) for file in file_paths]

# Display only the first rows of each file. Echoing the full `gradebooks` list
# would write every student row (names and SIS identifiers included) into the
# saved notebook output. NOTE: even the previews contain a few real rows, so
# clear the output before sharing the notebook.
gradebook_previews = [df.head() for df in gradebooks]
gradebook_previews

In [None]:
# Run the same cleaning pipeline over every loaded gradebook.
identifier_columns = ["ID", "SIS User ID", "SIS Login ID", "Root Account", "Section"]
cleaned_gradebooks = []

for raw_gradebook in gradebooks:
    # Rows labelled 0 and 1 are placeholder rows, and the identifier columns
    # carry no grade information; missing cells are treated as a zero.
    trimmed = (
        raw_gradebook
        .drop(index=[0, 1])
        .drop(columns=identifier_columns)
        .fillna(0)
    )

    # The first remaining column is the student name and is kept untouched.
    # Every other column is coerced to a number, with unparseable cells
    # becoming 0.
    name_part = trimmed.iloc[:, :1]
    score_part = trimmed.iloc[:, 1:].apply(pd.to_numeric, errors='coerce').fillna(0)

    cleaned_gradebooks.append(pd.concat([name_part, score_part], axis=1))

# Preview the first few rows of each cleaned dataset
cleaned_gradebook_previews = [cleaned.head() for cleaned in cleaned_gradebooks]
cleaned_gradebook_previews

In [None]:
# Stack the cleaned per-file frames into a single table with a fresh row index
concatenated_df = pd.concat(cleaned_gradebooks, ignore_index=True)

# Keep the overall shape and a short preview in named variables, then echo
# both as the cell output
concatenated_df_shape, concatenated_df_head = concatenated_df.shape, concatenated_df.head()

(concatenated_df_shape, concatenated_df_head)


In [None]:
# Identify the column holding the final score. If there is no 'Final Score'
# column, fall back to the third column from the end (assumes the last few
# columns are the summary columns -- verify against the actual export).
final_score_column = 'Final Score' if 'Final Score' in concatenated_df.columns else concatenated_df.columns[-3]

# Label: a student is 'At_Risk' when the final score is below 60.
# Work on a copy so the new column is added to a fresh, consolidated frame.
concatenated_df = concatenated_df.copy()
concatenated_df['At_Risk'] = concatenated_df[final_score_column] < 60

# Columns that must not be fed to the model: score/grade summaries, the
# student name, and the label itself. Leaving 'At_Risk' among the features
# would leak the target into the inputs and make every metric meaningless.
non_feature_columns = [
    final_score_column, 'Current Score', 'Unposted Current Score',
    'Unposted Final Score', 'Current Grade', 'Unposted Current Grade',
    'Final Grade', 'Unposted Final Grade', 'Student', 'At_Risk',
]

# For the LSTM every row must have the same length and contain only numbers.
# Concatenating gradebooks with different columns leaves NaN where a gradebook
# lacked a column; pad those positions with zeros and cast to float32 so the
# result is a plain numeric array rather than an object array.
lstm_features = (
    concatenated_df
    .drop(columns=non_feature_columns)
    .fillna(0)
    .astype('float32')
)
max_sequence_length = lstm_features.shape[1]
lstm_features = lstm_features.to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    lstm_features, concatenated_df['At_Risk'], test_size=0.2, random_state=42
)

# Show the shapes of the training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Two stacked LSTM layers followed by a small dense head. Each feature column
# is treated as one timestep carrying a single value, hence input_shape
# (n_columns, 1). The final sigmoid unit outputs a value in (0, 1) that is
# compared against a threshold when predicting.
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(25),  # no activation argument is passed to this layer
    Dense(1, activation='sigmoid'),
])

# Binary classification set-up: cross-entropy loss, accuracy as the metric
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [None]:
# Add a trailing axis of size 1 so each 2-D (rows, columns) matrix becomes the
# 3-D (rows, columns, 1) tensor that the LSTM input layer is declared with.
X_train_reshaped = X_train[:, :, None]
X_test_reshaped = X_test[:, :, None]
X_test_reshaped.shape

In [None]:
# Cast both input tensors to float32 (previously only the test tensor was
# cast, and the training tensor was never used at all).
X_train_reshaped = X_train_reshaped.astype('float32')
X_test_reshaped = X_test_reshaped.astype('float32')

# Train the network. Without this step the model that is later passed to
# `predict` still has its random initial weights, so its predictions and all
# reported metrics would be noise.
# NOTE: re-running this cell continues training the same model; re-run the
# model-definition cell first to start from scratch.
EPOCHS = 20       # example value, tune as needed
BATCH_SIZE = 32   # example value, tune as needed
history = model.fit(
    X_train_reshaped,
    y_train.to_numpy(dtype='float32'),  # boolean labels -> 0.0 / 1.0
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.2,  # hold back part of the training rows to monitor over-fitting
)

In [None]:
# Cut-off applied to the sigmoid output; example value, adjust as needed
threshold = 0.3

# Score the test tensor, then turn each score into a 0/1 label:
# 1 where the score is strictly above the threshold, 0 otherwise.
predictions = model.predict(X_test_reshaped)
above_threshold = predictions > threshold
predicted_labels = above_threshold.astype(int)


In [None]:
# Echo the thresholded 0/1 predictions as the cell output
predicted_labels

In [None]:
!pip install seaborn

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Ground-truth labels as integers (True/False -> 1/0)
y_test_binary = y_test.astype(int)

# Classification metrics of the thresholded predictions against the truth
accuracy = accuracy_score(y_test_binary, predicted_labels)
precision = precision_score(y_test_binary, predicted_labels)
recall = recall_score(y_test_binary, predicted_labels)
f1 = f1_score(y_test_binary, predicted_labels)

for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_name, metric_value)

# Confusion matrix drawn as an annotated heatmap (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test_binary, predicted_labels)
ax = sns.heatmap(cm, annot=True, fmt="d")
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()