In [None]:
# =================================================================
# RESEARCH ANALYSIS: RAG-BASED STUDENT PERFORMANCE EVALUATION
# Target: Journal of Educational Technology / Data Science
# =================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

# --- 1. DATA LOADING & PREPROCESSING ---
# Read the student-factor table and the chatbot research log from disk.
df_students = pd.read_csv('/content/StudentPerformanceFactors.csv')
df_logs = pd.read_csv('/content/research_metrics_log.csv')

# Impute the two categorical student columns handled here: each missing
# entry is replaced by the fixed label mapped to its column.
missing_value_defaults = {
    'Teacher_Quality': 'Medium',
    'Parental_Education_Level': 'Unknown',
}
for column_name, default_label in missing_value_defaults.items():
    df_students[column_name] = df_students[column_name].fillna(default_label)

# Derive log metrics: parse the timestamp column into datetimes and count
# the whitespace-separated tokens of every answer.
df_logs['Timestamp'] = pd.to_datetime(df_logs['Timestamp'])
answer_tokens = df_logs['Answer'].str.split()
df_logs['Word_Count'] = answer_tokens.str.len()

# Report the row counts of both tables.
student_total = len(df_students)
query_total = len(df_logs)
print(f"Dataset Loaded: {student_total} student records.")
print(f"Experiments Analyzed: {query_total} unique RAG queries.")

# --- 2. DESCRIPTIVE ANALYSIS OF STUDENT FACTORS ---
# Histogram (20 bins) with a KDE overlay of the Exam_Score column, written
# to a PNG in the working directory; the figure is closed afterwards.
score_fig, score_ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    df_students['Exam_Score'],
    bins=20,
    kde=True,
    color='skyblue',
    ax=score_ax,
)
score_ax.set_title('Distribution of Student Exam Scores')
score_ax.set_xlabel('Score')
score_ax.set_ylabel('Frequency')
score_fig.savefig('student_score_distribution.png')
plt.close(score_fig)

# --- 3. RAG SYSTEM PERFORMANCE (LATENCY & EFFICIENCY) ---
# Filled kernel-density estimate of the logged Latency values, with a dashed
# red vertical line marking the median (also shown in the legend label).
latency_fig, latency_ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=df_logs, x='Latency', fill=True, color='teal', ax=latency_ax)

# The median is needed for both the line position and its label.
median_latency = df_logs['Latency'].median()
latency_ax.axvline(
    median_latency,
    color='red',
    linestyle='--',
    label=f'Median Latency: {median_latency:.2f}s',
)

latency_ax.set_title('System Latency Distribution (Llama-3.1-8b via Groq)')
latency_ax.set_xlabel('Latency (seconds)')
latency_ax.legend()
latency_fig.savefig('system_latency_distribution.png')
plt.close(latency_fig)

# --- 4. CATEGORICAL RESEARCH TAXONOMY ---
def categorize_query(q):
    """Return the research category label for a query.

    The query is converted with ``str()`` and lower-cased, then checked
    against keyword groups in a fixed order: 'Statistical', 'Analytical',
    'Demographic'. Matching is by plain substring, and the first group with
    any keyword present wins. If nothing matches, 'General' is returned.
    """
    # Order matters: earlier groups take precedence over later ones.
    keyword_rules = (
        ('Statistical', ('average', 'mean', 'score', 'percentage')),
        ('Analytical', ('correlation', 'impact', 'relationship', 'effect')),
        ('Demographic', ('gender', 'income', 'parental', 'school')),
    )
    text = str(q).lower()
    for label, keywords in keyword_rules:
        for keyword in keywords:
            if keyword in text:
                return label
    return 'General'

# Tag every logged query with the label returned by categorize_query.
df_logs['Category'] = df_logs['Query'].apply(categorize_query)

# Visualizing response depth (Word Count) by query type.
# `hue` is set to the same column as `x` so the viridis palette colours one
# box per category. Passing `palette` without `hue` is deprecated in seaborn
# 0.13 and slated for removal in 0.14; `legend=False` suppresses the legend
# that `hue` would otherwise add, keeping the figure unchanged.
plt.figure(figsize=(12, 6))
sns.boxplot(
    data=df_logs,
    x='Category',
    y='Word_Count',
    hue='Category',
    palette='viridis',
    legend=False,
)
plt.title('Answer Information Depth by Query Category')
plt.ylabel('Word Count per Response')
plt.savefig('query_category_word_count.png')
plt.close()

# --- 5. CORRELATION ANALYSIS (FOR THE PAPER) ---
# One boxplot panel per categorical factor, each showing the Exam_Score
# distribution for the levels Low / Medium / High (in that order).
factors = ['Parental_Involvement', 'Access_to_Resources', 'Motivation_Level']
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

level_order = ['Low', 'Medium', 'High']
for panel, factor in zip(axes, factors):
    sns.boxplot(
        data=df_students,
        x=factor,
        y='Exam_Score',
        order=level_order,
        ax=panel,
    )
    panel.set_title(f'Impact of {factor}')

fig.tight_layout()
fig.savefig('categorical_factors_impact.png')
plt.close(fig)

# --- 6. STATISTICAL SIGNIFICANCE TESTING ---
# Two-sample t-test: Exam_Score of students with vs. without internet access.
has_internet = df_students['Internet_Access'] == 'Yes'
lacks_internet = df_students['Internet_Access'] == 'No'
# Missing scores are dropped so a single NaN cannot turn the result into NaN.
internet_yes = df_students.loc[has_internet, 'Exam_Score'].dropna()
internet_no = df_students.loc[lacks_internet, 'Exam_Score'].dropna()

# Welch's variant (equal_var=False): the two groups are not guaranteed to
# have equal sizes or variances, which the pooled-variance Student test
# assumes.
t_stat, p_val = stats.ttest_ind(internet_yes, internet_no, equal_var=False)

print("\n--- Statistical Significance Report ---")
print(f"Internet Access group sizes: Yes = {len(internet_yes)}, No = {len(internet_no)}")
print(f"Internet Access T-Test statistic: {t_stat:.3f}")
# Scientific notation: a fixed '.4f' format renders very small p-values as
# '0.0000', which reads as an exact zero.
print(f"Internet Access T-Test p-value: {p_val:.3e}")
if np.isnan(p_val):
    # An empty group or zero variance leaves the test undefined.
    print("Result: Test could not be computed (empty group or undefined variance).")
elif p_val < 0.05:
    print("Result: Statistically Significant difference found based on Internet Access.")
else:
    print("Result: No statistically significant difference found based on Internet Access.")

# --- 7. FINAL METRICS SUMMARY FOR TABLES ---
# Per-category statistics: mean and standard deviation of Latency, mean and
# maximum of Word_Count, all rounded to two decimals.
metric_aggregations = {
    'Latency': ['mean', 'std'],
    'Word_Count': ['mean', 'max'],
}
per_category = df_logs.groupby('Category')
summary_table = per_category.agg(metric_aggregations).round(2)

print("\n--- Final System Performance Summary Table ---")
print(summary_table)

# Write the enriched log table (without the index column) to CSV for later
# use, e.g. when building LaTeX tables.
df_logs.to_csv('final_analysis_results.csv', index=False)

Dataset Loaded: 6607 student records.
Experiments Analyzed: 975 unique RAG queries.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_logs, x='Category', y='Word_Count', palette='viridis')



--- Statistical Significance Report ---
Internet Access T-Test p-value: 0.0000
Result: Statistically Significant difference found based on Internet Access.

--- Final System Performance Summary Table ---
            Latency        Word_Count      
               mean    std       mean   max
Category                                   
Analytical    11.17  15.92     394.76  1034
Demographic   10.06  14.04     437.02  6780
General       10.10  14.45     343.78  5903
Statistical   10.72  15.32     393.32  6860
