In [None]:
# %pip install pandas transformers spacy torch joblib
# spacy.cli.download("en_core_web_sm")  # Ensure the model is downloaded

In [1]:
import json
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import spacy
import glob
import os
import torch

# Pick the device index handed to the Hugging Face pipelines:
# 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Sentiment classifier used to label each question
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    device=device,
)

# Zero-shot classifier used to score how well a question matches its topic
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)

# spaCy English model used for the token-based clarity/complexity counts
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu
Device set to use cpu


In [2]:
def analyze_sentiment(question):
    """Classify the sentiment of a question.

    Args:
        question: Text to classify.

    Returns:
        A ``(label, score)`` tuple taken from the first prediction returned
        by ``sentiment_pipeline``.
    """
    # truncation=True makes the tokenizer cut inputs that exceed the model's
    # maximum sequence length instead of letting the forward pass fail on
    # very long questions (e.g. ones that embed large code snippets).
    prediction = sentiment_pipeline(question, truncation=True)[0]
    return prediction['label'], prediction['score']

def check_relevance(question, topic):
    """Score a question against a single candidate topic.

    The zero-shot ``classifier`` is called with ``topic`` as its only
    candidate label.

    Args:
        question: Text to classify.
        topic: Candidate label to score the question against.

    Returns:
        A ``(label, score)`` tuple: the first label and the first score of
        the classifier result.
    """
    prediction = classifier(question, [topic])
    return prediction['labels'][0], prediction['scores'][0]

def evaluate_clarity(question):
    """Count tokens and "complex" words in a question.

    A word counts as complex when it is purely alphabetic and longer than
    six characters.

    Args:
        question: Text to analyse with the spaCy ``nlp`` pipeline.

    Returns:
        A ``(num_tokens, num_complex_words)`` tuple.
    """
    parsed = nlp(question)
    long_words = [tok for tok in parsed if tok.is_alpha and len(tok.text) > 6]
    return len(parsed), len(long_words)

def evaluate_question(question, topic):
    """Score a question for sentiment, topical relevance and clarity.

    Args:
        question: The question text.
        topic: Candidate topic the question is scored against.

    Returns:
        A dict with the keys ``question``, ``sentiment``, ``sentiment_score``,
        ``topic``, ``relevance_score``, ``num_tokens``, ``num_complex_words``,
        ``clarity_score`` and ``effectiveness_score``. ``effectiveness_score``
        is the mean of ``relevance_score`` and ``clarity_score``; the
        sentiment values are reported but do not contribute to it.
    """
    sentiment, sentiment_score = analyze_sentiment(question)
    topic, relevance_score = check_relevance(question, topic)
    num_tokens, num_complex_words = evaluate_clarity(question)

    # Clarity is the share of tokens that are NOT complex words. A question
    # that yields no tokens (e.g. an empty string) has nothing to measure, so
    # it is scored 0.0 instead of raising ZeroDivisionError.
    if num_tokens > 0:
        clarity_score = 1 - (num_complex_words / num_tokens)
    else:
        clarity_score = 0.0
    effectiveness_score = (relevance_score + clarity_score) / 2

    return {
        "question": question,
        "sentiment": sentiment,
        "sentiment_score": sentiment_score,
        "topic": topic,
        "relevance_score": relevance_score,
        "num_tokens": num_tokens,
        "num_complex_words": num_complex_words,
        "clarity_score": clarity_score,
        "effectiveness_score": effectiveness_score
    }

In [3]:
# Questions are kept only when their effectiveness score is strictly greater than this value
THRESHOLD_VALUE = 0.7

# Directory containing the JSONL files
jsonl_directory = 'coding_jsonl'

# Files whose *name* contains any of these keywords are excluded
exclude_keywords = ['certificate', 'projects', 'rosetta']

# Collect the JSONL files to process.
# - The keyword check looks at the file name only, so a keyword that happens
#   to appear in the directory path cannot exclude every file.
# - glob returns matches in arbitrary order; sorting keeps the processing
#   order (and therefore the output file) reproducible between runs.
jsonl_files = sorted(
    f for f in glob.glob(f"{jsonl_directory}/*.jsonl")
    if not any(keyword in os.path.basename(f) for keyword in exclude_keywords)
)

# Output file
output_file = 'all_data.jsonl'

# Open (and truncate) the output file that the processing cell writes to.
# The handle stays open across cells and must be closed once processing is done.
# An explicit encoding avoids depending on the platform default.
output_file_handle = open(output_file, 'w', encoding='utf-8')

In [None]:
# Function to validate the structure of the DataFrame
def is_valid_dataframe(df):
    """Check that every row of ``df`` holds a usable conversation.

    The frame is valid when it has a ``messages`` column and every entry in
    that column is a list containing at least one message with role ``user``
    and at least one with role ``assistant``.

    Args:
        df: DataFrame to validate.

    Returns:
        ``True`` when all rows pass the checks above, ``False`` otherwise.
    """
    if 'messages' not in df.columns:
        return False

    def _has_both_roles(conversation):
        # Anything that is not a list (e.g. a missing value) fails validation.
        if not isinstance(conversation, list):
            return False
        seen_roles = [entry.get('role') for entry in conversation]
        return 'user' in seen_roles and 'assistant' in seen_roles

    return all(_has_both_roles(conversation) for conversation in df['messages'])

n = len(jsonl_files)

# Iterate over each JSONL file
for i, filename in enumerate(jsonl_files, start=1):

    print(f"[INFO] START PROCESSING FILE {i}/{n}: {filename}")
    # The topic used for relevance scoring is derived from the file name,
    # e.g. "advanced-express-tools.jsonl" -> "advanced express tools".
    topic = os.path.basename(filename).replace('.jsonl', '').replace('-', ' ')

    # Read one JSON record per line. The encoding is explicit so that the
    # platform default (e.g. cp1252 on Windows) cannot mis-decode UTF-8 text.
    data = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Blank lines are not records; skip them silently.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file {filename}: {e}")
                continue

    df = pd.json_normalize(data)

    # Validate the DataFrame
    if not is_valid_dataframe(df):
        print(f"Skipping file {filename} due to invalid structure.")
        continue

    # Extract the first user message and the first assistant message of each
    # conversation. `.get` mirrors the validation step, so a message without
    # a 'role' or 'content' key is ignored instead of raising KeyError.
    # Both lists are appended together, so they always have equal length.
    questions = []
    answers = []
    for messages in df['messages']:
        user_message = next(
            (msg.get('content') for msg in messages if msg.get('role') == 'user'), None
        )
        assistant_message = next(
            (msg.get('content') for msg in messages if msg.get('role') == 'assistant'), None
        )
        if user_message and assistant_message:
            questions.append(user_message)
            answers.append(assistant_message)

    qna_df = pd.DataFrame({'question': questions, 'answer': answers})

    # Evaluate each question sequentially
    evaluations = [evaluate_question(question, topic) for question in qna_df['question']]

    # Attach evaluations to the DataFrame
    qna_df['evaluation'] = evaluations

    # Filter rows with effectiveness score greater than THRESHOLD_VALUE
    effectiveness = qna_df['evaluation'].apply(lambda x: x['effectiveness_score'])
    filtered_qna_df = qna_df[effectiveness > THRESHOLD_VALUE]
    print(f"[INFO] {len(filtered_qna_df)}/{len(qna_df)} questions were evaluated as effective")

    # Append valid Q&A pairs to the output file
    for question, answer in zip(filtered_qna_df['question'], filtered_qna_df['answer']):
        valid_record = {
            "messages": [
                {"role": "system", "content": "You are an expert teacher in coding skills for full-stack coding students."},
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer}
            ]
        }
        output_file_handle.write(json.dumps(valid_record) + '\n')

    # Push this file's records to disk now, so an interrupted run does not
    # lose the results that were already computed.
    output_file_handle.flush()

    print(f"[INFO] FINISH PROCESSING FILE {i}/{n}: {filename}")
    print('------------------------------------------------------------------------------------\n')

[INFO] START PROCESSING FILE 1/33: coding_jsonl\advanced-express-tools.jsonl
[INFO] 145/812 questions were evaluated as effective
[INFO] FINISH PROCESSING FILE 1/33: coding_jsonl\advanced-express-tools.jsonl
------------------------------------------------------------------------------------

[INFO] START PROCESSING FILE 2/33: coding_jsonl\algorithms.jsonl
[INFO] 62/104 questions were evaluated as effective
[INFO] FINISH PROCESSING FILE 2/33: coding_jsonl\algorithms.jsonl
------------------------------------------------------------------------------------

[INFO] START PROCESSING FILE 3/33: coding_jsonl\applied-accessibility.jsonl


In [None]:
# Close the output file handle so that any buffered records are written to
# disk and the handle is released. Run this cell after the processing loop
# has finished.
output_file_handle.close()
print(f"[INFO] Filtered data has been written to {output_file}")