In [22]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [9]:
%pip install python-docx



## Data Loading

### Tasks:


*   Upload docx file.
*   Enter the pages where the table is located in.




In [10]:
from google.colab import files

uploaded = files.upload()

Saving testSyllabus1.docx to testSyllabus1.docx


In [11]:
start_page = input("Please enter the starting page number of the table: ")
end_page = input("Please enter the ending page number of the table: ")

Please enter the starting page number of the table: 5
Please enter the ending page number of the table: 8


## Data Preparation

### Tasks:


*   Extracting data from the docx.
*   Identify the tables from the extracted data.
*   Convert extracted data into csv.






In [12]:
import docx

file_name = list(uploaded.keys())[0]
document = docx.Document(file_name)

start_page = int(start_page)
end_page = int(end_page)

pages_content = []
current_page = 1
page_text = []
page_tables = []
found_heading = False

for paragraph in document.paragraphs:
    if "Course Outline" in paragraph.text:
        found_heading = True
        continue

    if found_heading:
        if current_page >= start_page and current_page <= end_page:
            page_text.append(paragraph.text)

for i, table in enumerate(document.tables):
    if i + 1 >= start_page and i + 1 <= end_page:
        pages_content.append(table)

display(f"Content extracted from pages {start_page} to {end_page}.")

'Content extracted from pages 5 to 8.'

In [13]:
target_table = None

for table in pages_content:
    # Assuming the first table after finding the heading within the page range is the target
    # This logic might need refinement depending on the actual document structure and how pages_content was populated
    target_table = table
    break

if target_table:
    display("Target table identified.")
else:
    display("Target table not found within the specified pages after the heading.")

'Target table identified.'

In [18]:
import pandas as pd

if target_table is not None:
    table_data = []
    for row in target_table.rows:
        row_data = []
        for cell in row.cells:
            row_data.append(cell.text)
        table_data.append(row_data)

    df = pd.DataFrame(table_data)
    display(df.head())
    df.to_csv('course_outline.csv', index=False)
else:
    display("No target table found to convert to DataFrame.")

Unnamed: 0,0,1,2,3,4,5,6
0,Week,Topics,Learning Activities,Learning Outcomes,Instructional Materials,Deliverables/\nOutcomes,Assessment
1,1,Course Outline and Class Policies\nOverview of...,Discussion\nTools demo\nCoding exercise\nLinke...,Describe the course outline and class policies...,Class orientation slides\nAPC Handbook\nLinked...,LinkedIn Learning Certificate\nBikeshare pytho...,Quiz\nCoding exercise
2,2,Linear Regression\nCategorical Independent Var...,Code demo\nComputer simulation,Determine the best-fit linear model to a given...,Linear Models slides\nCode samples\n\n,Linear regression model coding assignment,Coding exercise
3,3,Logistic Regression\nProject Overview\nProject...,Discussion\nLinkedIn Learning video viewing on...,Create a logistic regression model given a dat...,Logistic Regression slides\nLinkedIn Learning ...,Logistic regression model coding assignment\nP...,Quiz\nCoding exercise\nProject Deliverable 1
4,4-6,Other nonlinear regression models\nProject Del...,LinkedIn Learning course completion\nLinkedIn ...,Choose an appropriate method in creating a bes...,LinkedIn Learning videos on nonlinear regressi...,Preprocessed project data set\nProject Deliver...,Coding exercise\nQuiz


## Analysis and Scoring

### Tasks:


*   Anaylyse for Bloom's Taxonomy and Semantic Similarity between the three columns.
*   Score them based on the usage of the Bloom's Taxonomy Verbs and Semantic Similarity between the three columns using spaCy.
*   Calculate for OBE Alignment.
*   Categorize them as Misaligned, Slightly-aligned, and Aligned.
*   Save data inside a csv file.







In [20]:
import pandas as pd
import spacy

# Load the small English model for Bloom's verb identification
try:
    nlp_verbs = spacy.load("en_core_web_sm")
except:
    print("Error loading en_core_web_sm model.")
    nlp_verbs = None

# Define the Bloom's verb dictionary
bloom_verbs = {
    "Remembering": ["define", "list", "recall", "identify", "name"],
    "Understanding": ["explain", "describe", "summarize", "interpret"],
    "Applying": ["apply", "use", "solve", "demonstrate", "implement"],
    "Analyzing": ["analyze", "compare", "contrast", "differentiate", "examine"],
    "Evaluating": ["evaluate", "critique", "judge", "recommend", "assess"],
    "Creating": ["create", "design", "develop", "formulate", "generate"]
}

def identify_bloom_levels(text):
    """Identifies Bloom's Taxonomy levels in text using spaCy."""
    if nlp_verbs is None or pd.isna(text):
        return []
    doc = nlp_verbs(str(text))
    found_levels = set()
    for token in doc:
        if token.pos_ == "VERB":
            for level, verbs in bloom_verbs.items():
                if token.lemma_.lower() in verbs:
                    found_levels.add(level)
    return list(found_levels)

# Define a mapping for Bloom's levels to scores (1-5)
bloom_score_map = {
    "Remembering": 1,
    "Understanding": 2,
    "Applying": 3,
    "Analyzing": 4,
    "Evaluating": 5,
    "Creating": 5 # You might consider a higher score for Creating, but 5 is the max requested
}

def score_blooms_levels(bloom_levels):
    """
    Calculates a single Bloom's Taxonomy score from a list of identified levels.
    Returns the maximum score among the identified levels.
    """
    if not bloom_levels:
        return 0 # Return 0 if no Bloom's levels are identified

    scores = [bloom_score_map.get(level, 0) for level in bloom_levels]
    return max(scores) if scores else 0


try:
    # Load the generated CSV file
    df_course_outline = pd.read_csv('course_outline.csv')

    # Identify the correct column names based on the first row of the DataFrame
    header = df_course_outline.iloc[0].tolist()
    df_course_outline.columns = header
    df_course_outline = df_course_outline[1:].reset_index(drop=True)

    # Define the target column names
    target_columns = ['Learning Outcomes', 'Deliverables/\nOutcomes', 'Assessment']

    # Check if all target columns exist in the DataFrame
    if all(col in df_course_outline.columns for col in target_columns):
        # Select the target columns
        df_parsed = df_course_outline[target_columns].copy() # Use .copy() to avoid SettingWithCopyWarning

        # 1. Check for Bloom's Taxonomy levels using spaCy and Score it
        df_parsed['Learning Outcome Levels'] = df_parsed['Learning Outcomes'].apply(identify_bloom_levels)
        df_parsed['Learning Outcome Bloom Score'] = df_parsed['Learning Outcome Levels'].apply(score_blooms_levels)

        df_parsed['Deliverable Levels'] = df_parsed['Deliverables/\nOutcomes'].apply(identify_bloom_levels)
        df_parsed['Deliverable Bloom Score'] = df_parsed['Deliverable Levels'].apply(score_blooms_levels)

        df_parsed['Assessment Levels'] = df_parsed['Assessment'].apply(identify_bloom_levels)
        df_parsed['Assessment Bloom Score'] = df_parsed['Assessment Levels'].apply(score_blooms_levels)


        display("Successfully extracted specified columns and applied Bloom's Taxonomy analysis and scoring.")
    else:
        display("One or more specified columns were not found in 'course_outline.csv'.")

except FileNotFoundError:
    display("Error: 'course_outline.csv' not found. Please run the previous steps to generate it.")
except Exception as e:
    display(f"An error occurred: {e}")

"Successfully extracted specified columns and applied Bloom's Taxonomy analysis and scoring."

In [23]:
import spacy

# Load a larger English model with word vectors
try:
    nlp_similarity = spacy.load("en_core_web_md")
except:
    # Fallback to a smaller model if the larger one is not available
    nlp_similarity = spacy.load("en_core_web_sm")
    print("Warning: Using a smaller model. Semantic similarity results may be less accurate.")


# Function to calculate similarity between two texts
def calculate_similarity(text1, text2):
    if pd.isna(text1) or pd.isna(text2):
        return 0.0  # Return 0 similarity for missing values
    doc1 = nlp_similarity(str(text1)) # Convert to string to handle potential non-string entries
    doc2 = nlp_similarity(str(text2))
    # Calculate similarity between the two documents
    return doc1.similarity(doc2)

# Function to score semantic similarity from 1-10
def score_semantic_similarity(similarity_score):
    """
    Scales a semantic similarity score (0-1) to a whole number score (1-10).
    Adds 1 to ensure the minimum score is 1.
    """
    # Scale the score from 0-1 to 0-9, then add 1 to make it 1-10
    return int(round(similarity_score * 9)) + 1

# Apply the function to calculate similarity between 'Learning Outcomes' and 'Deliverables/Outcomes'
df_parsed['Outcome-Deliverable Similarity'] = df_parsed.apply(
    lambda row: calculate_similarity(row['Learning Outcomes'], row['Deliverables/\nOutcomes']),
    axis=1
)

# Apply the function to calculate similarity between 'Learning Outcomes' and 'Assessment'
df_parsed['Outcome-Assessment Similarity'] = df_parsed.apply(
    lambda row: calculate_similarity(row['Learning Outcomes'], row['Assessment']),
    axis=1
)

# Apply the function to calculate similarity between 'Deliverables/Outcomes' and 'Assessment'
df_parsed['Deliverable-Assessment Similarity'] = df_parsed.apply(
    lambda row: calculate_similarity(row['Deliverables/\nOutcomes'], row['Assessment']),
    axis=1
)

# Apply the scoring function to the similarity columns
df_parsed['Outcome-Deliverable Alignment Score'] = df_parsed['Outcome-Deliverable Similarity'].apply(score_semantic_similarity)
df_parsed['Outcome-Assessment Alignment Score'] = df_parsed['Outcome-Assessment Similarity'].apply(score_semantic_similarity)
df_parsed['Deliverable-Assessment Alignment Score'] = df_parsed['Deliverable-Assessment Similarity'].apply(score_semantic_similarity)


display(df_parsed[['Learning Outcomes', 'Deliverables/\nOutcomes', 'Assessment',
                   'Outcome-Deliverable Similarity', 'Outcome-Deliverable Alignment Score',
                   'Outcome-Assessment Similarity', 'Outcome-Assessment Alignment Score',
                   'Deliverable-Assessment Similarity', 'Deliverable-Assessment Alignment Score']].head())

Unnamed: 0,Learning Outcomes,Deliverables/\nOutcomes,Assessment,Outcome-Deliverable Similarity,Outcome-Deliverable Alignment Score,Outcome-Assessment Similarity,Outcome-Assessment Alignment Score,Deliverable-Assessment Similarity,Deliverable-Assessment Alignment Score
0,Describe the course outline and class policies...,LinkedIn Learning Certificate\nBikeshare pytho...,Quiz\nCoding exercise,0.5122,6,0.726421,8,0.462354,5
1,Determine the best-fit linear model to a given...,Linear regression model coding assignment,Coding exercise,0.781817,8,0.57162,6,0.599752,6
2,Create a logistic regression model given a dat...,Logistic regression model coding assignment\nP...,Quiz\nCoding exercise\nProject Deliverable 1,0.878486,9,0.76347,8,0.76192,8
3,Choose an appropriate method in creating a bes...,Preprocessed project data set\nProject Deliver...,Coding exercise\nQuiz,0.802707,8,0.665184,7,0.618087,7
4,Explore input data for analysis\nApply basic f...,LinkedIn Learning Certificate,In-video quizzes\nProblem solving exercise,0.608834,6,0.717426,7,0.433483,5


In [36]:
# Calculate a combined OBE alignment score
# This is a simple average of the three semantic similarity alignment scores
df_parsed['Combined Semantic Alignment Score'] = df_parsed[[
    'Outcome-Deliverable Alignment Score',
    'Outcome-Assessment Alignment Score',
    'Deliverable-Assessment Alignment Score'
]].mean(axis=1).apply(lambda x: int(x + 0.5)) # Round up and convert to integer

# You could also consider incorporating the Bloom's scores into a combined score
# For example, a weighted average or a separate indicator

def classify_alignment(score):
    """Classifies the combined alignment score into categories."""
    if score >= 1 and score <= 3:
        return "Misaligned"
    elif score >= 4 and score <= 6:
        return "Slightly Aligned"
    elif score >= 7 and score <= 10:
        return "Aligned"
    else:
        return "N/A" # Handle potential unexpected scores

# Apply the classification function to create the new column
df_parsed['Alignment Category'] = df_parsed['OBE Alignment Score'].apply(classify_alignment)

def calculate_overall_obe_alignment_score(row):
    """
    Calculates an overall OBE alignment score by combining Bloom's and semantic scores.
    (Using a simple average for demonstration)
    """
    bloom_score = row['Learning Outcome Bloom Score']
    combined_semantic_score = row['Combined Semantic Alignment Score']

    # You might adjust the scaling or weighting here if needed
    # For a simple average, ensure scores are on a comparable scale.
    # Since Bloom's is 1-5 and Semantic is 1-10, we can normalize or adjust.
    # Let's scale Bloom's score to 1-10 for a simple average with the semantic score.
    scaled_bloom_score = bloom_score * 2 if bloom_score > 0 else 1 # Scale 1-5 to 2-10, 0 becomes 1


    # Simple average of scaled Bloom's and Combined Semantic scores
    overall_score = (scaled_bloom_score + combined_semantic_score) / 2

    # Round up to the nearest whole number as requested for previous scores
    return int(overall_score + 0.5)

# Apply the function to calculate the overall OBE alignment score
df_parsed['OBE Alignment Score'] = df_parsed.apply(calculate_overall_obe_alignment_score, axis=1)


display(df_parsed[['Learning Outcomes', 'Deliverables/\nOutcomes', 'Assessment',
                   'Learning Outcome Bloom Score',
                   'Combined Semantic Alignment Score', 'OBE Alignment Score',
                   'Alignment Category']].head())

# Define the desired column order with 'Alignment Category' at the end
desired_column_order = [col for col in df_parsed.columns if col != 'Alignment Category'] + ['Alignment Category']

# Reindex the DataFrame with the desired column order
df_parsed = df_parsed.reindex(columns=desired_column_order)

# Save the df_parsed DataFrame to a CSV file
df_parsed.to_csv('scored_data.csv', index=False)

display("DataFrame saved to 'scored_data.csv'")

Unnamed: 0,Learning Outcomes,Deliverables/\nOutcomes,Assessment,Learning Outcome Bloom Score,Combined Semantic Alignment Score,OBE Alignment Score,Alignment Category
0,Describe the course outline and class policies...,LinkedIn Learning Certificate\nBikeshare pytho...,Quiz\nCoding exercise,3,6,6,Slightly Aligned
1,Determine the best-fit linear model to a given...,Linear regression model coding assignment,Coding exercise,2,7,6,Slightly Aligned
2,Create a logistic regression model given a dat...,Logistic regression model coding assignment\nP...,Quiz\nCoding exercise\nProject Deliverable 1,5,8,9,Aligned
3,Choose an appropriate method in creating a bes...,Preprocessed project data set\nProject Deliver...,Coding exercise\nQuiz,5,7,9,Aligned
4,Explore input data for analysis\nApply basic f...,LinkedIn Learning Certificate,In-video quizzes\nProblem solving exercise,4,6,7,Aligned


"DataFrame saved to 'scored_data.csv'"

In [37]:
import pandas as pd

# Load the data from the CSV file
df_scored = pd.read_csv('scored_data.csv')


Unnamed: 0,Learning Outcomes,Deliverables/\nOutcomes,Assessment,Learning Outcome Levels,Learning Outcome Bloom Score,Deliverable Levels,Assessment Levels,Outcome-Deliverable Similarity,Outcome-Assessment Similarity,Deliverable-Assessment Similarity,Outcome-Deliverable Alignment Score,Outcome-Assessment Alignment Score,Deliverable-Assessment Alignment Score,Combined Semantic Alignment Score,OBE Alignment Score,Alignment Category
0,Describe the course outline and class policies...,LinkedIn Learning Certificate\nBikeshare pytho...,Quiz\nCoding exercise,"['Remembering', 'Applying', 'Understanding']",3,[],[],0.5122,0.726421,0.462354,6,8,5,6,6,Slightly Aligned
1,Determine the best-fit linear model to a given...,Linear regression model coding assignment,Coding exercise,['Understanding'],2,[],[],0.781817,0.57162,0.599752,8,6,6,7,6,Slightly Aligned
2,Create a logistic regression model given a dat...,Logistic regression model coding assignment\nP...,Quiz\nCoding exercise\nProject Deliverable 1,"['Creating', 'Understanding']",5,[],[],0.878486,0.76347,0.76192,9,8,8,8,9,Aligned
3,Choose an appropriate method in creating a bes...,Preprocessed project data set\nProject Deliver...,Coding exercise\nQuiz,"['Creating', 'Understanding']",5,[],[],0.802707,0.665184,0.618087,8,7,7,7,9,Aligned
4,Explore input data for analysis\nApply basic f...,LinkedIn Learning Certificate,In-video quizzes\nProblem solving exercise,"['Applying', 'Analyzing']",4,[],['Applying'],0.608834,0.717426,0.433483,6,7,5,6,7,Aligned


## Gemini Integration for Summary and Suggestions

### Task:
* Use Gemini to provide insights on the data.

In [38]:
# Import the Python SDK
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata

try:
    GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
except:
    print("Please set your GOOGLE_API_KEY in the Colab secrets panel.")

In [49]:
# Initialize the Gemini API
try:
    gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest') # Using 'gemini-1.5-flash-latest' as requested
except Exception as e:
    print(f"Error initializing Gemini model: {e}")
    gemini_model = None

In [None]:
if gemini_model and not df_scored.empty:
    # Choose a row to get an explanation for (e.g., the first row)
    row_index = 0
    row_data = df_scored.iloc[row_index]

    # Access the value for the column with the backslash separately
    deliverables_outcomes_text = row_data['Deliverables/\nOutcomes']

    # Prepare the prompt for the Gemini model
    prompt = f"""
    Analyze the following data from a course outline and explain the scoring for OBE alignment.
    Provide suggestions on how to improve the alignment based on the scores.

    Learning Outcome: {row_data['Learning Outcomes']}
    Deliverable/Outcome: {deliverables_outcomes_text}
    Assessment: {row_data['Assessment']}
    Learning Outcome Bloom Score (1-5): {row_data['Learning Outcome Bloom Score']}
    Combined Semantic Alignment Score (1-10): {row_data['Combined Semantic Alignment Score']}
    OBE Alignment Score (Combined Bloom's and Semantic, 1-10): {row_data['OBE Alignment Score']}
    Alignment Category: {row_data['Alignment Category']}

    Explanation and Suggestions:
    """

    try:
        # Generate content using the Gemini model
        gemini_response = gemini_model.generate_content(prompt).text
        print("Gemini response generated.")
        print("-" * 30)
        print(gemini_response)
        print("-" * 30)

    except Exception as e:
        print(f"Error generating content from Gemini model: {e}")
        gemini_response = "Error generating response from Gemini."
else:
    print("Gemini model not initialized or df_scored is empty.")
    gemini_response = "Could not generate Gemini response."

## Output Into docx

### Task:
* Put the data of "scored_data.csv" and the Gemini response inside a docx file.

In [45]:
import docx

# Create a new Word document
document = docx.Document()

# Add a heading for the data table
document.add_heading('Scored Course Outline Data', 0)

# Add the DataFrame as a table to the document
# We need to iterate through the DataFrame and add rows/cells to the Word table
table = document.add_table(rows=1, cols=len(df_scored.columns))
table.style = 'Table Grid' # Apply a style for better readability

# Add table headers
for i, col_name in enumerate(df_scored.columns):
    table.cell(0, i).text = col_name

# Add data rows
for index, row in df_scored.iterrows():
    cells = table.add_row().cells
    for i, col_name in enumerate(df_scored.columns):
        cells[i].text = str(row[col_name]) # Convert data to string


# Add a heading for the Gemini response
document.add_heading('Gemini Analysis and Suggestions', 0)

# Add the Gemini response text to the document
# Split the response into paragraphs if it contains multiple lines
for paragraph_text in gemini_response.split('\n'):
    document.add_paragraph(paragraph_text)


# Define the output file name
output_filename = "output.docx"

# Save the document
document.save(output_filename)

display(f"Word document '{output_filename}' created successfully.")

"Word document 'output.docx' created successfully."

## Summary:

### Data Analysis Key Findings

*   Markdown headings were successfully added to organize the code into the specified sections: "Data Loading and Preparation", "Analysis and Scoring", "Gemini API Integration", and "Word Document Creation".

### Insights or Next Steps

*   The next step is to execute the code cells under each heading to perform the data extraction, analysis, scoring, API integration, and document creation as outlined in the original task.
