Project 1: Text Complexity and Readability Evaluation of Generated Text

Objective: Evaluate the readability of text generated by the Markov Chain and compare it to that of human-generated text. 
Process
Use Markov Chains to generate text based on different training data sets.
Calculate readability scores (e.g., Flesch-Kincaid, Gunning Fog Index) for both generated and real texts.
Compare readability across text samples to see if your generator can mimic the complexity of real-world text.
Evaluation:
Plot readability scores and analyze them statistically.
Assess the variance in complexity between generated and original texts for diverse data sources.

In [107]:
%%writefile MarkovChain.py
import streamlit as st
import pandas as pd
import numpy as np
import random
import re
import textstat
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import ttest_ind
from io import StringIO

# --- Page Configuration ---
st.set_page_config(
    page_title="Text Complexity & Readability Evaluation",
    layout="wide",
)

# --- Custom CSS for styling ---
# Background colour for the button in the 1st..4th child of a horizontal block.
_NAV_BUTTON_COLORS = ("#ff4b4b", "#4bff4b", "#4b4bff", "#ffa500")

# One CSS rule per colour; nth-child positions are 1-based.
_nav_button_rules = "".join(
    f'    div[data-testid="stHorizontalBlock"] > div:nth-child({position}) button {{\n'
    f"      background-color: {color};\n"
    "      color: white;\n"
    "    }\n"
    for position, color in enumerate(_NAV_BUTTON_COLORS, start=1)
)

# Full <style> element: app background first, then the per-column button rules.
_page_css = (
    "\n"
    "    <style>\n"
    "    .stApp {\n"
    "        background-color: #f5f5f5;\n"
    "    }\n"
    "    /* Custom button colors using CSS selectors on the horizontal block */\n"
    f"{_nav_button_rules}"
    "    </style>\n"
    "    "
)
st.markdown(_page_css, unsafe_allow_html=True)

# --- Sidebar Options ---
# Collect the source text either from an uploaded .txt file or a paste box.
# After this section `raw_text` is a str, or None when no file has been
# uploaded yet (an untouched paste box yields an empty string).
st.sidebar.header("Configuration Options")
input_method = st.sidebar.radio("Select Input Method", ("Upload Text File", "Paste Text"))

if input_method == "Upload Text File":
    uploaded_file = st.sidebar.file_uploader("Upload a .txt file", type=["txt"])
    if uploaded_file is not None:
        # getvalue() returns the whole buffer regardless of the read position.
        file_bytes = uploaded_file.getvalue()
        try:
            # utf-8-sig is plain UTF-8 that also drops a leading byte-order mark.
            raw_text = file_bytes.decode("utf-8-sig")
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this cannot fail;
            # the result may contain wrong glyphs, hence the warning.
            raw_text = file_bytes.decode("latin-1")
            st.sidebar.warning(
                "The file is not valid UTF-8; it was decoded as Latin-1 and "
                "some characters may be wrong."
            )
    else:
        raw_text = None
else:
    raw_text = st.sidebar.text_area("Paste your text here", height=200)

# --- Control Cover Page Display ---
cover_image_url = "https://fiverr-res.cloudinary.com/images/q_auto,f_auto/gigs/331947776/original/81fbd94368e4ecdeaa7d502528ab7cebb7a6df58/convert-ai-generated-text-into-authentic-human-writing.png"

# The flag starts out False and is switched to True the first time any text
# is present; nothing in this section ever sets it back to False.
if raw_text:
    st.session_state["data_uploaded"] = True
elif "data_uploaded" not in st.session_state:
    st.session_state["data_uploaded"] = False

# Render the cover (image with a title and description overlaid) only while
# the flag is still False.
if not st.session_state["data_uploaded"]:
    cover_html = f"""
    <div style="position: relative; text-align: left; color: white;">
      <img src="{cover_image_url}" alt="Cover Image" style="width: 100%; opacity: 0.8;">
      <div style="position: absolute; top: 10%; left: 5%; transform: none;">
        <h1 style="font-size: 3em; margin: 0; color: red;">Text Complexity & Readability Evaluation</h1>
        <p style="font-size: 1.5em; margin-top: 10px; max-width: 90%; line-height: 1.4; color: white;">
          This project evaluates the complexity and readability of text generated by a Markov Chain model versus human-generated text. 
          Adjust the n-gram order, number of generated words, and number of samples. Explore various readability metrics through interactive visualizations.
        </p>
      </div>
    </div>
    """
    st.markdown(cover_html, unsafe_allow_html=True)

# --- Additional Configuration Options ---
# These widgets are created unconditionally, so they appear in the sidebar
# whether or not any text has been supplied yet.
ngram_order = st.sidebar.slider(
    "Choose n-gram order", min_value=2, max_value=4, value=2, step=1
)
num_words_to_generate = st.sidebar.slider(
    "Words to generate", min_value=20, max_value=200, value=50, step=10
)
num_samples = st.sidebar.slider(
    "Number of samples", min_value=3, max_value=20, value=5, step=1
)
show_advanced_metrics = st.sidebar.checkbox(
    "Show Advanced Readability Metrics", value=True
)

# --- Helper Functions ---
def build_ngram_model(text, n):
    """Map every n-word window of ``text`` to the words seen right after it.

    The text is reduced to ASCII letters and whitespace, lower-cased and
    split on whitespace before the windows are taken.

    Args:
        text: Source text to learn from.
        n: Number of consecutive words that form one key.

    Returns:
        A dict whose keys are n-tuples of words and whose values are lists of
        following words, in order of appearance and with repeats kept.
    """
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = letters_only.lower().split()
    transitions = {}
    # The last usable window is the one that still has a word after it.
    for start in range(len(tokens) - n):
        stop = start + n
        state = tuple(tokens[start:stop])
        transitions.setdefault(state, []).append(tokens[stop])
    return transitions

def generate_text_from_model(model, start_tuple, num_words):
    """Walk the n-gram model from ``start_tuple`` and return the words joined by spaces.

    Args:
        model: Dict mapping word tuples to lists of possible next words.
        start_tuple: Tuple of words that opens the output and seeds the walk.
        num_words: Target length of the output in words, counting the seed.

    Returns:
        The generated words as one space-separated string. The walk ends
        early when the current state has no recorded successors, and the seed
        is always emitted in full even if it is longer than ``num_words``.
    """
    words = [*start_tuple]
    state = start_tuple
    steps_allowed = num_words - len(state)
    for _ in range(steps_allowed):
        candidates = model.get(state)
        if not candidates:
            # Dead end: this state was never followed by anything.
            break
        words.append(random.choice(candidates))
        # Slide the window forward by one word, keeping its width.
        state = tuple(words[-len(state):])
    return " ".join(words)

def calculate_readability(text, advanced=False):
    """Score ``text`` with several textstat readability formulas (SMOG is not included).

    Args:
        text: The text to score.
        advanced: When true, also compute Coleman-Liau, Linsear Write and
            Dale Chall in addition to the three base metrics.

    Returns:
        A dict mapping metric label to the value textstat returned, with the
        base metrics first and any advanced metrics after them.
    """
    scorers = [
        ("Flesch-Kincaid", textstat.flesch_kincaid_grade),
        ("Gunning Fog", textstat.gunning_fog),
        ("ARI", textstat.automated_readability_index),
    ]
    if advanced:
        scorers += [
            ("Coleman-Liau", textstat.coleman_liau_index),
            ("Linsear Write", textstat.linsear_write_formula),
            ("Dale Chall", textstat.dale_chall_readability_score),
        ]
    return {label: scorer(text) for label, scorer in scorers}

# --- Main Logic ---
# Flow: preview the text, build the Markov model, produce machine and human
# samples, score both, then render whichever section the user selected.
def _split_human_samples(text, count):
    """Cut ``text`` into ``count`` contiguous samples.

    Whole lines are grouped when the text has at least ``count`` lines.
    Otherwise (for example a single pasted paragraph) the text is cut on
    whitespace-separated words so the samples still differ from one another.
    Only when there are fewer words than samples is the full text repeated.
    Units left over after the even split are dropped.
    """
    for units in (text.splitlines(), text.split()):
        if len(units) >= count:
            size = len(units) // count
            return [" ".join(units[i * size:(i + 1) * size]) for i in range(count)]
    return [text] * count


def _show_figure(fig):
    """Render a Matplotlib figure in Streamlit, then close it.

    Closing matters because the script reruns on every interaction and
    pyplot keeps each figure alive until it is explicitly closed.
    """
    st.pyplot(fig)
    plt.close(fig)


if raw_text:
    st.subheader("Original Text Preview")
    st.text_area("Original Text (first 1000 characters)", raw_text[:1000], height=200)

    # --- Build Markov Model ---
    markov_model = build_ngram_model(raw_text, ngram_order)
    unique_keys = list(markov_model.keys())
    if not unique_keys:
        st.error("Not enough data to build the Markov model. Please upload a larger text.")
    else:
        # --- Generate Machine-Generated Samples ---
        # Generation is random and the script reruns on every click, so the
        # samples are kept in session state and only rebuilt when an input
        # that affects them changes or the user asks for a fresh draw.
        # Without this every section (and the CSV download) would show
        # different data.
        sample_signature = (hash(raw_text), ngram_order, num_words_to_generate, num_samples)
        regenerate = st.sidebar.button("Regenerate samples")
        if (
            regenerate
            or "sample_signature" not in st.session_state
            or st.session_state["sample_signature"] != sample_signature
        ):
            st.session_state["sample_signature"] = sample_signature
            st.session_state["machine_texts"] = [
                generate_text_from_model(markov_model, random.choice(unique_keys), num_words_to_generate)
                for _ in range(num_samples)
            ]
        machine_texts = st.session_state["machine_texts"]

        # --- Prepare Human Text Samples ---
        human_texts = _split_human_samples(raw_text, num_samples)

        # --- Compute Readability Scores ---
        human_scores = [calculate_readability(text, advanced=show_advanced_metrics) for text in human_texts]
        machine_scores = [calculate_readability(text, advanced=show_advanced_metrics) for text in machine_texts]

        # --- Combine Results into DataFrame ---
        human_data = [{"Type": "Human", **scores} for scores in human_scores]
        machine_data = [{"Type": "Machine", **scores} for scores in machine_scores]
        df_scores = pd.DataFrame(human_data + machine_data)

        # Every column except "Type" is a metric; deriving the list from the
        # frame keeps tests and selectors in step with what was computed.
        metric_columns = [column for column in df_scores.columns if column != "Type"]
        human_rows = df_scores[df_scores["Type"] == "Human"]
        machine_rows = df_scores[df_scores["Type"] == "Machine"]
        melted_df = df_scores.melt(id_vars=["Type"], var_name="Metric", value_name="Score")

        # --- Prepare T-test results for later use ---
        t_test_results = {}
        for metric in metric_columns:
            t_stat, p_val = ttest_ind(human_rows[metric], machine_rows[metric], nan_policy="omit")
            t_test_results[metric] = {"T-Statistic": t_stat, "P-Value": p_val}
        t_test_df = (
            pd.DataFrame.from_dict(t_test_results, orient="index")
            .reset_index()
            .rename(columns={"index": "Metric"})
        )

        # --- Section Navigation Buttons with Different Colors ---
        if "current_section" not in st.session_state:
            st.session_state.current_section = "Overview"

        section_names = ["Overview", "Detailed Analysis", "Advanced Visualizations", "Download Results"]
        for column, section_name in zip(st.columns(len(section_names)), section_names):
            if column.button(section_name):
                st.session_state.current_section = section_name

        st.markdown(f"### {st.session_state.current_section}")

        # --- Render Section Content Based on Selection ---
        if st.session_state.current_section == "Overview":
            st.markdown("#### Readability Scores Summary")
            st.dataframe(df_scores)

            st.markdown("#### Readability Comparison Chart")
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.barplot(data=melted_df, x="Metric", y="Score", hue="Type", palette="viridis", ax=ax)
            ax.set_title("Readability Score Comparison", color='red')
            ax.tick_params(axis="x", labelrotation=30)
            _show_figure(fig)

            st.markdown("#### Sample Texts")
            st.markdown("**Human Text Sample:**")
            st.write(human_texts[0])
            st.markdown("**Machine-Generated Text Sample:**")
            st.write(machine_texts[0])

        elif st.session_state.current_section == "Detailed Analysis":
            st.markdown("#### Statistical Analysis (T-test)")
            st.dataframe(t_test_df)

            st.markdown("#### Correlation Matrix")
            corr = df_scores[metric_columns].corr()
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
            ax.set_title("Correlation Matrix", color='navy')
            _show_figure(fig)

            st.markdown("#### Boxplots of Readability Metrics")
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.boxplot(data=melted_df, x="Metric", y="Score", hue="Type", palette="Set2", ax=ax)
            ax.set_title("Boxplot Comparison of Readability Metrics", color='darkgreen')
            _show_figure(fig)

        elif st.session_state.current_section == "Advanced Visualizations":
            st.markdown("#### Interactive Scatter Matrix")
            fig_scatter = px.scatter_matrix(
                df_scores,
                dimensions=metric_columns,
                color="Type",
                title="Scatter Matrix of Readability Metrics"
            )
            st.plotly_chart(fig_scatter, use_container_width=True)

            st.markdown("#### Violin Plot for Selected Metric")
            metric_selected = st.selectbox("Select a metric for distribution", metric_columns)
            fig_violin = px.violin(
                df_scores, x="Type", y=metric_selected, box=True, points="all",
                title=f"Distribution of {metric_selected} Scores"
            )
            st.plotly_chart(fig_violin, use_container_width=True)

            st.markdown("#### Histogram & Density Plot")
            fig, ax = plt.subplots(figsize=(10, 6))
            for sample_type in df_scores["Type"].unique():
                subset = df_scores[df_scores["Type"] == sample_type]
                # `fill` is the current name of the deprecated `shade` argument.
                sns.kdeplot(subset[metric_selected], label=sample_type, fill=True, ax=ax)
            ax.set_title(f"Density Plot for {metric_selected}", color='purple')
            # The curves are labelled, so show which colour is which.
            ax.legend()
            _show_figure(fig)

            st.markdown("#### Radar Chart: Average Readability Scores")
            human_means = human_rows[metric_columns].mean()
            machine_means = machine_rows[metric_columns].mean()
            all_means = pd.concat([human_means, machine_means])
            fig_radar = go.Figure()
            for trace_name, means in (("Human", human_means), ("Machine", machine_means)):
                fig_radar.add_trace(go.Scatterpolar(
                    r=means.tolist(),
                    theta=metric_columns,
                    fill='toself',
                    name=trace_name
                ))
            fig_radar.update_layout(
                polar=dict(
                    radialaxis=dict(
                        visible=True,
                        range=[all_means.min(), all_means.max()]
                    )
                ),
                showlegend=True,
                title="Average Readability Scores Radar Chart"
            )
            st.plotly_chart(fig_radar, use_container_width=True)

            st.markdown("#### Line Chart: Readability Scores Across Samples")
            metric_selected_line = st.selectbox("Select a metric for line chart", metric_columns, key="line_metric")
            line_df = df_scores.copy()
            # Number samples within each type (1..N for both) so the Human and
            # Machine lines share the same x positions and can be compared.
            line_df["Sample"] = line_df.groupby("Type").cumcount() + 1
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.lineplot(data=line_df, x="Sample", y=metric_selected_line, hue="Type", marker="o", ax=ax)
            ax.set_title(f"{metric_selected_line} Across Samples", color='teal')
            _show_figure(fig)

            st.markdown("#### CDF Plot: Cumulative Distribution of Readability Scores")
            metric_selected_cdf = st.selectbox("Select a metric for CDF plot", metric_columns, key="cdf_metric")
            fig, ax = plt.subplots(figsize=(10, 6))
            for sample_type in df_scores["Type"].unique():
                values = (
                    df_scores.loc[df_scores["Type"] == sample_type, metric_selected_cdf]
                    .dropna()
                    .sort_values()
                )
                if values.empty:
                    continue
                # Empirical CDF: the k-th smallest of n values sits at k/n,
                # so the smallest observation is at 1/n rather than 0.
                cdf = np.arange(1, len(values) + 1) / len(values)
                ax.step(values, cdf, where="post", label=sample_type)
            ax.set_title(f"CDF of {metric_selected_cdf} Scores", color='darkblue')
            ax.set_xlabel(metric_selected_cdf)
            ax.set_ylabel("CDF")
            ax.legend()
            _show_figure(fig)

        elif st.session_state.current_section == "Download Results":
            st.markdown("#### Download Readability Scores")
            csv_scores = df_scores.to_csv(index=False)
            st.download_button("Download Readability Scores CSV", data=csv_scores, file_name="readability_scores.csv", mime="text/csv")

            st.markdown("#### Download T-test Results")
            csv_ttest = t_test_df.to_csv(index=False)
            st.download_button("Download T-test Results CSV", data=csv_ttest, file_name="ttest_results.csv", mime="text/csv")
else:
    st.info("⚠️ Please upload or paste a text file to begin analysis.")


Overwriting MarkovChain.py
