In [1]:
%%writefile app.py
import streamlit as st
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import PyPDF2
import docx

# -----------------------------
# Function to extract text
# -----------------------------
def extract_text_from_pdf(file):
    """Return the text of every page in a PDF, each page followed by a space.

    Args:
        file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        The concatenated page texts. Every page contributes its extracted
        text plus one trailing space, so the result is ``""`` for a PDF
        without pages.
    """
    reader = PyPDF2.PdfReader(file)
    # `or ""` is defensive: if a page yields None instead of a string (seen
    # with some extractor versions -- TODO confirm for the pinned PyPDF2),
    # treat it as empty rather than failing on `None + " "`.
    # A single join avoids repeated string reallocation on large documents.
    return "".join(f"{page.extract_text() or ''} " for page in reader.pages)

def extract_text_from_docx(file):
    """Return the text of all paragraphs in a .docx file, joined by spaces.

    Args:
        file: A path or binary file-like object accepted by
            ``docx.Document``.

    Returns:
        The paragraph texts separated by single spaces.
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return " ".join(paragraph_texts)

# -----------------------------
# Streamlit App
# -----------------------------
def _render_figure(fig):
    """Draw *fig* in the app, then close it.

    Streamlit re-executes the whole script on every interaction; without an
    explicit close each rerun would leave three more figures open in pyplot.
    """
    st.pyplot(fig)
    plt.close(fig)


st.title("📄 Document Visualizer (Word & PDF)")

uploaded_file = st.file_uploader(
    "Upload a Word (.docx) or PDF file",
    type=["docx", "pdf"],
)

if uploaded_file:
    # Extract text. The MIME type is reported by the browser and may be
    # missing, so the file extension is accepted as a fallback.
    is_pdf = (
        uploaded_file.type == "application/pdf"
        or uploaded_file.name.lower().endswith(".pdf")
    )
    try:
        if is_pdf:
            text = extract_text_from_pdf(uploaded_file)
        else:
            text = extract_text_from_docx(uploaded_file)
    except Exception as exc:  # UI boundary: report the problem, don't crash
        st.error(f"Could not read the uploaded file: {exc}")
        st.stop()

    st.subheader("📑 Extracted Text (Preview)")
    st.text_area("Document Content", text[:2000] + ("..." if len(text) > 2000 else ""), height=200)

    if text.strip():
        words = text.lower().split()
        word_counts = Counter(words)

        # -----------------------------
        # 1. Word Cloud
        # -----------------------------
        st.subheader("☁️ Word Cloud")
        try:
            wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
        except ValueError:
            # WordCloud refuses to plot when no usable word is left
            # (e.g. the text is only punctuation or stop words).
            st.info("Not enough words to build a word cloud.")
        else:
            fig, ax = plt.subplots()
            ax.imshow(wordcloud, interpolation="bilinear")
            ax.axis("off")
            _render_figure(fig)

        # -----------------------------
        # 2. Bar Chart (Top Words)
        # -----------------------------
        st.subheader("📊 Top 10 Words Frequency")
        # `words` is non-empty here (text.strip() is truthy), so the
        # unpacking below always receives at least one (word, count) pair.
        top_words = word_counts.most_common(10)
        words_, counts_ = zip(*top_words)
        fig, ax = plt.subplots()
        ax.bar(words_, counts_)
        # Rotate through the Axes object instead of pyplot's implicit
        # "current figure" state.
        ax.tick_params(axis="x", labelrotation=45)
        _render_figure(fig)

        # -----------------------------
        # 3. Line Chart (Word Length Distribution)
        # -----------------------------
        st.subheader("📈 Word Length Distribution")
        word_lengths = [len(w) for w in words if w.isalpha()]
        length_counts = Counter(word_lengths)
        if length_counts:
            lengths, counts = zip(*sorted(length_counts.items()))
            fig, ax = plt.subplots()
            ax.plot(lengths, counts, marker="o")
            ax.set_xlabel("Word Length")
            ax.set_ylabel("Frequency")
            _render_figure(fig)
        else:
            # Without this guard zip(*[]) yields nothing and the tuple
            # unpacking raises ValueError (e.g. a document of numbers only).
            st.info("No purely alphabetic words found, so there is no length distribution to plot.")
    else:
        st.warning("No extractable text was found in this document.")


Writing app.py


In [None]:
# Launch the Streamlit app that the %%writefile cell saved as app.py.
!streamlit run app.py

In [None]:
pip install streamlit