In [None]:
# Python Notebook
# Placeholder cell: prints a fixed greeting, which is a quick way to confirm
# that the kernel is executing code. Nothing below depends on it.
print("Hello, World!")


In [None]:
# Install dependencies
# These are the third-party packages imported by the Flask duplicate-finder
# cell that follows (flask, pandas, scikit-learn, numpy).
# `%pip` installs into the environment of the running kernel.
%pip install flask pandas scikit-learn numpy

In [3]:
from flask import Flask, render_template_string, request, jsonify
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from io import StringIO
import json

# Flask application object; the route handlers below register on it via @app.route.
app = Flask(__name__)

# Global variables to store data and model
# These are module-level, so every request served by this process shares them.
# DataFrame produced by pd.read_csv in the /upload handler.
dataset = None
# TfidfVectorizer created in the /train handler.
tfidf_vectorizer = None
# Result of fit_transform on the selected column, set in the /train handler.
tfidf_matrix = None

HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <title>Duplicate Finder</title>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            padding: 20px;
        }
        .container {
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            border-radius: 15px;
            box-shadow: 0 20px 60px rgba(0,0,0,0.3);
            overflow: hidden;
        }
        .header {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 30px;
            text-align: center;
        }
        .header h1 {
            font-size: 2.5em;
            margin-bottom: 10px;
        }
        .header p {
            font-size: 1.1em;
            opacity: 0.9;
        }
        .content {
            padding: 40px;
        }
        .section {
            margin-bottom: 30px;
        }
        .section-title {
            font-size: 1.5em;
            color: #667eea;
            margin-bottom: 15px;
            padding-bottom: 10px;
            border-bottom: 2px solid #667eea;
        }
        .upload-area {
            border: 3px dashed #667eea;
            border-radius: 10px;
            padding: 30px;
            text-align: center;
            background: #f8f9ff;
            transition: all 0.3s;
        }
        .upload-area:hover {
            background: #eef0ff;
            border-color: #764ba2;
        }
        input[type="file"] {
            display: none;
        }
        .upload-btn {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 15px 30px;
            border: none;
            border-radius: 25px;
            font-size: 1.1em;
            cursor: pointer;
            transition: transform 0.2s;
        }
        .upload-btn:hover {
            transform: scale(1.05);
        }
        .input-group {
            margin-bottom: 20px;
        }
        label {
            display: block;
            margin-bottom: 8px;
            color: #333;
            font-weight: 600;
        }
        input[type="text"], input[type="number"], textarea, select {
            width: 100%;
            padding: 12px;
            border: 2px solid #e0e0e0;
            border-radius: 8px;
            font-size: 1em;
            transition: border-color 0.3s;
        }
        input[type="text"]:focus, input[type="number"]:focus, textarea:focus, select:focus {
            outline: none;
            border-color: #667eea;
        }
        textarea {
            resize: vertical;
            min-height: 100px;
        }
        .check-btn {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 15px 40px;
            border: none;
            border-radius: 25px;
            font-size: 1.1em;
            cursor: pointer;
            transition: transform 0.2s;
            width: 100%;
        }
        .check-btn:hover {
            transform: translateY(-2px);
            box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
        }
        .check-btn:disabled {
            background: #ccc;
            cursor: not-allowed;
            transform: none;
        }
        .results {
            margin-top: 30px;
            padding: 20px;
            background: #f8f9ff;
            border-radius: 10px;
            display: none;
        }
        .results.show {
            display: block;
        }
        .alert {
            padding: 15px 20px;
            border-radius: 8px;
            margin-bottom: 20px;
            font-size: 1.1em;
            font-weight: 600;
        }
        .alert-success {
            background: #d4edda;
            color: #155724;
            border-left: 5px solid #28a745;
        }
        .alert-warning {
            background: #fff3cd;
            color: #856404;
            border-left: 5px solid #ffc107;
        }
        .duplicate-item {
            background: white;
            padding: 15px;
            margin-bottom: 15px;
            border-radius: 8px;
            border-left: 4px solid #ffc107;
        }
        .similarity-score {
            display: inline-block;
            background: #667eea;
            color: white;
            padding: 5px 15px;
            border-radius: 15px;
            font-weight: 600;
            margin-left: 10px;
        }
        .dataset-info {
            background: #e8f5e9;
            padding: 15px;
            border-radius: 8px;
            margin-bottom: 20px;
            display: none;
        }
        .dataset-info.show {
            display: block;
        }
        .loading {
            text-align: center;
            padding: 20px;
            display: none;
        }
        .loading.show {
            display: block;
        }
        .spinner {
            border: 4px solid #f3f3f3;
            border-top: 4px solid #667eea;
            border-radius: 50%;
            width: 40px;
            height: 40px;
            animation: spin 1s linear infinite;
            margin: 0 auto;
        }
        @keyframes spin {
            0% { transform: rotate(0deg); }
            100% { transform: rotate(360deg); }
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>🔍 Duplicate Finder</h1>
            <p>TF-IDF + Cosine Similarity for Duplicate Item Detection</p>
        </div>
        
        <div class="content">
            <!-- Upload Dataset Section -->
            <div class="section">
                <h2 class="section-title">1. Upload Dataset</h2>
                <div class="upload-area">
                    <p style="margin-bottom: 15px; color: #666;">Upload your CSV dataset to train the duplicate detection model</p>
                    <input type="file" id="fileInput" accept=".csv">
                    <button class="upload-btn" onclick="document.getElementById('fileInput').click()">
                        📂 Choose CSV File
                    </button>
                    <p id="fileName" style="margin-top: 15px; color: #667eea; font-weight: 600;"></p>
                </div>
                <div class="dataset-info" id="datasetInfo"></div>
            </div>

            <!-- Column Selection Section -->
            <div class="section">
                <h2 class="section-title">2. Select Text Column</h2>
                <div class="input-group">
                    <label>Select the column to analyze for duplicates:</label>
                    <select id="columnSelect" disabled>
                        <option value="">-- Upload dataset first --</option>
                    </select>
                </div>
            </div>

            <!-- Check Item Section -->
            <div class="section">
                <h2 class="section-title">3. Check New Item</h2>
                <div class="input-group">
                    <label>Enter item description to check for duplicates:</label>
                    <textarea id="itemInput" placeholder="Enter item name or description..."></textarea>
                </div>
                <div class="input-group">
                    <label>Similarity Threshold (0.0 - 1.0):</label>
                    <input type="number" id="threshold" value="0.7" min="0" max="1" step="0.05">
                    <small style="color: #666;">Items with similarity above this threshold will be flagged as duplicates</small>
                </div>
                <button class="check-btn" id="checkBtn" onclick="checkDuplicate()" disabled>
                    🔍 Check for Duplicates
                </button>
            </div>

            <!-- Loading -->
            <div class="loading" id="loading">
                <div class="spinner"></div>
                <p style="margin-top: 15px; color: #667eea;">Processing...</p>
            </div>

            <!-- Results Section -->
            <div class="results" id="results"></div>
        </div>
    </div>

    <script>
        let datasetUploaded = false;

        // When the user picks a CSV file, show its name and send it to /upload.
        document.getElementById('fileInput').addEventListener('change', function(e) {
            const file = e.target.files[0];
            if (file) {
                // U+1F4C4 (page facing up); the previous literal was mis-decoded UTF-8.
                document.getElementById('fileName').textContent = '📄 ' + file.name;
                uploadDataset(file);
            }
        });

        // Sends the chosen CSV to /upload. On success it shows the dataset summary,
        // fills the column selector, and wires the selector to trainModel().
        // On failure it alerts the error reported by the server or by fetch.
        function uploadDataset(file) {
            const formData = new FormData();
            formData.append('file', file);

            const loading = document.getElementById('loading');
            loading.classList.add('show');

            fetch('/upload', {
                method: 'POST',
                body: formData
            })
            .then(response => response.json())
            .then(data => {
                loading.classList.remove('show');

                if (!data.success) {
                    alert('Error uploading dataset: ' + data.error);
                    return;
                }

                datasetUploaded = true;

                // Update dataset info. Column names come from the uploaded CSV header,
                // so they are inserted as text nodes and never parsed as HTML.
                const infoDiv = document.getElementById('datasetInfo');
                infoDiv.textContent = '';
                const heading = document.createElement('strong');
                heading.textContent = '✅ Dataset loaded successfully!';
                infoDiv.appendChild(heading);
                infoDiv.appendChild(document.createElement('br'));
                infoDiv.appendChild(document.createTextNode('📊 Total items: ' + data.total_items));
                infoDiv.appendChild(document.createElement('br'));
                infoDiv.appendChild(document.createTextNode('📋 Columns: ' + data.columns.join(', ')));
                infoDiv.classList.add('show');

                // Populate column selector
                const select = document.getElementById('columnSelect');
                select.innerHTML = '<option value="">-- Select a column --</option>';
                data.columns.forEach(col => {
                    const option = document.createElement('option');
                    option.value = col;
                    option.textContent = col;
                    select.appendChild(option);
                });
                select.disabled = false;

                // A fresh dataset has no selected column yet, so checking is not possible.
                const checkBtn = document.getElementById('checkBtn');
                checkBtn.disabled = true;

                // Assign the handler instead of calling addEventListener: uploading a second
                // file would otherwise stack another listener and fire /train repeatedly.
                select.onchange = function() {
                    checkBtn.disabled = !this.value;
                    if (this.value) {
                        trainModel(this.value);
                    }
                };
            })
            .catch(error => {
                loading.classList.remove('show');
                alert('Error uploading dataset: ' + error);
            });
        }

        // Asks the server to fit the TF-IDF model on the given column.
        // The check button is disabled while the request is in flight and is only
        // re-enabled once the server reports success, so a check cannot be issued
        // against a missing or failed model.
        function trainModel(column) {
            const loading = document.getElementById('loading');
            const checkBtn = document.getElementById('checkBtn');

            loading.classList.add('show');
            checkBtn.disabled = true;

            fetch('/train', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify({column: column})
            })
            .then(response => response.json())
            .then(data => {
                loading.classList.remove('show');

                if (data.success) {
                    checkBtn.disabled = false;
                    console.log('Model trained successfully');
                } else {
                    alert('Error training model: ' + data.error);
                }
            })
            .catch(error => {
                loading.classList.remove('show');
                alert('Error training model: ' + error);
            });
        }

        // Validates the form, posts the item to /check, and renders the answer.
        // Server-side failures (payloads carrying an error field) are alerted
        // instead of being passed on as if they were a result.
        function checkDuplicate() {
            const item = document.getElementById('itemInput').value.trim();
            const threshold = parseFloat(document.getElementById('threshold').value);
            const column = document.getElementById('columnSelect').value;

            if (!item) {
                alert('Please enter an item description');
                return;
            }

            if (!column) {
                alert('Please select a column');
                return;
            }

            // An empty or non-numeric field parses to NaN, which JSON.stringify would
            // send as null; reject it here with a clear message instead.
            if (Number.isNaN(threshold) || threshold < 0 || threshold > 1) {
                alert('Please enter a similarity threshold between 0 and 1');
                return;
            }

            const loading = document.getElementById('loading');
            loading.classList.add('show');
            document.getElementById('results').classList.remove('show');

            fetch('/check', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify({
                    item: item,
                    threshold: threshold,
                    column: column
                })
            })
            .then(response => response.json())
            .then(data => {
                loading.classList.remove('show');
                if (data.error) {
                    alert('Error checking duplicate: ' + data.error);
                    return;
                }
                displayResults(data);
            })
            .catch(error => {
                loading.classList.remove('show');
                alert('Error checking duplicate: ' + error);
            });
        }

        // Renders a /check response into the results panel.
        //   data.is_duplicate - whether any item met the threshold
        //   data.duplicates   - array of entries with item and similarity fields
        //   data.error        - present when the server reported a failure
        // The panel is built from DOM nodes with textContent, so dataset text is
        // never parsed as HTML. This also removes the previous scoping bug, where
        // the markup variable was declared with let inside the if-branch and was
        // therefore not visible to the final assignment.
        function displayResults(data) {
            const resultsDiv = document.getElementById('results');

            // Small factory: element with an optional class and optional text.
            const make = (tag, className, text) => {
                const node = document.createElement(tag);
                if (className) {
                    node.className = className;
                }
                if (text !== undefined) {
                    node.textContent = text;
                }
                return node;
            };

            const duplicates = Array.isArray(data.duplicates) ? data.duplicates : [];
            resultsDiv.textContent = '';

            if (data.error) {
                // A failure payload must not be reported as "no duplicates".
                resultsDiv.appendChild(make('div', 'alert alert-warning', '⚠️ ' + data.error));
            } else if (data.is_duplicate) {
                resultsDiv.appendChild(make(
                    'div',
                    'alert alert-warning',
                    '⚠️ DUPLICATE DETECTED! This item is similar to ' + duplicates.length + ' existing item(s)'
                ));

                const heading = make('h3', '', 'Similar Items Found:');
                heading.style.marginBottom = '15px';
                heading.style.color = '#333';
                resultsDiv.appendChild(heading);

                duplicates.forEach((dup, index) => {
                    const card = make('div', 'duplicate-item');
                    card.appendChild(make('strong', '', 'Match ' + (index + 1)));
                    card.appendChild(document.createTextNode(' '));
                    card.appendChild(make(
                        'span',
                        'similarity-score',
                        (dup.similarity * 100).toFixed(1) + '% Similar'
                    ));

                    const description = make('p', '', String(dup.item));
                    description.style.marginTop = '10px';
                    description.style.color = '#555';
                    card.appendChild(description);

                    resultsDiv.appendChild(card);
                });
            } else {
                resultsDiv.appendChild(make(
                    'div',
                    'alert alert-success',
                    '✅ No duplicates found! This item is unique.'
                ));
            }

            resultsDiv.classList.add('show');
        }
    </script>
</body>
</html>
"""

@app.route('/')
def index():
    """Serve the single-page duplicate-finder UI rendered from HTML_TEMPLATE."""
    page = render_template_string(HTML_TEMPLATE)
    return page

@app.route('/upload', methods=['POST'])
def upload_file():
    """Receive a CSV upload and make it the active dataset.

    Expects a multipart form field named ``file``. On success the parsed
    DataFrame replaces the module-level ``dataset`` and any previously
    trained TF-IDF model is discarded, because its rows no longer correspond
    to the new data.

    Returns:
        JSON ``{'success': True, 'total_items': int, 'columns': [...]}`` on
        success; ``{'success': False, 'error': str}`` with status 400 for a
        missing/unparseable file, or 500 for any other failure.
    """
    global dataset, tfidf_vectorizer, tfidf_matrix
    try:
        if 'file' not in request.files:
            return jsonify({'success': False, 'error': 'No file provided'}), 400

        file = request.files['file']
        if file.filename == '':
            return jsonify({'success': False, 'error': 'No file selected'}), 400

        # Read CSV file. A file that cannot be parsed is a client error, not a server fault.
        try:
            new_dataset = pd.read_csv(file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as e:
            return jsonify({'success': False, 'error': f'Could not parse CSV: {e}'}), 400

        dataset = new_dataset
        # Invalidate the old model: its matrix rows index the previous dataset, so
        # /check would otherwise return wrong rows or fail with an IndexError.
        tfidf_vectorizer = None
        tfidf_matrix = None

        return jsonify({
            'success': True,
            'total_items': len(dataset),
            'columns': list(dataset.columns)
        })
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500

@app.route('/train', methods=['POST'])
def train():
    """Fit the TF-IDF model on one column of the uploaded dataset.

    Expects a JSON body ``{'column': <column name>}``. Missing values in the
    column are replaced by empty strings and every value is converted to
    ``str`` before fitting.

    Returns:
        JSON ``{'success': True}`` on success; ``{'success': False,
        'error': str}`` with status 400 when no dataset is loaded, the column
        is unknown, or the column yields no usable vocabulary, and 500 for
        any other failure.
    """
    global dataset, tfidf_vectorizer, tfidf_matrix
    try:
        # silent=True yields None instead of raising on a missing/invalid JSON body.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        column = data.get('column')

        if dataset is None:
            return jsonify({'success': False, 'error': 'No dataset uploaded'}), 400

        if column not in dataset.columns:
            return jsonify({'success': False, 'error': 'Column not found'}), 400

        # Prepare text data
        text_data = dataset[column].fillna('').astype(str).tolist()

        # Train TF-IDF vectorizer on locals first: if fitting fails, the globals must
        # not be left holding an unfitted vectorizer next to a stale matrix.
        vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
        try:
            matrix = vectorizer.fit_transform(text_data)
        except ValueError as e:
            # Raised by scikit-learn e.g. when the column contains only stop words.
            return jsonify({'success': False, 'error': str(e)}), 400

        tfidf_vectorizer, tfidf_matrix = vectorizer, matrix

        return jsonify({'success': True})
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500

@app.route('/check', methods=['POST'])
def check_duplicate():
    """Compare one item against the trained dataset and report near-duplicates.

    Expects a JSON body with:
        item (str): text to check.
        threshold (float, optional): minimum cosine similarity in [0, 1];
            defaults to 0.7.
        column (str): dataset column whose text is returned for each match.

    Returns:
        JSON ``{'is_duplicate': bool, 'duplicates': [{'item': str,
        'similarity': float}, ...]}`` with matches ordered by descending
        similarity; ``{'success': False, 'error': str}`` with status 400 for
        invalid input or a missing/stale model, and 500 for any other failure.
    """
    global dataset, tfidf_vectorizer, tfidf_matrix
    try:
        # silent=True yields None instead of raising on a missing/invalid JSON body.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'success': False, 'error': 'Request body must be a JSON object'}), 400

        item = data.get('item')
        column = data.get('column')

        if dataset is None or tfidf_vectorizer is None or tfidf_matrix is None:
            return jsonify({'success': False, 'error': 'Model not trained. Please upload dataset and train first.'}), 400

        if not isinstance(item, str) or not item.strip():
            return jsonify({'success': False, 'error': 'No item provided'}), 400

        try:
            threshold = float(data.get('threshold', 0.7))
        except (TypeError, ValueError):
            return jsonify({'success': False, 'error': 'Threshold must be a number'}), 400
        # The chained comparison is also False for NaN.
        if not 0.0 <= threshold <= 1.0:
            return jsonify({'success': False, 'error': 'Threshold must be between 0 and 1'}), 400

        if column not in dataset.columns:
            return jsonify({'success': False, 'error': 'Column not found'}), 400

        # Matrix rows are looked up positionally in the dataset below, so the two
        # must describe the same rows.
        if tfidf_matrix.shape[0] != len(dataset):
            return jsonify({'success': False, 'error': 'Dataset changed since training. Please train again.'}), 400

        # Vectorize the input item
        item_vector = tfidf_vectorizer.transform([item])

        # Calculate cosine similarity
        similarities = cosine_similarity(item_vector, tfidf_matrix)[0]

        # Find duplicates above threshold, highest similarity first
        # (stable, so ties keep dataset order).
        duplicate_indices = np.where(similarities >= threshold)[0]
        ordered = duplicate_indices[np.argsort(-similarities[duplicate_indices], kind='stable')]

        # Normalise the matched cells the same way /train does. Raw cells can be
        # NaN or numpy scalars, which do not serialise to valid JSON.
        matched_text = dataset[column].iloc[ordered].fillna('').astype(str).tolist()

        duplicates = [
            {'item': text, 'similarity': float(similarities[idx])}
            for idx, text in zip(ordered, matched_text)
        ]

        return jsonify({
            'is_duplicate': bool(duplicates),
            'duplicates': duplicates
        })
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500

if __name__ == '__main__':
    import os

    # The interactive debugger lets anyone who can reach the page execute code on
    # this machine, so it is opt-in (FLASK_DEBUG=1) and, when enabled, the server
    # binds to localhost only instead of all interfaces.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    bind_host = '127.0.0.1' if debug_enabled else '0.0.0.0'

    # use_reloader=False: the auto-reloader restarts the launching process, which
    # is not appropriate when this cell is executed inside a notebook kernel.
    app.run(debug=debug_enabled, host=bind_host, port=5000, use_reloader=False)

ModuleNotFoundError: No module named 'flask'

In [None]:
# Install Streamlit for the interactive app
# plotly and wordcloud are imported by the Streamlit script in the next cell.
# `%pip` installs into the environment of the running kernel.
%pip install streamlit plotly wordcloud


In [None]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import re

# Page configuration
# The emoji literals in this script had been mis-decoded (UTF-8 bytes read in a
# legacy encoding); they are restored here. NOTE(review): page_icon is expected
# to be a real emoji or an image, so the garbled value may not have rendered.
st.set_page_config(
    page_title="Duplicate Finder with Interpretability",
    page_icon="🔍",
    layout="wide"
)

# Initialize session state: every key starts as None until the user uploads a
# dataset and trains a model.
for _state_key in ('dataset', 'tfidf_vectorizer', 'tfidf_matrix', 'selected_column', 'text_data'):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None

st.title("🔍 Duplicate Finder with Interpretability")
st.markdown("**TF-IDF + Cosine Similarity for Duplicate Detection**")

# Sidebar for configuration
with st.sidebar:
    st.header("⚙️ Configuration")
    # Minimum cosine similarity for an existing row to count as a duplicate.
    threshold = st.slider("Similarity Threshold", 0.0, 1.0, 0.7, 0.05)
    # Passed to TfidfVectorizer(max_features=...) when the model is trained.
    max_features = st.slider("Max TF-IDF Features", 100, 10000, 5000, 100)
    show_interpretability = st.checkbox("Show Interpretability", value=True)

# Main content area
col1, col2 = st.columns([1, 1])

with col1:
    st.header("📥 Input")

    # File upload
    uploaded_file = st.file_uploader("Upload CSV Dataset", type=['csv'])

    if uploaded_file is not None:
        try:
            # The script is re-executed on every interaction; rewind the buffer so a
            # re-run does not try to parse from the end of the file.
            uploaded_file.seek(0)
            st.session_state.dataset = pd.read_csv(uploaded_file)
            st.success(f"✅ Dataset loaded: {len(st.session_state.dataset)} rows")

            # Column selection
            st.session_state.selected_column = st.selectbox(
                "Select Text Column to Analyze",
                options=st.session_state.dataset.columns.tolist()
            )

            if st.session_state.selected_column:
                # Train model button
                if st.button("🚀 Train Model", type="primary"):
                    with st.spinner("Training TF-IDF model..."):
                        # Prepare text data
                        text_data = st.session_state.dataset[
                            st.session_state.selected_column
                        ].fillna('').astype(str).tolist()

                        # Train TF-IDF on locals first: session state must never hold
                        # an unfitted vectorizer, because the check button is enabled
                        # as soon as a vectorizer is present.
                        vectorizer = TfidfVectorizer(
                            max_features=max_features,
                            stop_words='english'
                        )
                        try:
                            matrix = vectorizer.fit_transform(text_data)
                        except ValueError as e:
                            # Raised by scikit-learn e.g. when the column contains only stop words.
                            st.error(f"Error training model: {str(e)}")
                        else:
                            st.session_state.text_data = text_data
                            st.session_state.tfidf_vectorizer = vectorizer
                            st.session_state.tfidf_matrix = matrix
                            st.success("✅ Model trained successfully!")

                            # Show model info
                            st.info(f"📊 Vocabulary size: {len(vectorizer.vocabulary_)}")
        except Exception as e:
            st.error(f"Error loading file: {str(e)}")

    # Item input
    st.subheader("🔎 Check New Item")
    item_input = st.text_area(
        "Enter item description to check for duplicates:",
        height=100,
        placeholder="Enter item name or description..."
    )

    # Checking is only possible once a model has been trained.
    check_button = st.button("🔍 Check for Duplicates", type="primary", disabled=st.session_state.tfidf_vectorizer is None)

with col2:
    st.header("📤 Output")

    if check_button and item_input:
        if st.session_state.tfidf_vectorizer is None:
            st.warning("⚠️ Please train the model first!")
        else:
            with st.spinner("Analyzing..."):
                # Vectorize input
                item_vector = st.session_state.tfidf_vectorizer.transform([item_input])

                # Calculate similarity
                similarities = cosine_similarity(item_vector, st.session_state.tfidf_matrix)[0]

                # Find duplicates and order them by numeric similarity, highest first
                # (stable, so ties keep dataset order).
                duplicate_indices = np.where(similarities >= threshold)[0]
                duplicate_indices = duplicate_indices[
                    np.argsort(-similarities[duplicate_indices], kind='stable')
                ]

                if len(duplicate_indices) > 0:
                    st.error(f"⚠️ **DUPLICATE DETECTED!** Found {len(duplicate_indices)} similar item(s)")

                    # Take the matched text from the list captured at training time. It is
                    # row-aligned with the TF-IDF matrix even if another file or column
                    # has been selected since, and every entry is already a str.
                    matched_text = [st.session_state.text_data[idx] for idx in duplicate_indices]

                    # Create results dataframe (rows are already in descending order)
                    results_df = pd.DataFrame({
                        'Item': matched_text,
                        'Similarity Score': [f"{similarities[idx]:.3f}" for idx in duplicate_indices],
                        'Similarity %': [f"{similarities[idx]*100:.1f}%" for idx in duplicate_indices]
                    })
                    st.dataframe(results_df, use_container_width=True)

                    # Store for interpretability; 'duplicates' is ordered best match first.
                    st.session_state.duplicate_results = {
                        'item': item_input,
                        'duplicates': [
                            {
                                'index': int(idx),
                                'item': text,
                                'similarity': float(similarities[idx])
                            }
                            for idx, text in zip(duplicate_indices, matched_text)
                        ],
                        'similarities': similarities
                    }
                else:
                    st.success("✅ **No duplicates found!** This item is unique.")
                    st.session_state.duplicate_results = None

# Interpretability Section
# Shown only when the sidebar option is on and the last check stored at least
# one duplicate in session state. Emoji literals restored from mis-decoded text.
if show_interpretability and 'duplicate_results' in st.session_state and st.session_state.duplicate_results:
    st.header("🔬 Interpretability")

    results = st.session_state.duplicate_results

    # Tabs for different interpretability views
    tab1, tab2, tab3, tab4 = st.tabs([
        "📊 Similarity Distribution",
        "🔤 Feature Importance",
        "📈 Top Matches Analysis",
        "💬 Word Comparison"
    ])
    
    with tab1:
        st.subheader("Similarity Score Distribution")

        # Histogram of the query's similarity to every row, with the current
        # threshold drawn as a dashed vertical line.
        all_scores = results['similarities']
        fig = px.histogram(
            x=all_scores,
            nbins=50,
            labels={'x': 'Cosine Similarity Score', 'y': 'Count'},
            title="Distribution of Similarity Scores"
        )
        fig.add_vline(
            x=threshold,
            line_dash="dash",
            line_color="red",
            annotation_text=f"Threshold: {threshold}"
        )
        st.plotly_chart(fig, use_container_width=True)

        # Summary statistics, one metric per column.
        summary_metrics = [
            ("Mean Similarity", f"{np.mean(all_scores):.3f}"),
            ("Max Similarity", f"{np.max(all_scores):.3f}"),
            ("Min Similarity", f"{np.min(all_scores):.3f}"),
            ("Above Threshold", f"{len(results['duplicates'])}"),
        ]
        for metric_column, (metric_label, metric_value) in zip(st.columns(4), summary_metrics):
            with metric_column:
                st.metric(metric_label, metric_value)
    
    with tab2:
        st.subheader("TF-IDF Feature Importance")

        if len(results['duplicates']) > 0:
            # Get top match: the entry with the highest similarity. The stored list is
            # not guaranteed to be sorted, so its first element is not used.
            top_match = max(results['duplicates'], key=lambda dup: dup['similarity'])
            top_idx = top_match['index']

            # Get feature names
            feature_names = st.session_state.tfidf_vectorizer.get_feature_names_out()

            # Get TF-IDF vectors
            input_vector = st.session_state.tfidf_vectorizer.transform([results['item']])
            match_vector = st.session_state.tfidf_matrix[top_idx]

            # Convert to arrays
            input_array = input_vector.toarray()[0]
            match_array = match_vector.toarray()[0]

            # Get top contributing features
            # Use product of both vectors to find important shared features
            shared_features = input_array * match_array
            top_feature_indices = np.argsort(shared_features)[-20:][::-1]

            # Create feature importance dataframe with numeric values, so the bar
            # chart gets a quantitative y-axis rather than one category per string.
            importance_df = pd.DataFrame([
                {
                    'Feature': feature_names[idx],
                    'Input TF-IDF': float(input_array[idx]),
                    'Match TF-IDF': float(match_array[idx]),
                    'Shared Importance': float(shared_features[idx])
                }
                for idx in top_feature_indices
                if shared_features[idx] > 0
            ])

            if not importance_df.empty:
                # Fixed four-decimal strings are used for the table only.
                display_df = importance_df.copy()
                for score_column in ('Input TF-IDF', 'Match TF-IDF', 'Shared Importance'):
                    display_df[score_column] = display_df[score_column].map("{:.4f}".format)
                st.dataframe(display_df, use_container_width=True)

                # Bar chart
                fig = px.bar(
                    importance_df.head(10),
                    x='Feature',
                    y='Shared Importance',
                    title="Top 10 Most Important Shared Features",
                    labels={'Shared Importance': 'TF-IDF Product'}
                )
                fig.update_xaxes(tickangle=45)
                st.plotly_chart(fig, use_container_width=True)
    
    with tab3:
        st.subheader("Top Matches Detailed Analysis")

        # Sort duplicates by similarity
        sorted_duplicates = sorted(results['duplicates'], key=lambda x: x['similarity'], reverse=True)

        # One expander per match, for at most the five best matches.
        for i, dup in enumerate(sorted_duplicates[:5], 1):
            with st.expander(f"Match #{i}: Similarity = {dup['similarity']:.3f} ({dup['similarity']*100:.1f}%)"):
                col1, col2 = st.columns(2)

                with col1:
                    st.write("**Input Item:**")
                    st.write(results['item'])

                with col2:
                    st.write("**Matched Item:**")
                    st.write(dup['item'])

                # Show similarity score. st.progress only accepts floats in [0.0, 1.0],
                # and floating-point rounding can put the cosine similarity of identical
                # texts marginally above 1, so the value is clamped for the bar only.
                st.progress(min(max(dup['similarity'], 0.0), 1.0))
                st.caption(f"Similarity: {dup['similarity']:.3f}")
    
    with tab4:
        st.subheader("Word-Level Comparison")

        if len(results['duplicates']) > 0:
            # Compare against the entry with the highest similarity; the stored list
            # is not guaranteed to be sorted, so its first element is not used.
            top_match = max(results['duplicates'], key=lambda dup: dup['similarity'])

            # Extract words
            def extract_words(text):
                """Return the lowercase alphabetic words found in ``text``.

                ``text`` is converted with ``str`` first, because a matched
                dataset cell is not necessarily a string.
                """
                words = re.findall(r'\b[a-zA-Z]+\b', str(text).lower())
                return words

            input_words = Counter(extract_words(results['item']))
            match_words = Counter(extract_words(top_match['item']))

            # Find common words; sorted so the listing and chart are the same on every run.
            common_words = sorted(set(input_words.keys()) & set(match_words.keys()))

            col1, col2, col3 = st.columns(3)

            with col1:
                st.write("**Input Words:**")
                st.write(list(input_words.keys())[:20])
                st.metric("Unique Words", len(input_words))

            with col2:
                st.write("**Match Words:**")
                st.write(list(match_words.keys())[:20])
                st.metric("Unique Words", len(match_words))

            with col3:
                st.write("**Common Words:**")
                st.write(common_words[:20])
                st.metric("Common Words", len(common_words))

            # Word frequency comparison
            if common_words:
                freq_df = pd.DataFrame([
                    {
                        'Word': word,
                        'Input Count': input_words[word],
                        'Match Count': match_words[word]
                    }
                    for word in common_words[:15]
                ])

                fig = go.Figure()
                fig.add_trace(go.Bar(
                    x=freq_df['Word'],
                    y=freq_df['Input Count'],
                    name='Input',
                    marker_color='lightblue'
                ))
                fig.add_trace(go.Bar(
                    x=freq_df['Word'],
                    y=freq_df['Match Count'],
                    name='Match',
                    marker_color='lightcoral'
                ))
                fig.update_layout(
                    title="Common Word Frequency Comparison",
                    xaxis_title="Word",
                    yaxis_title="Frequency",
                    barmode='group'
                )
                fig.update_xaxes(tickangle=45)
                st.plotly_chart(fig, use_container_width=True)

# Footer: a divider, a heading, and a short explanation of the pipeline,
# each rendered as its own markdown element.
how_it_works_steps = """
1. **TF-IDF Vectorization**: Converts text into numerical vectors based on term frequency and inverse document frequency
2. **Cosine Similarity**: Measures the angle between vectors to determine similarity (0 = different, 1 = identical)
3. **Threshold-based Detection**: Items with similarity above the threshold are flagged as duplicates
4. **Interpretability**: Shows which features (words) contribute most to the similarity score
"""
for footer_block in ("---", "**How it works:**", how_it_works_steps):
    st.markdown(footer_block)


# How to Run the Applications

## Option 1: Streamlit App (Recommended - Interactive UI)

To run the Streamlit app, use one of these methods:

### Method A: From Terminal
```bash
streamlit run streamlit_app.py
```

### Method B: From Notebook
Save the Streamlit code from the cell above (Cell 4) to a file named `streamlit_app.py` — running the cell inside the notebook does not start the app — then, in a new terminal, run:
```bash
streamlit run streamlit_app.py
```

## Option 2: CLI Application

Run from terminal:
```bash
python cli_app.py <csv_file> <column_name> <item_to_check> [--threshold 0.7]
```

Example:
```bash
python cli_app.py products.csv product_name "iPhone 13 Pro" --threshold 0.8
```

## Features

### Input → Output → Interpretability Flow:

1. **Input**: Upload CSV, select column, enter item to check
2. **Output**: Duplicate detection results with similarity scores
3. **Interpretability**: 
   - Similarity distribution histogram
   - TF-IDF feature importance
   - Top matches detailed analysis
   - Word-level comparison


In [6]:
streamlit run streamlit_app.py

SyntaxError: invalid syntax (1817081337.py, line 1)

In [5]:
from gettext import install


pip install Flask

SyntaxError: invalid syntax (3015557580.py, line 4)