In [None]:
!pip install python-docx --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import json
import os
import time
from collections import Counter
from typing import Dict, List, Optional

import pandas as pd
from openai import OpenAI

In [None]:
# SECURITY: the API key must never be written into the notebook/source.
# A key that was previously committed here has to be treated as compromised
# and revoked with the provider. The key is now taken from the environment,
# and only requested interactively (without echo) when it is not already set.
if not os.environ.get("DEEPSEEK_API_KEY"):
    import getpass

    os.environ["DEEPSEEK_API_KEY"] = getpass.getpass("DeepSeek API key: ")

In [None]:
class DiscourseAnalyzer:
    """Analyzes academic abstracts to identify research discourses.

    Each abstract is sent to the DeepSeek chat endpoint (through the
    OpenAI-compatible client) with a prompt asking for a JSON description of
    its discourse. The per-article answers can then be aggregated into a
    single narrative overview.
    """

    # Columns added by ``analyze_batch``, in output order.
    _RESULT_COLUMNS = (
        'primary_discourse',
        'secondary_discourses',
        'theoretical_perspectives',
        'research_context',
        'key_concepts',
        'methodological_approach',
    )
    # Subset of the result columns whose values are stored as JSON-encoded lists.
    _LIST_COLUMNS = (
        'secondary_discourses',
        'theoretical_perspectives',
        'key_concepts',
    )

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the analyzer with DeepSeek API

        Args:
            api_key: DeepSeek API key (if None, reads from DEEPSEEK_API_KEY env variable)

        Raises:
            ValueError: If no key is passed and the environment variable is unset or empty.
        """
        self.api_key = api_key or os.environ.get('DEEPSEEK_API_KEY')
        if not self.api_key:
            raise ValueError("API key must be provided or set in DEEPSEEK_API_KEY environment variable")

        self.client = OpenAI(
            api_key=self.api_key,
            base_url="https://api.deepseek.com"
        )
        self.model = "deepseek-chat"

    @staticmethod
    def _parse_json_object(response_text) -> Dict:
        """Extract and decode the JSON object contained in a model reply.

        Args:
            response_text: Raw message content returned by the model.

        Returns:
            The decoded JSON object as a dictionary.

        Raises:
            ValueError: If the reply is empty, is not valid JSON, or decodes
                to something other than a JSON object.
        """
        if not response_text:
            raise ValueError("empty response from model")

        payload = response_text
        # The model sometimes wraps its answer in a markdown code fence even
        # though the prompt asks for bare JSON; keep only the fenced content.
        if "```json" in payload:
            payload = payload.split("```json")[1].split("```")[0]
        elif "```" in payload:
            payload = payload.split("```")[1]

        parsed = json.loads(payload.strip())
        # Callers use ``.get`` on the result, so anything but an object
        # (e.g. a top-level list or string) is treated as a failed analysis.
        if not isinstance(parsed, dict):
            raise ValueError(
                f"expected a JSON object, got {type(parsed).__name__}"
            )
        return parsed

    @staticmethod
    def _collect_list_items(series) -> List[str]:
        """Flatten a column of JSON-encoded lists into one list of strings.

        Cells that are missing, are not valid JSON, or do not decode to a list
        are skipped. Only string entries are kept so the result can always be
        counted with ``Counter``.

        Args:
            series: Column whose non-null cells hold JSON-encoded lists.

        Returns:
            All string entries found, in column order.
        """
        items = []
        for raw in series.dropna():
            try:
                decoded = json.loads(raw)
            except (TypeError, ValueError):
                # Not a JSON string; this cell contributes nothing.
                continue
            if isinstance(decoded, list):
                items.extend(entry for entry in decoded if isinstance(entry, str))
        return items

    def analyze_single_abstract(self, abstract: str, title: str = "") -> Optional[Dict]:
        """
        Analyze a single abstract to identify its discourse themes

        Args:
            abstract: The abstract text
            title: Optional article title for context

        Returns:
            Dictionary with discourse analysis results, or None when the API
            call fails or the reply cannot be decoded into a JSON object
            (the error is printed).
        """
        prompt = f"""Analyze the following academic article abstract from the Journal of Business Venturing.

Title: {title}
Abstract: {abstract}

Please identify and categorize the key research discourse(s) in this abstract. Provide your analysis in JSON format with the following structure:

{{
    "primary_discourse": "Main research discourse/theme",
    "secondary_discourses": ["Additional discourse 1", "Additional discourse 2"],
    "theoretical_perspectives": ["Theoretical lens 1", "Theoretical lens 2"],
    "research_context": "Brief description of the research setting/context",
    "key_concepts": ["Concept 1", "Concept 2", "Concept 3"],
    "methodological_approach": "Research methodology type"
}}

Be specific and use established academic terminology. Focus on identifying:
- The main theoretical conversation this research contributes to
- Key concepts and constructs being examined
- The broader academic discourse(s) this work engages with

Respond ONLY with the JSON object, no additional text."""

        # Deliberately broad: one failed article (network error, API error,
        # malformed reply) must not abort a long batch run.
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=2000
            )
            return self._parse_json_object(response.choices[0].message.content)

        except Exception as e:
            print(f"Error analyzing abstract: {e}")
            return None

    def analyze_batch(self, df: pd.DataFrame, abstract_column: str = 'abstract',
                     title_column: str = 'title', sample_size: int = None,
                     delay: float = 1.0) -> pd.DataFrame:
        """
        Analyze a batch of abstracts

        Args:
            df: DataFrame containing the articles
            abstract_column: Name of the column containing abstracts
            title_column: Name of the column containing titles; when the
                column does not exist an empty title is used
            sample_size: Optional number of articles to analyze (None = all)
            delay: Delay in seconds between API calls to avoid rate limits

        Returns:
            Copy of the (optionally sampled) DataFrame with the analysis
            columns added. Rows that were skipped or failed hold None there.

        Raises:
            KeyError: If ``abstract_column`` is not a column of ``df``.
        """
        # Work with a copy so the caller's frame is never modified
        df_analysis = df.copy()

        # Sample if requested
        if sample_size and sample_size < len(df_analysis):
            df_analysis = df_analysis.sample(n=sample_size, random_state=42)

        total = len(df_analysis)
        abstracts = df_analysis[abstract_column].tolist()
        if title_column in df_analysis.columns:
            titles = df_analysis[title_column].tolist()
        else:
            titles = [""] * total

        # Results are collected by row position rather than written back by
        # index label: after sampling the labels are neither contiguous nor
        # ordered, and labels may be non-integer or duplicated.
        collected = {column: [None] * total for column in self._RESULT_COLUMNS}

        for position, (abstract, title) in enumerate(zip(abstracts, titles)):
            print(f"Analyzing article {position + 1}/{total}...")

            # Skip if abstract is missing
            if pd.isna(abstract) or not abstract:
                print(f"  Skipping - no abstract")
                continue

            # A missing title would otherwise reach the prompt as "nan"
            if pd.isna(title):
                title = ""

            result = self.analyze_single_abstract(abstract, title)

            if result:
                for column in self._RESULT_COLUMNS:
                    if column in self._LIST_COLUMNS:
                        # Lists are stored as JSON text so they survive a CSV round trip
                        collected[column][position] = json.dumps(result.get(column, []))
                    else:
                        collected[column][position] = result.get(column, '')

            # Rate limiting
            time.sleep(delay)

        for column in self._RESULT_COLUMNS:
            df_analysis[column] = collected[column]

        return df_analysis

    def generate_comprehensive_overview(self, df_analyzed: pd.DataFrame) -> Optional[str]:
        """
        Generate a comprehensive overview of all discourses identified

        Args:
            df_analyzed: DataFrame with analysis results (as returned by
                ``analyze_batch``)

        Returns:
            Comprehensive overview text, or None when the API call fails
            (the error is printed).
        """
        # Collect all discourses
        all_primary = df_analyzed['primary_discourse'].dropna().tolist()
        all_secondary = self._collect_list_items(df_analyzed['secondary_discourses'])
        all_theoretical = self._collect_list_items(df_analyzed['theoretical_perspectives'])
        all_concepts = self._collect_list_items(df_analyzed['key_concepts'])
        all_methods = df_analyzed['methodological_approach'].dropna().tolist()

        # Only the most frequent entries of each category are sent, which
        # keeps the prompt bounded regardless of how many articles were analyzed.
        prompt = f"""Based on the analysis of {len(df_analyzed)} articles from the Journal of Business Venturing (2023-present),
I have identified the following research elements:

Primary Discourses (main themes):
{Counter(all_primary).most_common(20)}

Secondary Discourses:
{Counter(all_secondary).most_common(20)}

Theoretical Perspectives:
{Counter(all_theoretical).most_common(20)}

Key Concepts:
{Counter(all_concepts).most_common(30)}

Methodological Approaches:
{Counter(all_methods).most_common(15)}

Please provide a comprehensive overview (1000-1500 words) of the research discourses being discussed in the Journal of Business Venturing.
Your overview should:

1. Identify the major thematic clusters and research streams
2. Discuss how these discourses relate to each other
3. Highlight any emerging trends or shifts in focus
4. Note the dominant theoretical perspectives being employed
5. Discuss the methodological diversity (or lack thereof)
6. Provide insights into the current state and future directions of business venturing research

Write this as an academic overview suitable for a literature review or research proposal."""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                temperature=0.5,
                max_tokens=4000
            )

            return response.choices[0].message.content

        except Exception as e:
            print(f"Error generating overview: {e}")
            return None

In [None]:
def main():
    """Run the full pipeline: load the CSV, analyze every abstract, save results.

    Reads the article export named by ``CSV_FILE``, runs the per-article
    discourse analysis, writes the enriched table to ``OUTPUT_FILE`` and, when
    the overview could be generated, writes it to ``OVERVIEW_FILE`` and prints it.

    Raises:
        KeyError: If the configured abstract column is not present in the CSV.
    """

    # Configuration
    CSV_FILE = "jbv_papers.csv"  # Replace with your CSV filename
    ABSTRACT_COLUMN = "Abstract"  # Replace with your abstract column name
    # Column names are case-sensitive: the export's header is "Title", and a
    # non-matching name makes every article be analyzed without its title.
    TITLE_COLUMN = "Title"  # Replace with your title column name
    SAMPLE_SIZE = None  # Set to a number to analyze only a sample, or None for all
    OUTPUT_FILE = "discourse_analysis_results.csv"
    OVERVIEW_FILE = "discourse_overview.txt"

    print("=== Journal Article Discourse Analysis ===\n")

    # Initialize analyzer
    print("Initializing analyzer...")
    analyzer = DiscourseAnalyzer()

    # Load data
    print(f"Loading data from {CSV_FILE}...")
    df = pd.read_csv(CSV_FILE)
    print(f"Loaded {len(df)} articles\n")

    # Display column names to help user verify
    print("Available columns:", df.columns.tolist())
    print()

    # Validate the configuration before spending any API calls
    if ABSTRACT_COLUMN not in df.columns:
        raise KeyError(
            f"Abstract column '{ABSTRACT_COLUMN}' not found in {CSV_FILE}"
        )
    if TITLE_COLUMN not in df.columns:
        print(f"Warning: title column '{TITLE_COLUMN}' not found; "
              "abstracts will be analyzed without titles\n")

    # Analyze articles
    print("Starting analysis...")
    df_results = analyzer.analyze_batch(
        df,
        abstract_column=ABSTRACT_COLUMN,
        title_column=TITLE_COLUMN,
        sample_size=SAMPLE_SIZE,
        delay=1.0  # Adjust delay as needed for rate limits
    )

    # Save detailed results
    print(f"\nSaving detailed results to {OUTPUT_FILE}...")
    df_results.to_csv(OUTPUT_FILE, index=False)

    # Generate comprehensive overview
    print("Generating comprehensive overview...")
    overview = analyzer.generate_comprehensive_overview(df_results)

    if overview:
        print(f"Saving overview to {OVERVIEW_FILE}...")
        with open(OVERVIEW_FILE, 'w', encoding='utf-8') as f:
            f.write(overview)

        print("\n" + "="*70)
        print("COMPREHENSIVE OVERVIEW")
        print("="*70 + "\n")
        print(overview)

    print("\n=== Analysis Complete ===")
    print(f"Detailed results saved to: {OUTPUT_FILE}")
    # Only report the overview file when it was actually written
    if overview:
        print(f"Overview saved to: {OVERVIEW_FILE}")
    else:
        print("Overview could not be generated; no overview file was written")

if __name__ == "__main__":
    main()

=== Journal Article Discourse Analysis ===

Initializing analyzer...
Loading data from jbv_papers.csv...
Loaded 382 articles

Available columns: ['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year', 'Source title', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end', 'Cited by', 'DOI', 'Link', 'Affiliations', 'Authors with affiliations', 'Abstract', 'Author Keywords', 'Document Type', 'Source']

Starting analysis...
Analyzing article 1/382...
Analyzing article 2/382...
Analyzing article 3/382...
Analyzing article 4/382...
Analyzing article 5/382...
Analyzing article 6/382...
Analyzing article 7/382...
Analyzing article 8/382...
Analyzing article 9/382...
Analyzing article 10/382...
Analyzing article 11/382...
Analyzing article 12/382...
Analyzing article 13/382...
Analyzing article 14/382...
Analyzing article 15/382...
Analyzing article 16/382...
Analyzing article 17/382...
Analyzing article 18/382...
Analyzing article 19/382...
Analyzing article 20/382...
Analyzing art