In [1]:
import nltk
import spacy
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.util import ngrams
import re
import warnings

warnings.filterwarnings('ignore')

# NLTK resources this notebook relies on: (lookup path, download package id).
# NOTE(review): newer NLTK releases may additionally expect 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' -- confirm against the installed version.
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    ('corpora/wordnet', 'wordnet'),
    ('chunkers/maxent_ne_chunker', 'maxent_ne_chunker'),
    ('corpora/words', 'words'),
)


def _ensure_nltk_resource(resource_path, package_id):
    """Download ``package_id`` unless ``resource_path`` is already installed.

    Both the plain path and its ``.zip`` form are probed before downloading.
    Some corpora (wordnet in particular) can be installed only as a zip
    archive, in which case looking up the plain path fails and a download
    would be re-triggered on every run even though the data is present.
    """
    for candidate in (resource_path, resource_path + '.zip'):
        try:
            nltk.data.find(candidate)
            return
        except LookupError:
            continue
    nltk.download(package_id)


# Download required NLTK data
for _resource_path, _package_id in _NLTK_RESOURCES:
    _ensure_nltk_resource(_resource_path, _package_id)

# Load spaCy model; `nlp` stays None when the model is missing so that the
# spaCy-dependent parts of the analysis can be skipped.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Please install spaCy English model: python -m spacy download en_core_web_sm")
    nlp = None



[nltk_data] Downloading package wordnet to /home/pat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:

class LinguisticAnalyzer:
    """Main class for analyzing linguistic evolution across novels."""
    
    def __init__(self):
        """Create the NLP helpers, the book catalogue and empty result slots."""
        # Shared text-processing helpers.
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.vader = SentimentIntensityAnalyzer()
        self.stop_words = set(stopwords.words('english'))

        # Book metadata: (title, publication year, thematic labels).
        catalogue = (
            ('Tristram Shandy', 1759, ['psychological', 'fragmented', 'narrative']),
            ('The Scarlet Letter', 1850, ['religious', 'moral', 'puritan']),
            ('Sister Carrie', 1900, ['urban', 'industrial', 'naturalism']),
            ('The Martian Chronicles', 1950, ['futuristic', 'colonial', 'technology']),
            ('White Teeth', 2000, ['multicultural', 'identity', 'modern']),
            ('The Vanishing Half', 2020, ['race', 'identity', 'contemporary']),
        )
        self.books_data = {
            title: {'year': year, 'themes': themes}
            for title, year, themes in catalogue
        }

        # Keywords whose usage is tracked across the books.
        self.target_keywords = [
            'name', 'race', 'self', 'color', 'double',
            'sin', 'shame', 'virtue', 'repent',
            'rocket', 'earth', 'mars', 'colonize', 'soil',
            'vanish', 'gone', 'half', 'missing',
            'woman', 'freedom',
        ]

        # Filled in by analyze_book() and calculate_tfidf() respectively.
        self.analyzed_books = {}
        self.tfidf_matrix = None
        self.feature_names = None
        
    def preprocess_text(self, text):
        """Normalise raw text and return its content-word tokens.

        The text is lower-cased, every character that is neither a word
        character nor whitespace is replaced by a space, and whitespace runs
        are collapsed. The result is tokenized; stopwords and tokens of two
        characters or fewer are discarded.

        Args:
            text: Raw input string.

        Returns:
            list[str]: The surviving tokens, in document order.
        """
        lowered = text.lower()
        without_punct = re.sub(r'[^\w\s]', ' ', lowered)
        normalised = re.sub(r'\s+', ' ', without_punct).strip()

        kept = []
        for candidate in word_tokenize(normalised):
            # Skip function words and very short fragments.
            if candidate in self.stop_words or len(candidate) <= 2:
                continue
            kept.append(candidate)
        return kept
    
    def extract_linguistic_features(self, text, book_title):
        """Extract token-, POS-, lemma- and n-gram-level features from a book.

        Args:
            text: Raw text of the book.
            book_title: Key into ``self.books_data``; supplies the year.

        Returns:
            dict with keys ``title``, ``year``, ``word_tokens``,
            ``named_entities``, ``pos_tags``, ``detailed_pos``, ``lemmas``,
            ``stems``, ``bigrams``, ``trigrams``, ``tense_analysis`` and
            ``morphological_table``. ``detailed_pos`` and ``named_entities``
            stay empty when the module-level spaCy model ``nlp`` is unavailable.

        Raises:
            KeyError: if ``book_title`` is not present in ``self.books_data``.
        """
        # First letter of a Penn Treebank tag -> WordNet POS code. Without this
        # the lemmatizer treats every word as a noun, so inflected verbs and
        # adjectives ("colonized", "vanished") are never reduced to their lemma.
        wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

        features = {
            'title': book_title,
            'year': self.books_data[book_title]['year'],
            'word_tokens': [],
            'named_entities': [],
            'pos_tags': [],
            'detailed_pos': [],
            'lemmas': [],
            'stems': [],
            'bigrams': [],
            'trigrams': [],
            'tense_analysis': {},
            'morphological_table': []
        }

        # Preprocess
        tokens = self.preprocess_text(text)
        features['word_tokens'] = tokens

        # POS tagging
        pos_tags = pos_tag(tokens)
        features['pos_tags'] = pos_tags

        # Detailed linguistic analysis with spaCy
        if nlp:
            doc = nlp(text[:1000000])  # Limit for memory efficiency
            for token in doc:
                if not token.is_stop and not token.is_punct and len(token.text) > 2:
                    features['detailed_pos'].append({
                        'text': token.text.lower(),
                        'lemma': token.lemma_,
                        'pos': token.pos_,
                        'tag': token.tag_,
                        'dep': token.dep_,
                        'is_alpha': token.is_alpha
                    })

            # Named entities
            features['named_entities'] = [(ent.text, ent.label_) for ent in doc.ents]

        # Lemmatization and stemming
        for word, pos in pos_tags:
            lowered = word.lower()
            # Fall back to noun, the lemmatizer's own default, for other tags.
            wordnet_pos = wordnet_pos_by_prefix.get(pos[:1], 'n')
            lemma = self.lemmatizer.lemmatize(lowered, pos=wordnet_pos)
            stem = self.stemmer.stem(lowered)
            features['lemmas'].append(lemma)
            features['stems'].append(stem)

            # Create morphological table entry
            features['morphological_table'].append({
                'word': lowered,
                'lemma': lemma,
                'stem': stem,
                'pos': pos,
                'morphemes': self._analyze_morphemes(word)
            })

        # N-grams
        features['bigrams'] = list(ngrams(tokens, 2))
        features['trigrams'] = list(ngrams(tokens, 3))

        # Tense analysis
        features['tense_analysis'] = self._analyze_tense(pos_tags)

        return features
    
    def _analyze_morphemes(self, word):
        """Simple morpheme analysis."""
        morphemes = []
        if word.endswith('ing'):
            morphemes.append('progressive')
        if word.endswith('ed'):
            morphemes.append('past')
        if word.endswith('s') and not word.endswith('ss'):
            morphemes.append('plural/3rd_person')
        return morphemes
    
    def _analyze_tense(self, pos_tags):
        """Analyze tense distribution."""
        tense_counts = defaultdict(int)
        for word, pos in pos_tags:
            if pos.startswith('VB'):
                if pos == 'VBD':
                    tense_counts['past'] += 1
                elif pos == 'VBG':
                    tense_counts['present_progressive'] += 1
                elif pos == 'VBN':
                    tense_counts['past_participle'] += 1
                elif pos == 'VBP' or pos == 'VBZ':
                    tense_counts['present'] += 1
                else:
                    tense_counts['other'] += 1
        return dict(tense_counts)
    
    def sentiment_analysis(self, text, method='vader'):
        """Perform sentiment analysis using VADER or Naive Bayes."""
        if method == 'vader':
            scores = self.vader.polarity_scores(text)
            return {
                'compound': scores['compound'],
                'positive': scores['pos'],
                'negative': scores['neg'],
                'neutral': scores['neu']
            }
        # Note: For Naive Bayes, would need labeled training data
        return None
    
    def calculate_keyword_frequency(self, features, keywords=None):
        """Calculate frequency of specific keywords."""
        if keywords is None:
            keywords = self.target_keywords
            
        keyword_freq = {}
        tokens = features['word_tokens']
        lemmas = features['lemmas']
        
        for keyword in keywords:
            # Count in both tokens and lemmas
            token_count = tokens.count(keyword)
            lemma_count = lemmas.count(keyword)
            total_count = max(token_count, lemma_count)  # Avoid double counting
            
            keyword_freq[keyword] = {
                'frequency': total_count,
                'relative_frequency': total_count / len(tokens) if tokens else 0,
                'contexts': self._extract_contexts(tokens, keyword)
            }
        
        return keyword_freq
    
    def _extract_contexts(self, tokens, keyword, window=5):
        """Extract contexts around keyword occurrences."""
        contexts = []
        for i, token in enumerate(tokens):
            if token == keyword:
                start = max(0, i - window)
                end = min(len(tokens), i + window + 1)
                context = ' '.join(tokens[start:end])
                contexts.append(context)
        return contexts[:5]  # Limit contexts for memory
    
    def calculate_tfidf(self, book_texts):
        """Fit a TF-IDF model over all books and return the score table.

        Side effects: stores the fitted matrix in ``self.tfidf_matrix`` and
        the vocabulary in ``self.feature_names``.

        Args:
            book_texts: Mapping of book title to raw text.

        Returns:
            pandas.DataFrame with one row per book title and one column per
            unigram/bigram feature.
        """
        titles = list(book_texts.keys())
        corpus = list(book_texts.values())

        tfidf_model = TfidfVectorizer(
            ngram_range=(1, 2),
            lowercase=True,
            stop_words='english',
            max_features=5000,
        )
        self.tfidf_matrix = tfidf_model.fit_transform(corpus)
        self.feature_names = tfidf_model.get_feature_names_out()

        dense_scores = self.tfidf_matrix.toarray()
        return pd.DataFrame(dense_scores, index=titles, columns=self.feature_names)
    
    def calculate_inverse_term_frequency(self, tfidf_df):
        """Calculate upward and downward ITF."""
        itf_results = {}
        
        for book in tfidf_df.index:
            book_scores = tfidf_df.loc[book]
            other_books_mean = tfidf_df.drop(book).mean()
            
            # Upward ITF: words more frequent in this book
            upward_itf = book_scores - other_books_mean
            
            # Downward ITF: words less frequent in this book
            downward_itf = other_books_mean - book_scores
            
            itf_results[book] = {
                'upward': upward_itf.nlargest(20).to_dict(),
                'downward': downward_itf.nlargest(20).to_dict()
            }
        
        return itf_results
    
    def analyze_book(self, text, book_title):
        """Complete analysis of a single book."""
        print(f"Analyzing {book_title}...")
        
        # Extract linguistic features
        features = self.extract_linguistic_features(text, book_title)
        
        # Sentiment analysis
        sentiment = self.sentiment_analysis(text)
        
        # Keyword frequency analysis
        keyword_freq = self.calculate_keyword_frequency(features)
        
        # Compile results
        analysis = {
            'features': features,
            'sentiment': sentiment,
            'keyword_frequencies': keyword_freq,
            'year': self.books_data[book_title]['year'],
            'themes': self.books_data[book_title]['themes']
        }
        
        self.analyzed_books[book_title] = analysis
        return analysis
    
    def create_evolution_table(self):
        """Create table showing keyword evolution across time."""
        evolution_data = []
        
        for keyword in self.target_keywords:
            for book_title, analysis in self.analyzed_books.items():
                freq_data = analysis['keyword_frequencies'].get(keyword, {})
                
                evolution_data.append({
                    'keyword': keyword,
                    'book': book_title,
                    'year': analysis['year'],
                    'frequency': freq_data.get('frequency', 0),
                    'relative_frequency': freq_data.get('relative_frequency', 0),
                    'contexts': len(freq_data.get('contexts', [])),
                    'sentiment': analysis['sentiment']['compound'] if analysis['sentiment'] else 0
                })
        
        return pd.DataFrame(evolution_data)
    
    def visualize_keyword_evolution(self, keyword, save_fig=False):
        """Create visualizations for keyword evolution."""
        evolution_df = self.create_evolution_table()
        keyword_data = evolution_df[evolution_df['keyword'] == keyword].sort_values('year')
        
        if keyword_data.empty:
            print(f"No data found for keyword: {keyword}")
            return
        
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        
        # Line graph
        ax1.plot(keyword_data['year'], keyword_data['relative_frequency'], 
                marker='o', linewidth=2, markersize=8)
        ax1.set_title(f'Evolution of "{keyword}" - Relative Frequency Over Time')
        ax1.set_xlabel('Year')
        ax1.set_ylabel('Relative Frequency')
        ax1.grid(True, alpha=0.3)
        
        # Bar graph
        ax2.bar(keyword_data['year'], keyword_data['frequency'], 
               color='skyblue', alpha=0.7)
        ax2.set_title(f'Evolution of "{keyword}" - Absolute Frequency')
        ax2.set_xlabel('Year')
        ax2.set_ylabel('Absolute Frequency')
        
        plt.tight_layout()
        
        if save_fig:
            plt.savefig(f'{keyword}_evolution.png', dpi=300, bbox_inches='tight')
        
        plt.show()
        
        # Print semantic summary
        self._print_keyword_summary(keyword, keyword_data)
    
    def _print_keyword_summary(self, keyword, keyword_data):
        """Print semantic and syntactic summary for keyword."""
        print(f"\n=== SUMMARY FOR '{keyword.upper()}' ===")
        print(f"Time span: {keyword_data['year'].min()} - {keyword_data['year'].max()}")
        print(f"Peak usage: {keyword_data.loc[keyword_data['frequency'].idxmax(), 'book']} "
              f"({keyword_data['frequency'].max()} occurrences)")
        print(f"Average relative frequency: {keyword_data['relative_frequency'].mean():.6f}")
        
        # Context analysis
        for _, row in keyword_data.iterrows():
            if row['frequency'] > 0:
                analysis = self.analyzed_books[row['book']]
                contexts = analysis['keyword_frequencies'][keyword]['contexts']
                if contexts:
                    print(f"\n{row['book']} ({row['year']}) - Sample context:")
                    print(f"  '{contexts[0]}'")
    
    def interactive_word_analyzer(self):
        """Interactive function for user to analyze any word."""
        def analyze_word(word):
            word = word.lower().strip()
            print(f"\nAnalyzing word: '{word}'")
            
            # Check if word exists in any book
            word_data = []
            for book_title, analysis in self.analyzed_books.items():
                tokens = analysis['features']['word_tokens']
                lemmas = analysis['features']['lemmas']
                
                token_count = tokens.count(word)
                lemma_count = lemmas.count(word)
                total_count = max(token_count, lemma_count)
                
                if total_count > 0:
                    word_data.append({
                        'book': book_title,
                        'year': analysis['year'],
                        'frequency': total_count,
                        'relative_frequency': total_count / len(tokens) if tokens else 0
                    })
            
            if not word_data:
                print(f"Word '{word}' not found in any of the analyzed books.")
                return
            
            # Create visualization
            word_df = pd.DataFrame(word_data).sort_values('year')
            
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
            
            # Line graph
            ax1.plot(word_df['year'], word_df['relative_frequency'], 
                    marker='o', linewidth=2, markersize=8, color='red')
            ax1.set_title(f'Evolution of "{word}" - Relative Frequency')
            ax1.set_xlabel('Year')
            ax1.set_ylabel('Relative Frequency')
            ax1.grid(True, alpha=0.3)
            
            # Bar graph
            ax2.bar(word_df['year'], word_df['frequency'], 
                   color='lightcoral', alpha=0.7)
            ax2.set_title(f'Evolution of "{word}" - Absolute Frequency')
            ax2.set_xlabel('Year')
            ax2.set_ylabel('Absolute Frequency')
            
            plt.tight_layout()
            plt.show()
            
            # Print summary
            print(f"\n=== ANALYSIS SUMMARY FOR '{word.upper()}' ===")
            print(f"Found in {len(word_data)} books")
            print(f"Time span: {word_df['year'].min()} - {word_df['year'].max()}")
            print(f"Peak usage: {word_df.loc[word_df['frequency'].idxmax(), 'book']} "
                  f"({word_df['frequency'].max()} occurrences)")
            
            return word_df
        
        return analyze_word
    
    def generate_comprehensive_report(self):
        """Generate a comprehensive analysis report."""
        print("=== COMPREHENSIVE LINGUISTIC EVOLUTION REPORT ===\n")
        
        # Time-ordered analysis
        sorted_books = sorted(self.analyzed_books.items(), 
                            key=lambda x: x[1]['year'])
        
        print("1. CHRONOLOGICAL OVERVIEW:")
        for book_title, analysis in sorted_books:
            print(f"\n{book_title} ({analysis['year']}):")
            print(f"  Themes: {', '.join(analysis['themes'])}")
            print(f"  Sentiment: {analysis['sentiment']['compound']:.3f}")
            print(f"  Total tokens: {len(analysis['features']['word_tokens'])}")
        
        print("\n2. KEYWORD EVOLUTION TRENDS:")
        evolution_df = self.create_evolution_table()
        
        # Find most evolving keywords
        keyword_variance = evolution_df.groupby('keyword')['relative_frequency'].var().sort_values(ascending=False)
        print("\nMost variable keywords across time:")
        for keyword, variance in keyword_variance.head(10).items():
            print(f"  {keyword}: {variance:.8f}")
        
        print("\n3. CULTURAL SHIFT INDICATORS:")
        
        # Technology vs Nature
        tech_words = ['rocket', 'mars', 'colonize']
        nature_words = ['earth', 'soil']
        
        tech_evolution = evolution_df[evolution_df['keyword'].isin(tech_words)].groupby('year')['relative_frequency'].sum()
        nature_evolution = evolution_df[evolution_df['keyword'].isin(nature_words)].groupby('year')['relative_frequency'].sum()
        
        print("\nTechnology vs Nature theme evolution:")
        for year in sorted(tech_evolution.index):
            tech_freq = tech_evolution.get(year, 0)
            nature_freq = nature_evolution.get(year, 0)
            print(f"  {year}: Tech={tech_freq:.6f}, Nature={nature_freq:.6f}")
        
        # Identity and race
        identity_words = ['name', 'race', 'self', 'color', 'double']
        identity_evolution = evolution_df[evolution_df['keyword'].isin(identity_words)].groupby('year')['relative_frequency'].sum()
        
        print("\nIdentity theme evolution:")
        for year in sorted(identity_evolution.index):
            print(f"  {year}: {identity_evolution.get(year, 0):.6f}")
        
        return evolution_df


In [3]:
# Example usage and demonstration
def main():
    """Run the demo: analyze six short sample passages and print guidance.

    Returns:
        tuple: ``(analyzer, word_analyzer)`` - the populated
        ``LinguisticAnalyzer`` and the callable returned by its
        ``interactive_word_analyzer()``.
    """
    # Initialize analyzer
    analyzer = LinguisticAnalyzer()

    # Banner and overview of the corpus
    print("Literary Language Evolution Analysis System")
    print("=========================================")
    print("\nThis system analyzes linguistic evolution across six novels:")
    for title, metadata in analyzer.books_data.items():
        print(f"- {title} ({metadata['year']})")

    keyword_listing = ', '.join(analyzer.target_keywords)
    print(f"\nTarget keywords: {keyword_listing}")

    # Note: In practice, you would load actual book texts here
    loading_hint = (
        "\n[DEMO MODE - In practice, load actual book texts using:]",
        "with open('book.txt', 'r', encoding='utf-8') as f:",
        "    text = f.read()",
        "    analyzer.analyze_book(text, 'Book Title')",
    )
    for hint_line in loading_hint:
        print(hint_line)

    # Create sample data for demonstration
    sample_texts = {
        'Tristram Shandy': "The name of the self is double in nature, as is the color of shame...",
        'The Scarlet Letter': "Sin and virtue repent in equal measure, the name bearing shame...",
        'Sister Carrie': "The woman sought freedom in the city, her name gone from home...",
        'The Martian Chronicles': "The rocket to Mars would colonize the red soil of Earth's neighbor...",
        'White Teeth': "Identity and race color the modern self, names carrying double meaning...",
        'The Vanishing Half': "Half the woman would vanish, her race and color gone missing..."
    }

    # Analyze sample texts
    for book_title in sample_texts:
        analyzer.analyze_book(sample_texts[book_title], book_title)

    # Generate evolution table and TF-IDF
    evolution_df = analyzer.create_evolution_table()
    tfidf_df = analyzer.calculate_tfidf(sample_texts)
    itf_results = analyzer.calculate_inverse_term_frequency(tfidf_df)

    print("\n=== SAMPLE ANALYSIS RESULTS ===")
    print("\nEvolution table (first 10 rows):")
    print(evolution_df.head(10))

    usage_notes = (
        "\n=== INTERACTIVE FEATURES ===",
        "1. Use analyzer.visualize_keyword_evolution('keyword') for any target keyword",
        "2. Use word_analyzer = analyzer.interactive_word_analyzer()",
        "   Then: word_analyzer('your_word') to analyze any word",
        "3. Use analyzer.generate_comprehensive_report() for full analysis",
    )
    for note in usage_notes:
        print(note)

    # Demonstrate interactive analyzer
    word_analyzer = analyzer.interactive_word_analyzer()

    print("\n=== SYSTEM READY ===")
    print("The system is now ready for full-scale analysis with actual book texts.")

    return analyzer, word_analyzer


In [4]:
# Run the demo only when this code executes as the main module.
if __name__ == "__main__":
    # Bind both results at module level so they remain available afterwards.
    analyzer, word_analyzer = main()

Literary Language Evolution Analysis System

This system analyzes linguistic evolution across six novels:
- Tristram Shandy (1759)
- The Scarlet Letter (1850)
- Sister Carrie (1900)
- The Martian Chronicles (1950)
- White Teeth (2000)
- The Vanishing Half (2020)

Target keywords: name, race, self, color, double, sin, shame, virtue, repent, rocket, earth, mars, colonize, soil, vanish, gone, half, missing, woman, freedom

[DEMO MODE - In practice, load actual book texts using:]
with open('book.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    analyzer.analyze_book(text, 'Book Title')
Analyzing Tristram Shandy...
Analyzing The Scarlet Letter...
Analyzing Sister Carrie...
Analyzing The Martian Chronicles...
Analyzing White Teeth...
Analyzing The Vanishing Half...

=== SAMPLE ANALYSIS RESULTS ===

Evolution table (first 10 rows):
  keyword                    book  year  frequency  relative_frequency  \
0    name         Tristram Shandy  1759          1            0.166667   
1    n