## 3. Plot the Statistical Parser for Random Sentences with At Least 10 Words

For statistical parsing, NLTK’s Viterbi Parser or Probabilistic CKY can be used if you have a Probabilistic Context-Free Grammar (PCFG). However, for simplicity, we’ll use NLTK's built-in grammar and parser to show how statistical parsing can be done. Let’s create a basic grammar:

In [None]:
import pandas as pd
# Load the dataset
colnames = ['Sentiment', 'News Headline']
data = pd.read_csv("all-data.csv",  names=colnames, encoding="windows_1258")
data.style
df = data

In [None]:
from nltk import CFG, ChartParser

# Define a simple grammar
grammar = CFG.fromstring("""
  S -> NP VP
  NP -> Det N | Det N PP
  VP -> V NP | VP PP
  PP -> P NP
  Det -> 'the' | 'a'
  N -> 'man' | 'park' | 'bird' | 'dog'
  V -> 'sings' | 'walks'
  P -> 'with' | 'in'
""")

# Create a chart parser with the grammar
parser = ChartParser(grammar)

# Filter sentences with at least 10 words
long_sentences = df[df['News Headline'].apply(lambda x: len(x.split()) >= 10)]

# Select two random sentences with at least 10 words
sentence_1 = random.choice(long_sentences['News Headline'].tolist())
sentence_2 = random.choice(long_sentences['News Headline'].tolist())

# Select two random sentences with at least 10 words
sentence_3 = random.choice(long_sentences['News Headline'].tolist())
sentence_4 = random.choice(long_sentences['News Headline'].tolist())

def preprocess(text):
    sentences = nltk.sent_tokenize(text)                     # sentence segmentation
    sentences = [nltk.word_tokenize(s) for s in sentences]   # word tokenization
    # THIS LINE SEEMS TO BE THE ISSUE
    # sentences = [nltk.pos_tag(s) for s in sentences]         # part-of-speech tagger
    return sentences

# token_3 = preprocess(sentence_3)
# token_4 = preprocess(sentence_4)

# # len(token_3)

# for sentence in token_3:
#     for subsentence in sentence:
#         for tree in parser.parse(subsentence):
#             print(tree)



# Tokenize the sentences
tokens_3 = word_tokenize(sentence_3)
tokens_4 = word_tokenize(sentence_4)

# Parse the sentences and visualize the parse tree
for tree in parser.parse(tokens_3):
    tree.draw()

for tree in parser.parse(tokens_4):
    tree.draw()

print(f"Statistical parsing for sentence 1: {sentence_3}")
print(f"Statistical parsing for sentence 2: {sentence_4}")

# ValueError: Grammar does not cover some of the input words: "'Consumers', 'are', 'now', 'increasingly', 'interested', 'what', 'they', 'can', 'do', 'heir', 'phones', '.'".

In [None]:
Solution 1: Use a Probabilistic Context-Free Grammar (PCFG)
Instead of a manually defined grammar, you can use a probabilistic context-free grammar. NLTK provides a way to load such grammars. The simplest way to use PCFG is by loading an existing grammar from a treebank (e.g., Penn Treebank) that covers a wide variety of words.

Here’s how to adjust the parser to use a probabilistic approach:

First, download the required corpora, and load a PCFG grammar from the nltk_data package.

Replace the custom grammar with the Penn Treebank grammar or a similar pre-trained one.

Example Using NLTK’s Pre-Trained Grammar

In [None]:
from nltk import ViterbiParser
from nltk.grammar import Nonterminal
from nltk.corpus import treebank

# Download the necessary NLTK resources
nltk.download('treebank')

# Load the grammar from the treebank corpus
pcfg_grammar = nltk.induce_pcfg(Nonterminal('S'), treebank.parsed_sents())

# Use Viterbi Parser for probabilistic parsing
viterbi_parser = ViterbiParser(pcfg_grammar)

# Select two random sentences with at least 10 words
sentence_3 = random.choice(long_sentences['News Headline'].tolist())
sentence_4 = random.choice(long_sentences['News Headline'].tolist())

# Tokenize the sentences
tokens_3 = word_tokenize(sentence_3)
tokens_4 = word_tokenize(sentence_4)

# Parse the sentences using Viterbi Parser
for tree in viterbi_parser.parse(tokens_3):
    tree.pretty_print()

for tree in viterbi_parser.parse(tokens_4):
    tree.pretty_print()

print(f"Statistical parsing for sentence 1: {sentence_3}")
print(f"Statistical parsing for sentence 2: {sentence_4}")


In [None]:
import nltk
from nltk.grammar import Nonterminal
from nltk.corpus import treebank
from nltk.grammar import Production, Nonterminal

# Download necessary resources
nltk.download('treebank')

# Extract productions from the treebank parsed sentences
treebank_productions = []
for tree in treebank.parsed_sents():
    treebank_productions += tree.productions()

# --------------------
# Extend grammar to handle proper nouns, numbers, and symbols
custom_productions = [
    Production(Nonterminal('S'), [Nonterminal('NP'), Nonterminal('VP')]),
    Production(Nonterminal('NP'), [Nonterminal('Det'), Nonterminal('N')]),
    Production(Nonterminal('VP'), [Nonterminal('V'), Nonterminal('NP')]),
    Production(Nonterminal('Det'), ['the', 'a']),
    Production(Nonterminal('N'), ['market', 'stocks', 'prices', 'company', 'Amanda', 'Russia', 'USD5', 
                                 'Cramo', 'rental', 'provider', 'Latvian', 'paradoxical']),
    Production(Nonterminal('V'), ['rises', 'plummets', 'soars', 'drops']),
    Production(Nonterminal('NUM'), ['.0', '5', 'm']),
]

# Add these custom productions to the existing productions
all_productions = treebank_productions + custom_productions

# Induce the PCFG with the additional rules
pcfg_grammar = nltk.induce_pcfg(Nonterminal('S'), all_productions)

# Create a nonterminal for the starting symbol 'S'
S = Nonterminal('S')

# # Induce the PCFG from the list of productions
# pcfg_grammar = nltk.induce_pcfg(S, productions)

# Use Viterbi Parser for probabilistic parsing
from nltk.parse import ViterbiParser
viterbi_parser = ViterbiParser(pcfg_grammar)

# Select two random sentences with at least 10 words
import random
sentence_3 = random.choice(df[df['News Headline'].apply(lambda x: len(x.split()) >= 10)]['News Headline'].tolist())
sentence_4 = random.choice(df[df['News Headline'].apply(lambda x: len(x.split()) >= 10)]['News Headline'].tolist())

# Tokenize the sentences
tokens_3 = nltk.word_tokenize(sentence_3)
tokens_4 = nltk.word_tokenize(sentence_4)

# Parse the sentences using Viterbi Parser and display the parse tree
for tree in viterbi_parser.parse(tokens_3):
    tree.pretty_print()

for tree in viterbi_parser.parse(tokens_4):
    tree.pretty_print()


In [None]:
import nltk
from nltk.grammar import Nonterminal
from nltk.corpus import treebank
from nltk.grammar import Production, Nonterminal

# Download necessary resources
nltk.download('treebank')

# Extract productions from the treebank parsed sentences
treebank_productions = []
for tree in treebank.parsed_sents():
    treebank_productions += tree.productions()

# --------------------
# Extend grammar to handle proper nouns, numbers, and symbols
custom_productions = [
    Production(Nonterminal('S'), [Nonterminal('NP'), Nonterminal('VP')]),
    Production(Nonterminal('NP'), [Nonterminal('Det'), Nonterminal('N')]),
    Production(Nonterminal('VP'), [Nonterminal('V'), Nonterminal('NP')]),
    Production(Nonterminal('Det'), ['the', 'a']),
    Production(Nonterminal('N'), ['market', 'stocks', 'prices', 'company', 'Amanda', 'Russia', 'USD5', 
                                 'Cramo', 'rental', 'provider', 'Latvian']),
    Production(Nonterminal('V'), ['rises', 'plummets', 'soars', 'drops']),
    Production(Nonterminal('NUM'), ['.0', '5', 'm']),
]

# Add a rule to handle unknown words (represented as 'UNK')
custom_productions.append(Production(Nonterminal('N'), ['UNK']))

# Identify OOV words and replace them with 'UNK' before parsing
def replace_oov(tokens, known_vocabulary):
    return [token if token in known_vocabulary else 'UNK' for token in tokens]

# Add these custom productions to the existing productions
all_productions = treebank_productions + custom_productions

# Define known vocabulary from grammar
known_vocabulary = set([prod.rhs()[0] for prod in all_productions if isinstance(prod.rhs()[0], str)])

# Create a nonterminal for the starting symbol 'S'
S = Nonterminal('S')

# # Induce the PCFG with the additional rules from the list of productions
pcfg_grammar = nltk.induce_pcfg(S, productions)

# Use Viterbi Parser for probabilistic parsing
from nltk.parse import ViterbiParser
viterbi_parser = ViterbiParser(pcfg_grammar)

# Select two random sentences with at least 10 words
import random
sentence_3 = random.choice(df[df['News Headline'].apply(lambda x: len(x.split()) >= 10)]['News Headline'].tolist())
sentence_4 = random.choice(df[df['News Headline'].apply(lambda x: len(x.split()) >= 10)]['News Headline'].tolist())

# Tokenize the sentences
tokens_3 = nltk.word_tokenize(sentence_3)
tokens_4 = nltk.word_tokenize(sentence_4)

# Replace OOV words in tokens_3 and tokens_4
tokens_3 = replace_oov(tokens_3, known_vocabulary)
tokens_4 = replace_oov(tokens_4, known_vocabulary)

# Parse the sentences using Viterbi Parser and display the parse tree
for tree in viterbi_parser.parse(tokens_3):
    tree.pretty_print()

for tree in viterbi_parser.parse(tokens_4):
    tree.pretty_print()

# print (tokens_3)
# print (tokens_4)

Solution 2: Use an Existing Parser from the Stanford Parser
Another solution is to use an external parser like the Stanford Parser, which handles statistical parsing for a wide variety of sentences without needing custom grammar definitions. The Stanford Parser integrates well with NLTK and is trained on large datasets like the Penn Treebank.

In [None]:
# Install Stanford Parser with NLTK
nltk.download('stanford-parser')
nltk.download('stanford-parser-3.9.2-models')


In [None]:
from nltk.parse.stanford import StanfordParser

filepath = 'C:/Users/usha_/stanford-parser-4.2.0/stanford-parser-full-2020-11-17/'
# Specify the path to the jar file for Stanford Parser
path_to_jar = filepath + 'stanford-parser.jar'
# path_to_models_jar = 'stanford-parser-3.9.2-models.jar'
path_to_models_jar = filepath + 'stanford-parser-4.2.0-models.jar'

# Create Stanford Parser object
stanford_parser = StanfordParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

# Parse the sentences using Stanford Parser
result_1 = list(stanford_parser.raw_parse(sentence_3))
result_2 = list(stanford_parser.raw_parse(sentence_4))

# Display the results as trees
for tree in result_1:
    tree.pretty_print()

for tree in result_2:
    tree.pretty_print()


In [None]:
#  Errors out 
# 3. Plot the Statistical parser for any two random sentences with at least 10 words
# For statistical parsing, we can use the nltk library. 
import nltk
from nltk.parse import CoreNLPParser

# Filter sentences with at least 10 words
long_sentences = [sent for sent in data['News Headline'] if len(sent.split()) >= 10]

# Select two random sentences
random_sentences = random.sample(long_sentences, 2)

# Initialize the CoreNLP parser
parser = CoreNLPParser(url='https://localhost:9000')

# Plot statistical parser for the same random sentences
for sentence in random_sentences:
    parse = next(parser.raw_parse(sentence))
    tree = nltk.Tree.fromstring(str(parse))
    tree.pretty_print()
    tree.draw()


In [None]:
import nltk
from nltk import CFG
from nltk.parse import ViterbiParser
from nltk.grammar import PCFG

# Define a probabilistic context-free grammar (PCFG)
pcfg = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.8] | 'John' [0.2]
    VP -> V NP [1.0]
    Det -> 'the' [0.6] | 'a' [0.4]
    N -> 'cat' [0.5] | 'dog' [0.5]
    V -> 'chased' [0.7] | 'saw' [0.3]
""")

# Instantiate the Viterbi parser using the PCFG
viterbi_parser = ViterbiParser(pcfg)

# Parse a sentence
sentence = ['John', 'saw', 'the', 'dog']
for tree in viterbi_parser.parse(sentence):
    print(tree)
    tree.pretty_print()
    tree.draw()


##  Implementing Probabilistic CKY Parser

In [None]:
import nltk
from nltk import CFG, PCFG
from nltk.parse import pchart

# Define a PCFG in CNF
pcfg = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.6] | 'John' [0.4]
    VP -> V NP [0.7] | V [0.3]
    Det -> 'the' [0.8] | 'a' [0.2]
    N -> 'cat' [0.5] | 'dog' [0.5]
    V -> 'chased' [0.7] | 'saw' [0.3]
""")

# CKY (Probabilistic Chart) parser from NLTK
parser = pchart.InsideChartParser(pcfg)

# Sentence to parse
# sentence = ['John', 'saw', 'the', 'animal', 'and', 'chased', 'it']
sentence = ['John', 'saw', 'the', 'dog']

# Parse the sentence using the CKY algorithm
for tree in parser.parse(sentence):
    print(tree)
    tree.pretty_print()
    tree.draw()


(S (NP John) (VP (V saw) (NP (Det the) (N dog)))) (p=0.02016)
          S             
  ________|___           
 |            VP        
 |     _______|___       
 |    |           NP    
 |    |        ___|___   
 NP   V      Det      N 
 |    |       |       |  
John saw     the     dog



In [None]:
#  Works but DONT NEED THIS 

import spacy

# Load the pre-trained statistical model from spaCy
nlp = spacy.load("en_core_web_sm")

# Parse a sentence
doc = nlp("John saw the animal and it was running.")

# Print dependency parse
for token in doc:
    print(f'{token.text} -> {token.dep_} -> {token.head.text}')

# Visualize the dependency tree
spacy.displacy.serve(doc, style="dep")


John -> nsubj -> saw
saw -> ROOT -> saw
the -> det -> animal
animal -> dobj -> saw
and -> cc -> saw
it -> nsubj -> running
was -> aux -> running
running -> conj -> saw
. -> punct -> running





Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



### 2. Learning a Grammar from Data

If you still want to generate a grammar based on the structure of your text data, there are methods to automatically extract a grammar from annotated corpora. You can train a PCFG or a dependency grammar from a dataset that has syntactic annotations, such as the Penn Treebank. The process involves learning how rules (e.g., subject-verb-object relationships) appear in text.

Steps to Learn a Grammar:
Annotated Corpora:

Use corpora like the Penn Treebank, Universal Dependencies, or other treebanks where sentences are annotated with syntactic structures.
Inducing PCFG:

You can extract a PCFG by counting occurrences of production rules in a corpus. For example, if S -> NP VP occurs frequently, it will have a high probability.
Example in Python (Learning a Grammar from a Corpus):
Train a PCFG: Use NLTK's support for treebank parsing, which allows you to extract a PCFG from a parsed corpus.

In [None]:

import nltk
from nltk.corpus import treebank
from nltk.grammar import induce_pcfg, Nonterminal

# Load the treebank data
nltk.download('treebank')

# Get the productions (rules) from the Penn Treebank
productions = []
for tree in treebank.parsed_sents():
    productions += tree.productions()

# Define the start symbol
S = Nonterminal('S')

# Induce a PCFG from the productions
pcfg = induce_pcfg(S, productions)

# Print some rules with probabilities
for production in pcfg.productions()[:10]:
    print(production)


5. Using PCFG from Large Corpora
If you want to avoid writing a grammar yourself, there are ready-to-use grammars derived from large corpora. For example:

Penn Treebank Grammar: You can directly use PCFG grammars derived from corpora like the Penn Treebank.
In NLTK, you can use corpora like the Brown Corpus or Penn Treebank that have pre-built grammars.

In [None]:
import nltk
from nltk.corpus import treebank

# Load the treebank corpus
nltk.download('treebank')

# Get parsed sentences
parsed_sents = treebank.parsed_sents()

# Print the first parsed sentence
print(parsed_sents[0])
parsed_sents[0].draw()  # Visualize the parse tree


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\usha_\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


In [8]:
!pip install ipywidgets

Defaulting to user installation because normal site-packages is not writeable


In [14]:
# JupyterLab widgets extension 
!pip install jupyterlab_widgets

Defaulting to user installation because normal site-packages is not writeable


In [16]:
# Enable the extension
jupyter labextension install @jupyter-widgets/jupyterlab-manager

SyntaxError: invalid syntax (602586952.py, line 2)

In [18]:
!pip install widgetsnbextension

Defaulting to user installation because normal site-packages is not writeable


In [28]:
jupyter nbextension enable --py widgetsnbextension

SyntaxError: invalid syntax (2269694994.py, line 1)

In [None]:
!pip install --upgrade matplotlib networkx

In [34]:
import stanza
import pandas as pd
import random
import matplotlib.pyplot as plt
import networkx as nx
# download_method=DownloadMethod.REUSE_RESOURCES

# Download and load the English models for Stanza
stanza.download('en')
nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,depparse')

import pandas as pd
# Load the dataset
colnames = ['Sentiment', 'News Headline']
data = pd.read_csv("all-data.csv",  names=colnames, encoding="windows_1258")
df = data
# Filter sentences with at least 10 words
long_sentences = df[df['News Headline'].str.split().str.len() >= 10]['News Headline']

# Randomly select two sentences
random_sentences = random.sample(long_sentences.tolist(), 2)

def plot_dependency_tree(sentence, index):
    # Parse sentence
    doc = nlp(sentence)
    
    # Create a directed graph
    G = nx.DiGraph()

    # Add nodes and edges
    for sent in doc.sentences:
        for word in sent.words:
            G.add_node(word.id, label=word.text)
            if word.head > 0:
                G.add_edge(word.head, word.id, label=word.deprel)

    # Draw the graph
    pos = nx.spring_layout(G)
    plt.figure(figsize=(12, 8))
    nx.draw(G, pos, with_labels=True, labels=nx.get_node_attributes(G, 'label'),
            node_size=3000, node_color='skyblue', font_size=10, font_weight='bold',
            arrows=True, arrowsize=20, edge_color='gray')
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red')
    plt.title(f'Sentence {index+1} Dependency Tree')
    plt.show()

# Plot dependency trees for the randomly selected sentences
for i, sentence in enumerate(random_sentences):
    plot_dependency_tree(sentence, i+1)


NameError: name 'DownloadMethod' is not defined

In [None]:
"C:\Users\usha_\stanford-parser-4.2.0\stanford-parser-full-2020-11-17"

stanford-parser.jar
stanford-parser-3.9.2-models.jar

ref: https://nlp.stanford.edu/software/lex-parser.html#Download


Using statistical parsing 

Description of Data:
This dataset (FinancialPhraseBank) contains the sentiments for financial news headlines from the perspective of a retail investor. The dataset contains two columns, "Sentiment" and "News Headline". The sentiment can be negative, neutral or positive.

Task : Implementing Parsing Algorithms
Plot the Statistical parser for any two random sentences from the entire corpus/dataset that has at least 10 words in the sentence. Make sure that Statistical parser looks good and should visually understandable.



In [4]:
import pandas as pd
# Load the dataset
colnames = ['Sentiment', 'News Headline']
data = pd.read_csv("all-data.csv",  names=colnames, encoding="windows_1258")
data.style
# df = data

Unnamed: 0,Sentiment,News Headline
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said ."
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported ."
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales ."
5,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing its growth strategy by increasingly focusing on technologically more demanding HDI printed circuit boards PCBs .
6,positive,"For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m ."
7,positive,"In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn ."
8,positive,Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing 7.7 % of net sales .
9,positive,"Operating profit totalled EUR 21.1 mn , up from EUR 18.6 mn in 2007 , representing 9.7 % of net sales ."


In [None]:
import random
import pandas as pd
import nltk
from nltk.parse.stanford import StanfordParser
from nltk.tree import Tree
import os

os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk-22/'
from IPython.core.interactiveshell import InteractiveShell

# Increase the output display limit
InteractiveShell.ast_node_interactivity = "all"

# Filter out sentences with at least 10 words
data['Word Count'] = data['News Headline'].apply(lambda x: len(str(x).split()))
long_sentences = data[data['Word Count'] >= 10]

# Select two random sentences with at least 10 words
selected_sentences = long_sentences.sample(2)['News Headline'].tolist()
print (f"The selected sentences are:\n{selected_sentences}")

# Set up the Stanford Parser (make sure the environment variables are set correctly)
# Path to Stanford parser .jar files
stanford_parser_dir = "C:/stanford-nlp-resources/stanford-parser-4.2.0/stanford-parser-full-2020-11-17/"
parser = StanfordParser(
    path_to_models_jar=f'{stanford_parser_dir}stanford-parser-4.2.0-models.jar',
    path_to_jar=f'{stanford_parser_dir}stanford-parser.jar'
)

# Parse the sentences and visualize
for sentence in selected_sentences:
    parse_tree = list(parser.raw_parse(sentence))
    # Display the parse tree
    for tree in parse_tree:
        tree.pretty_print()  # Print the tree structure
        tree.draw()  # This will open a window to visualize the parse tree
        # Generate a DOT representation of the tree
        dot_string = nltk.draw.util.tree_to_dot(tree)
        # Save the tree to a DOT file
        with open("parse_tree.dot", "w") as f:
            f.write(dot_string.to_string())

['The lay-offs will start gradually and will last from one to six weeks .', "In the second quarter of 2009 , net sales through operator business partners represented 47 % of the Group 's total net sales ."]


Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.
  parser = StanfordParser(


                                           ROOT                                             
                                            |                                                
                                            S                                               
      ______________________________________|_____________________________________________   
     |                                      VP                                            | 
     |                    __________________|_________                                    |  
     |                   |                  |         VP                                  | 
     |                   |                  |     ____|_____________                      |  
     |                   VP                 |    |                  VP                    | 
     |              _____|____              |    |     _____________|_______              |  
     |             |          VP            |    |    |         P

In [None]:
import matplotlib.pyplot as plt
from nltk.draw.tree import TreeWidget
from nltk.draw.util import CanvasFrame

# Create a CanvasFrame for the tree
cf = CanvasFrame()
tc = TreeWidget(cf.canvas(), tree)
cf.add_widget(tc)

# Save the visualization
cf.print_to_file('parse_tree.ps')

# Convert PS to PNG using matplotlib
from matplotlib import pyplot as plt
import matplotlib.image as mpimg

# Display the tree in PNG format
img = mpimg.imread('parse_tree.png')
plt.imshow(img)
plt.axis('off')  # Hide axes
plt.show()


In [None]:
# Using core NLP 

import random
import pandas as pd
import nltk
from nltk.parse.corenlp import CoreNLPParser
from nltk.tree import Tree

# Load your dataset
# data = pd.read_csv('FinancialPhraseBank.csv')  # Replace with actual path

# Filter sentences with at least 10 words
data['Word Count'] = data['News Headline'].apply(lambda x: len(str(x).split()))
long_sentences = data[data['Word Count'] >= 10]

# Select two random sentences with at least 10 words
selected_sentences = long_sentences.sample(2)['News Headline'].tolist()

# Set up the CoreNLPParser to use the running Stanford CoreNLP server
parser = CoreNLPParser(url='http://localhost:9000')

# Parse and visualize each selected sentence
for sentence in selected_sentences:
    parse_tree = next(parser.raw_parse(sentence))  # Parse the sentence
    # Display the parse tree
    parse_tree.pretty_print()  # Print the tree structure
    parse_tree.draw()  # This will open a window to visualize the parse tree
