In [None]:
import networkx as nx
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from itertools import combinations
from collections import Counter
import string

# Fetch the NLTK data packages the tokenizers and the stop-word list below
# rely on; runs once when the module/cell is executed.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

def text_rank(text, window_size=2, top_n_keywords=10):
    """Extract keywords from ``text`` by running PageRank on a word graph.

    The text is split into sentences and tokens; tokens are lower-cased and
    stop words and punctuation-only tokens are dropped. Remaining words
    become graph nodes, and two different words are linked when they occur
    within ``window_size`` consecutive tokens of the same sentence. Each
    repeated co-occurrence adds 1 to the edge weight. Words are then ranked
    by weighted PageRank score.

    Note: ``window_size`` was previously accepted but ignored (every pair of
    words in a sentence was linked). It now controls the co-occurrence
    window, so results differ from the old full-sentence behaviour.

    Args:
        text: The document to analyse.
        window_size: Number of consecutive tokens (after filtering) that
            count as co-occurring. A window needs at least two tokens to
            form a pair, so values below 2 are treated as 2.
        top_n_keywords: Maximum number of keywords to return.

    Returns:
        A list of at most ``top_n_keywords`` lower-cased words, ordered from
        highest to lowest score. Empty when no word pairs were found.
    """
    # Build the lookup sets once instead of re-reading the stop-word list
    # for every token.
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # A single-token window cannot produce an edge.
    span = max(2, window_size)

    G = nx.Graph()
    for sentence in sent_tokenize(text):
        tokens = [
            token.lower()
            for token in word_tokenize(sentence)
            if token.lower() not in stop_words
            # Drop tokens made up only of punctuation characters; this also
            # catches multi-character tokens such as "``", "''" or "...".
            and not all(ch in punctuation for ch in token)
        ]

        for index, first in enumerate(tokens):
            # Pair each token with the ones that follow it inside the window.
            for second in tokens[index + 1:index + span]:
                if first == second:
                    continue  # no self-loops
                if G.has_edge(first, second):
                    G[first][second]['weight'] += 1
                else:
                    G.add_edge(first, second, weight=1)

    # Nothing to rank (empty text, or no sentence yielded a word pair).
    if G.number_of_nodes() == 0:
        return []

    scores = nx.pagerank(G, weight='weight')
    return sorted(scores, key=scores.get, reverse=True)[:top_n_keywords]

# Demo: extract keywords from a short two-sentence passage and print them.
text = (
    "Natural language processing (NLP) is a subfield of linguistics, "
    "computer science, and artificial intelligence concerned with the "
    "interactions between computers and human language. "
    "In particular, it focuses on programming computers to process and "
    "analyze large amounts of natural language data."
)
keywords = text_rank(text)
print(keywords)






---


import pandas as pd

# Sample input: three short documents, one per row, in a single 'text' column.
data = {
    'text': [
        (
            "Natural language processing (NLP) is a subfield of linguistics, "
            "computer science, and artificial intelligence concerned with the "
            "interactions between computers and human language."
        ),
        (
            "In particular, it focuses on programming computers to process "
            "and analyze large amounts of natural language data."
        ),
        (
            "TextRank is an unsupervised graph-based ranking algorithm for "
            "natural language processing tasks, such as keyword extraction "
            "and text summarization."
        ),
    ]
}
input_df = pd.DataFrame(data)

def process_dataframe(input_df, window_size=2, top_n_keywords=10):
    """Add a 'keywords' column with the text_rank result for each row's 'text'.

    The new column is written onto ``input_df`` itself, and that same
    DataFrame object is returned.

    Args:
        input_df: DataFrame with a 'text' column.
        window_size: Forwarded to ``text_rank`` for every row.
        top_n_keywords: Forwarded to ``text_rank`` for every row.

    Returns:
        ``input_df``, now carrying the 'keywords' column.
    """
    # Series.apply passes these as extra positional arguments after the cell value.
    extra_args = (window_size, top_n_keywords)
    keyword_column = input_df['text'].apply(text_rank, args=extra_args)
    input_df['keywords'] = keyword_column
    return input_df

# Run keyword extraction over the sample DataFrame and display the result.
# process_dataframe writes the 'keywords' column onto input_df and returns that
# same object, so output_df and input_df refer to one DataFrame.
output_df = process_dataframe(input_df)
print(output_df)


