In [8]:
# !pip install pandas nltk gensim sumy tqdm
# !pip install sumy



In [10]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
import nltk
from tqdm.notebook import tqdm
import numpy as np

nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to /Users/joshawa-
[nltk_data]     ao/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [20]:
import json

with open("part_1/disrupt_articles.json", "r") as f:
    raw = f.read()

segments = raw.split("][")  # split where JSON arrays are glued together
cleaned = []

for i, s in enumerate(segments):
    if i == 0:
        s = s + "]"
    elif i == len(segments) - 1:
        s = "[" + s
    else:
        s = "[" + s + "]"

    try:
        cleaned.extend(json.loads(s))
    except json.JSONDecodeError as e:
        print(f"Skipping bad segment: {e}")

df = pd.DataFrame(cleaned)
df = df[df['content'].str.strip() != ""]
df.reset_index(drop=True, inplace=True)
df.head()


Unnamed: 0,title,author,date,url,content
0,Step into the spotlight: Apply to speak at Tec...,,2025-03-24T08:17:30-07:00,https://techcrunch.com/2025/03/24/step-into-th...,"Calling all tech innovators, startup fanatics,..."
1,TechCrunch Disrupt 2025 tickets now on sale: L...,,2025-01-24T11:39:20-08:00,https://techcrunch.com/2025/01/24/techcrunch-d...,We’re kicking things off earlier than ever! ...
2,‘Tesla Takedown’ protesters are planning a glo...,,2025-03-28T14:47:15-07:00,https://techcrunch.com/2025/03/28/tesla-takedo...,“Tesla Takedown” organizers have promised thei...
3,Google rolls out new vacation-planning feature...,,2025-03-27T06:00:00-07:00,https://techcrunch.com/2025/03/27/google-rolls...,Google is rolling out a slew of new features —...
4,OpenAI’s viral Studio Ghibli moment highlights...,,2025-03-26T16:23:09-07:00,https://techcrunch.com/2025/03/26/openais-vira...,"It’s only been a day since , and social media..."


In [28]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.tokenizers import Tokenizer

nltk.download('punkt')
nltk.download('vader_lexicon')


[nltk_data] Downloading package punkt to /Users/joshawa-
[nltk_data]     ao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /Users/joshawa-
[nltk_data]     ao/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [30]:
# Initialize tools
sia = SentimentIntensityAnalyzer()
summarizer = TextRankSummarizer()
tokenizer = Tokenizer("english")
keywords = ['disrupt', 'startup', 'tech', 'funding', 'ai', 'innovation', 'launch']

# Helper functions
def summarize(text):
    try:
        parser = PlaintextParser.from_string(text, tokenizer)
        summary = summarizer(parser.document, 2)
        return " ".join(str(s) for s in summary)
    except:
        return text[:250] + "..."

def importance_score(text):
    text_lower = text.lower()
    return sum(text_lower.count(word) for word in keywords)

In [32]:
results = []

for _, row in df.iterrows():
    content = row['content']
    summary = summarize(content)
    sentiment = sia.polarity_scores(content)
    direction = "positive" if sentiment['compound'] > 0 else "negative"
    score = importance_score(content)

    results.append({
        "Title": row['title'],
        "Summary": summary,
        "Importance Score": score,
        "Direction": direction
    })

final_df = pd.DataFrame(results)
final_df.head()


Unnamed: 0,Title,Summary,Importance Score,Direction
0,Step into the spotlight: Apply to speak at Tec...,We’re curating a diverse group of industry exp...,13,positive
1,TechCrunch Disrupt 2025 tickets now on sale: L...,"From now through January 31, take advantage of...",19,positive
2,‘Tesla Takedown’ protesters are planning a glo...,“The reason that [Musk] is in the position tha...,14,positive
3,Google rolls out new vacation-planning feature...,"Starting this week, users can search for somet...",10,positive
4,OpenAI’s viral Studio Ghibli moment highlights...,"In a statement to TechCrunch, an OpenAI spokes...",40,positive
