LEVEL-1

In [None]:
# Day 19: 30 Days of Python - File Handling and Text Analysis

import json
import re
from collections import Counter
from difflib import SequenceMatcher

# -----------------------------
#  Exercise Level 1
# -----------------------------

# Count number of lines and words in a text file
def count_lines_and_words(filename):
    """Count the lines and words in a UTF-8 text file.

    A word is a run of word characters. An apostrophe (straight ``'`` or
    typographic U+2019) that joins two such runs keeps them together, so
    "don't" is counted as one word rather than as "don" plus "t".

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(line_count, word_count)`` tuple.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()
    # The file is no longer needed once its content is in memory, so the
    # counting happens outside the ``with`` block.
    words = re.findall(r"\w+(?:['\u2019]\w+)*", text)
    return len(text.splitlines()), len(words)


# Text files whose line and word counts are reported below.
files = [
    './data/obama_speech.txt',
    './data/michelle_obama_speech.txt',
    './data/donald_speech.txt',
    './data/melina_trump_speech.txt'
]

# Print one "<path>: <lines> lines, <words> words" summary per file.
for file in files:
    lines, words = count_lines_and_words(file)
    print(f"{file}: {lines} lines, {words} words")


# -----------------------------
# Most spoken languages
# -----------------------------

def most_spoken_languages(filename, top_n):
    """Rank languages by how many country records list them.

    Args:
        filename: Path of a UTF-8 JSON file holding an iterable of country
            records, each with a ``'languages'`` entry.
        top_n: Number of leading ``(language, count)`` pairs to return.

    Returns:
        A list of ``(language, count)`` tuples, most frequent first.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        records = json.load(handle)
    # Tally every language occurrence across all records in file order.
    tally = Counter(
        language
        for record in records
        for language in record['languages']
    )
    return tally.most_common(top_n)


# Show the language ranking twice: the top 10 entries, then the top 3.
# Each call re-reads and re-parses the JSON file.
print("\nMost spoken languages (top 10):")
print(most_spoken_languages('./data/countries_data.json', 10))

print("\nMost spoken languages (top 3):")
print(most_spoken_languages('./data/countries_data.json', 3))


# -----------------------------
# Most populated countries
# -----------------------------

def most_populated_countries(filename, top_n):
    """Return the ``top_n`` country records with the largest population.

    Args:
        filename: Path of a UTF-8 JSON file holding an iterable of country
            records, each with ``'name'`` and ``'population'`` entries.
        top_n: Number of leading records to keep.

    Returns:
        A list of ``{'country': name, 'population': population}`` dicts,
        ordered from most to least populated.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    def population_of(record):
        return record['population']

    # Sort a copy so the parsed data itself is left in file order.
    ranked = list(records)
    ranked.sort(key=population_of, reverse=True)

    summary = []
    for record in ranked[:top_n]:
        summary.append({'country': record['name'], 'population': record['population']})
    return summary


# Show the population ranking twice: the top 10 entries, then the top 3.
# Each call re-reads and re-parses the JSON file.
print("\nMost populated countries (top 10):")
print(most_populated_countries('./data/countries_data.json', 10))

print("\nMost populated countries (top 3):")
print(most_populated_countries('./data/countries_data.json', 3))


LEVEL-2

In [None]:
# -----------------------------
# Extract all email addresses
# -----------------------------

def extract_emails(filename):
    """Collect every email-like substring found in a UTF-8 text file.

    Args:
        filename: Path of the text file to scan.

    Returns:
        A list of matched address strings in order of appearance;
        duplicates are kept.
    """
    # local-part @ domain . TLD (at least two letters)
    email_pattern = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')
    with open(filename, 'r', encoding='utf-8') as handle:
        content = handle.read()
    return email_pattern.findall(content)


# Extract all addresses once, then print a preview and the total.
emails = extract_emails('./data/email_exchange_big.txt')
print("\nExtracted emails:", emails[:10], '...')  # preview: only the first 10 matches
print("Total emails found:", len(emails))  # counts every match, duplicates included


In [None]:
# -----------------------------
# Most common words in a file
# -----------------------------

def find_most_common_words(filename, n):
    """Return the ``n`` most frequent words in a UTF-8 text file.

    The text is lowercased before counting. A word is a run of word
    characters; an apostrophe (straight ``'`` or typographic U+2019)
    joining two such runs keeps them together, so "don't" is ranked as one
    word instead of contributing the fragments "don" and "t".

    Args:
        filename: Path of the text file to read.
        n: Number of leading entries to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read().lower()
    # Counting happens after the file is closed; only the text is needed.
    words = re.findall(r"\w+(?:['\u2019]\w+)*", text)
    return Counter(words).most_common(n)


# Show the word ranking for sample.txt twice: top 10, then top 5.
# Each call re-reads the file and recounts its words.
print("\nTop 10 words in sample.txt:")
print(find_most_common_words('./data/sample.txt', 10))

print("\nTop 5 words in sample.txt:")
print(find_most_common_words('./data/sample.txt', 5))


In [None]:
# -----------------------------
# Most frequent words in speeches
# -----------------------------

def most_frequent_words(filename, n=10):
    """Return the ``n`` most frequent words of a file (10 by default).

    Thin convenience wrapper that delegates to ``find_most_common_words``.
    """
    ranking = find_most_common_words(filename, n)
    return ranking

# Print the default top-10 word ranking for each of the four speech files.
print("\nObama’s top 10 words:", most_frequent_words('./data/obama_speech.txt'))
print("Michelle’s top 10 words:", most_frequent_words('./data/michelle_obama_speech.txt'))
print("Trump’s top 10 words:", most_frequent_words('./data/donald_speech.txt'))
print("Melina’s top 10 words:", most_frequent_words('./data/melina_trump_speech.txt'))


In [None]:
# -----------------------------
# Text similarity
# -----------------------------

def clean_text(text):
    """Drop everything except ASCII letters and whitespace, then lowercase."""
    return re.sub(r'[^A-Za-z\s]', '', text).lower()


def check_text_similarity(file1, file2, *, autojunk=False):
    """Return the character-level similarity of two text files as a percentage.

    Both files are read as UTF-8, normalised with ``clean_text`` and compared
    with ``difflib.SequenceMatcher``.

    Args:
        file1: Path of the first text file.
        file2: Path of the second text file.
        autojunk: Passed through to ``SequenceMatcher``. It defaults to
            ``False`` because the automatic junk heuristic stops using any
            element that makes up more than about 1% of a sequence of 200+
            items as a match anchor. In character-level prose that covers
            almost every letter, which makes long, clearly similar texts
            score far too low. Pass ``True`` to trade accuracy for speed on
            very long inputs.

    Returns:
        The similarity ratio scaled to 0-100 and rounded to two decimals.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If either file is not valid UTF-8.
    """
    with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
        text1, text2 = clean_text(f1.read()), clean_text(f2.read())
    matcher = SequenceMatcher(None, text1, text2, autojunk=autojunk)
    return round(matcher.ratio() * 100, 2)


# Compare the two speech files and report the result as a percentage.
similarity_score = check_text_similarity('./data/michelle_obama_speech.txt', './data/melina_trump_speech.txt')
print(f"\nText similarity between Michelle and Melina speeches: {similarity_score}%")


In [None]:
# -----------------------------
# Top repeated words in Romeo and Juliet
# -----------------------------

# Reuse the generic word counter on the Romeo and Juliet text file.
print("\nTop 10 words in Romeo and Juliet:")
print(find_most_common_words('./data/romeo_and_juliet.txt', 10))


In [None]:
# -----------------------------
# Hacker News CSV word frequency
# -----------------------------

def count_keyword_occurrences(filename, keyword):
    """Count the lines of a UTF-8 text file that match a regular expression.

    Matching is case-insensitive, and a line counts once no matter how many
    times the pattern occurs in it.

    Args:
        filename: Path of the text file to scan.
        keyword: Regular-expression pattern string to search for.

    Returns:
        The number of matching lines.

    Raises:
        re.error: If ``keyword`` is not a valid pattern.
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Compile once up front: an invalid pattern fails before any I/O and the
    # per-line search no longer goes through the module-level pattern lookup.
    pattern = re.compile(keyword, re.IGNORECASE)
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line into a list.
        return sum(1 for line in f if pattern.search(line))

# Count matching lines for three whole-word, case-insensitive keywords.
# Each call scans the CSV file again from the start.
python_count = count_keyword_occurrences('./data/hacker_news.csv', r'\bpython\b')
js_count = count_keyword_occurrences('./data/hacker_news.csv', r'\bjavascript\b')
# NOTE: the \b after "java" already rejects "javascript" (no word boundary
# before the "s"), so the (?!script) lookahead is redundant but harmless.
java_count = count_keyword_occurrences('./data/hacker_news.csv', r'\bjava\b(?!script)')

# These are line counts, not occurrence counts: a line with several matches
# is counted once.
print("\nLines containing 'Python':", python_count)
print("Lines containing 'JavaScript':", js_count)
print("Lines containing 'Java' but not 'JavaScript':", java_count)
