# How to get started with the Wikimedia data

The [Wikipedia Structured Contents](https://www.kaggle.com/datasets/wikimedia-foundation/wikipedia-structured-contents) dataset on Kaggle contains all articles of the English- and French-language editions of Wikipedia, pre-parsed and output as structured JSON files with a consistent schema. Each JSON line holds the content of one full Wikipedia article stripped of extra markdown and non-prose sections (references, etc.).

The data is divided into multiple large `.jsonl` files. Here we will explore just one of those files: `./wikipedia-structured-contents/enwiki_namespace_0/enwiki_namespace_0_0.jsonl`.

## Initial Setup


In [None]:
%matplotlib inline
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os
from tqdm import tqdm
from datetime import datetime
import re
from wordcloud import WordCloud
import networkx as nx
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords

# Global plotting defaults for the notebook. The calls run in this order, so a
# later call may override settings made by an earlier one.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100
sns.set(style="whitegrid")

# Look up the NLTK stop-word corpus and download it only when the lookup fails.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

## Load Data

In [None]:
def read_jsonl(file_path, max_records=None):
    """Load records from a JSON Lines file into a list.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document
            per line.
        max_records: Maximum number of records to return. ``None`` (the
            default) reads the whole file; zero or a negative number
            returns an empty list.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Compare against None explicitly so that a limit of 0 is honoured
    # instead of being mistaken for "no limit".
    if max_records is not None and max_records <= 0:
        return data
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            # Blank lines (for example a trailing newline at the end of the
            # file) are not records and would make json.loads fail.
            if not line.strip():
                continue
            data.append(json.loads(line))
            # Stop as soon as the limit is reached, without reading one
            # more line first.
            if max_records is not None and len(data) >= max_records:
                break
    return data

In [None]:
import kagglehub

# Download a single file of the dataset; the value returned by kagglehub is
# handed to read_jsonl as the file to open.
file_path = kagglehub.dataset_download(
    "wikimedia-foundation/wikipedia-structured-contents",
    path="enwiki_namespace_0/enwiki_namespace_0_0.jsonl",
)

# Parse the file into a list of records and report how many were read.
data = read_jsonl(file_path)
print(f"Successfully loaded {len(data)} records")

## Preview Data

In [None]:
# Build the DataFrame used by the rest of the notebook from the loaded records
# and print its shape.
df = pd.DataFrame(data)
print(f"Dataset shape: {df.shape}")

# Print every column name as a bullet, one per line.
print("\nColumns in the dataset:")
for col in df.columns:
    print(f"- {col}")

In [None]:
# Show the first rows of the DataFrame.
# NOTE(review): `display` is not defined or imported in this file; presumably
# it is the IPython/Jupyter built-in — confirm before running as a plain script.
print("\nSample data:")
display(df.head())

## Explore Dates

In [None]:
date_fields = ['date_modified']

# Pass 1: parse each listed column to datetimes in place and derive the
# year / month / day / hour / weekday columns from it.
for date_field in date_fields:
    if date_field not in df.columns:
        continue
    try:
        df[date_field] = pd.to_datetime(df[date_field])
        parts = df[date_field].dt
        df[f'{date_field}_year'] = parts.year
        df[f'{date_field}_month'] = parts.month
        df[f'{date_field}_day'] = parts.day
        df[f'{date_field}_hour'] = parts.hour
        df[f'{date_field}_weekday'] = parts.day_name()
    except Exception as e:
        # Best effort: report the failure and move on to the next field.
        print(f"Could not convert {date_field} to datetime: {e}")

# Pass 2: draw one bar chart for each derived column that exists.
for date_field in date_fields:
    if date_field not in df.columns:
        continue

    year_col = f'{date_field}_year'
    month_col = f'{date_field}_month'
    weekday_col = f'{date_field}_weekday'
    hour_col = f'{date_field}_hour'

    # Records per year, in chronological order.
    if year_col in df.columns:
        plt.figure(figsize=(14, 6))
        year_counts = df[year_col].value_counts().sort_index()
        year_counts.plot(kind='bar')
        plt.title(f'Distribution by Year ({date_field})', fontsize=14)
        plt.xlabel('Year', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.xticks(rotation=45)
        plt.show()

    # Records per calendar month, labelled with short month names.
    if month_col in df.columns:
        plt.figure(figsize=(14, 6))
        month_counts = df[month_col].value_counts().sort_index()
        month_names = dict(enumerate(
            ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
            start=1,
        ))
        # Values without a name in the table are kept unchanged as labels.
        month_counts.index = month_counts.index.map(lambda m: month_names.get(m, m))
        month_counts.plot(kind='bar')
        plt.title(f'Distribution by Month ({date_field})', fontsize=14)
        plt.xlabel('Month', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.show()

    # Records per weekday, ordered Monday through Sunday.
    if weekday_col in df.columns:
        plt.figure(figsize=(14, 6))
        day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        weekday_counts = df[weekday_col].value_counts().reindex(day_order)
        weekday_counts.plot(kind='bar')
        plt.title(f'Distribution by Day of Week ({date_field})', fontsize=14)
        plt.xlabel('Day of Week', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.show()

    # Records per hour of the day.
    if hour_col in df.columns:
        plt.figure(figsize=(14, 6))
        hour_counts = df[hour_col].value_counts().sort_index()
        hour_counts.plot(kind='bar')
        plt.title(f'Distribution by Hour of Day ({date_field})', fontsize=14)
        plt.xlabel('Hour', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.show()

## Explore Names

In [None]:
# Analysis for name field
if 'name' in df.columns:

    # Join all non-null names and split them into lowercase word tokens.
    all_names = ' '.join(df['name'].dropna().astype(str))
    words = re.findall(r'\b\w+\b', all_names.lower())

    # Drop English stop words and tokens of one or two characters.
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words and len(word) > 2]
    name_word_freq = Counter(filtered_words)

    # Display top words
    print("Most common words in names:")
    for word, count in name_word_freq.most_common(15):
        print(f"  {word}: {count}")

    # Word cloud for names. WordCloud.generate() raises ValueError when it is
    # given no words, so the plot is skipped when filtering left nothing.
    if filtered_words:
        plt.figure(figsize=(14, 7))
        name_cloud = WordCloud(width=800, height=400,
                               background_color='white',
                               max_words=100).generate(' '.join(filtered_words))
        plt.imshow(name_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Names', fontsize=14)
        plt.show()
    else:
        print("No words available for the name word cloud.")

## Explore Descriptions

In [None]:
# Analysis for description field
if 'description' in df.columns:

    # Collect description strings. A value may be a single string or a list;
    # only the string items of a list are kept, and any other type is ignored.
    descriptions = []
    for desc in df['description'].dropna():
        if isinstance(desc, str):
            descriptions.append(desc)
        elif isinstance(desc, list):
            descriptions.extend([d for d in desc if isinstance(d, str)])

    # Build the stop-word set here so this cell does not depend on another
    # cell having been run first.
    stop_words = set(stopwords.words('english'))

    # Word frequency in descriptions
    all_descriptions = ' '.join(descriptions)
    words = re.findall(r'\b\w+\b', all_descriptions.lower())
    filtered_words = [word for word in words if word not in stop_words and len(word) > 2]
    desc_word_freq = Counter(filtered_words)

    # Display top words
    print("Most common words in descriptions:")
    for word, count in desc_word_freq.most_common(15):
        print(f"  {word}: {count}")

    # Word cloud for descriptions. WordCloud.generate() raises ValueError when
    # it is given no words, so the plot is skipped when filtering left nothing.
    if filtered_words:
        plt.figure(figsize=(14, 7))
        desc_cloud = WordCloud(width=800, height=400,
                               background_color='white',
                               max_words=100).generate(' '.join(filtered_words))
        plt.imshow(desc_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Descriptions', fontsize=14)
        plt.show()
    else:
        print("No words available for the description word cloud.")

    # Description length analysis. Skipped when there are no descriptions,
    # because the mean of an empty list is nan and triggers a warning.
    desc_lengths = [len(d) for d in descriptions]
    if desc_lengths:
        plt.figure(figsize=(14, 6))
        sns.histplot(desc_lengths, bins=20, kde=True)
        plt.title('Distribution of Description Lengths', fontsize=14)
        plt.xlabel('Character Length', fontsize=12)
        plt.ylabel('Frequency', fontsize=12)
        plt.show()

        print(f"Average description length: {np.mean(desc_lengths):.1f} characters")
    else:
        print("No descriptions found.")

## Explore Abstracts

In [None]:
if 'abstract' in df.columns:

    # Collect abstract strings. A value may be a single string or a list;
    # only the string items of a list are kept, and any other type is ignored.
    abstracts = []
    for abstract in df['abstract'].dropna():
        if isinstance(abstract, str):
            abstracts.append(abstract)
        elif isinstance(abstract, list):
            abstracts.extend([a for a in abstract if isinstance(a, str)])

    abstract_lengths = [len(a) for a in abstracts]

    if abstracts:
        # The mean is only computed for non-empty input; the mean of an empty
        # list is nan and triggers a warning.
        print(f"Average abstract length: {np.mean(abstract_lengths):.1f} characters")

        # Build the stop-word set here so this cell does not depend on another
        # cell having been run first.
        stop_words = set(stopwords.words('english'))

        all_abstracts = ' '.join(abstracts)
        words = re.findall(r'\b\w+\b', all_abstracts.lower())
        filtered_words = [word for word in words if word not in stop_words and len(word) > 2]

        # Word cloud for abstracts. WordCloud.generate() raises ValueError
        # when it is given no words, so the plot is skipped in that case.
        if filtered_words:
            plt.figure(figsize=(14, 7))
            abstract_cloud = WordCloud(width=800, height=400,
                                       background_color='white',
                                       max_words=100).generate(' '.join(filtered_words))
            plt.imshow(abstract_cloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Word Cloud of Abstracts', fontsize=14)
            plt.show()
        else:
            print("No words available for the abstract word cloud.")
    else:
        print("No abstracts found.")

## Explore Sections

In [None]:
if 'sections' in df.columns:

    # Per-record section totals and the flat list of all section names.
    # section_counts is filled here but not read again within this cell.
    section_counts = []
    section_names = []

    for sections in df['sections'].dropna():
        # Only list-valued entries are inspected.
        if not isinstance(sections, list):
            continue
        section_counts.append(len(sections))
        # Keep the 'name' of every dict entry that has one.
        section_names.extend(
            entry['name']
            for entry in sections
            if isinstance(entry, dict) and 'name' in entry
        )

    # Chart and table of the 15 most frequent section names.
    if section_names:
        section_name_counts = Counter(section_names)
        top_sections = pd.Series(section_name_counts).nlargest(15)

        plt.figure(figsize=(14, 7))
        top_sections.plot(kind='bar')
        plt.title('Top 15 Section Names', fontsize=14)
        plt.xlabel('Section Name', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.show()

        display(top_sections)


## Explore Images

In [None]:
if 'image' in df.columns:

    # Boolean mask: True for every row whose 'image' value is non-null.
    image_present = df['image'].notna()
    total_records = len(df)
    image_count = image_present.sum()
    # Every row either has an image or not, so the rest is the complement.
    no_image_count = total_records - image_count

    image_pct = image_count / total_records * 100
    no_image_pct = no_image_count / total_records * 100
    print(f"Records with images: {image_count} ({image_pct:.1f}%)")
    print(f"Records without images: {no_image_count} ({no_image_pct:.1f}%)")

    # Two-bar chart comparing both groups.
    bar_labels = ['Has Image', 'No Image']
    bar_heights = [image_count, no_image_count]
    plt.figure(figsize=(10, 6))
    plt.bar(bar_labels, bar_heights, color=['green', 'red'])
    plt.title('Distribution of Records with and without Images', fontsize=14)
    plt.ylabel('Count', fontsize=12)
    plt.show()
