# In [None]:
# ======================================================
# Part 1: Data Loading and Basic Exploration
# ======================================================

import pandas as pd

# Load the metadata table from the CSV file in the current working directory.
df = pd.read_csv("metadata.csv")

# Print a quick overview of the raw data: dimensions, per-column info,
# a sample of rows, and the default describe() summary.
print("Shape of dataset:", df.shape)
print("\nColumn info:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nFirst 5 rows:\n")
print(df.head())
print("\nSummary statistics for numeric columns:\n")
print(df.describe())

# ======================================================
# Part 2: Data Cleaning and Preparation
# ======================================================

# Report the 20 columns with the most missing values.
print("\nMissing values per column:\n")
print(df.isnull().sum().sort_values(ascending=False).head(20))

# Keep only rows that have a title, an abstract and a publication date.
# The explicit .copy() makes df_clean an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning and never depend on
# whether pandas handed back a view of df.
df_clean = df.dropna(subset=["title", "abstract", "publish_time"]).copy()

# Parse publish_time; values that cannot be parsed become NaT (errors="coerce").
df_clean["publish_time"] = pd.to_datetime(df_clean["publish_time"], errors="coerce")

# Extract the publication year. A single NaT would otherwise upcast the whole
# column to float (2020.0); the nullable Int64 dtype keeps whole-number years
# and represents unparseable dates as <NA>.
df_clean["year"] = df_clean["publish_time"].dt.year.astype("Int64")

# Number of whitespace-separated tokens in each abstract.
df_clean["abstract_word_count"] = df_clean["abstract"].str.split().str.len()

print("\nCleaned dataset shape:", df_clean.shape)

# ======================================================
# Part 3: Data Analysis and Visualization
# ======================================================

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# --- Publications per Year ---
# Count rows per year and order the counts chronologically (by index).
pubs_per_year = df_clean["year"].value_counts().sort_index()

# Draw the counts as a vertical bar chart on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 5))
pubs_per_year.plot.bar(ax=ax, title="Publications per Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Publications")
plt.show()

# --- Top Journals ---
# value_counts() is already sorted by descending frequency, so the first
# ten entries are the ten most frequent journal values.
top_journals = df_clean["journal"].value_counts().iloc[:10]

# Horizontal bars leave room for long journal names on the y axis.
fig, ax = plt.subplots(figsize=(10, 5))
top_journals.plot.barh(ax=ax, title="Top Journals")
ax.set_xlabel("Number of Papers")
ax.set_ylabel("Journal")
plt.show()

# --- Word Cloud of Titles ---
# Concatenate every non-null title into one space-separated text blob.
titles = " ".join(df_clean["title"].dropna())

# Only build and draw the cloud when there is some title text to work with.
if titles:
    wordcloud = WordCloud(background_color="white", width=800, height=400)
    wordcloud.generate(titles)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")  # hide ticks and frame around the rendered image
    ax.set_title("Word Cloud of Paper Titles")
    plt.show()

# --- Distribution by Source ---
# The ten most frequent values of the source_x column, most common first.
top_sources = df_clean["source_x"].value_counts().iloc[:10]

# Vertical bar chart of paper counts per source.
fig, ax = plt.subplots(figsize=(10, 5))
top_sources.plot.bar(ax=ax, title="Top Sources")
ax.set_xlabel("Source")
ax.set_ylabel("Paper Count")
plt.show()
