In [None]:
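# analysis.ipynb — exploratory analysis of publication metadata: yearly counts, top journals, and title word frequencies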

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from wordcloud import WordCloud
import nltk

# Make sure the NLTK English stop-word corpus is available locally.
# nltk.data.find raises LookupError when the resource is missing; catching
# only that keeps unrelated failures (and KeyboardInterrupt) visible instead
# of silently turning them into a download attempt.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
from nltk.corpus import stopwords

sns.set(style="whitegrid")
# Stored as a set so the per-word membership tests further down are cheap.
STOPWORDS = set(stopwords.words('english'))


In [None]:
# Read the cleaned sample from disk, asking pandas to parse the
# publish_time column as dates while loading.
DATA_PATH = "data/cleaned_sample.csv"
df = pd.read_csv(
    DATA_PATH,
    low_memory=False,
    parse_dates=['publish_time'],
)

# Quick sanity check: frame dimensions, then a preview of the first rows.
print("Rows, Cols:", df.shape)
df.head()


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nMissing values per column:")
print(df.isnull().sum())


In [None]:
# Re-parse publish_time once; values that cannot be parsed become NaT
# (errors='coerce'). The parsed series is reused for both columns.
publish_ts = pd.to_datetime(df['publish_time'], errors='coerce')
df['publish_time'] = publish_ts
df['year'] = publish_ts.dt.year
df['year'].head()


In [None]:
# Count publications per year, in chronological order.
# When any publish_time is missing, df['year'] is float (NaN forces float),
# which would render tick labels such as "2020.0". Dropping the missing years
# (value_counts ignores them anyway) and casting to int keeps the counts
# identical while giving clean integer labels.
year_counts = (
    df['year']
    .dropna()
    .astype(int)
    .value_counts()
    .sort_index()
)

# Explicit figure/axes so the labels are attached to this plot only.
fig, ax = plt.subplots(figsize=(10, 4))
year_counts.plot(kind='bar', ax=ax)
ax.set_title('Publications by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Rows with no journal are grouped under the label 'Unknown' before counting,
# then the 20 most frequent journal names are kept.
journal_names = df['journal'].fillna('Unknown')
top_journals = journal_names.value_counts().head(20)

# Horizontal bars: counts on the x axis, journal names on the y axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(y=top_journals.index, x=top_journals.values, ax=ax)
ax.set_xlabel('Count')
ax.set_ylabel('Journal')
ax.set_title('Top 20 Journals')
plt.show()


In [None]:
def common_words(series, n=20, stopwords=None):
    """Return the ``n`` most frequent words in a Series of text.

    Each non-null entry is converted to ``str``, lower-cased and split into
    ``\\w+`` tokens. Tokens of two characters or fewer, and tokens found in
    ``stopwords``, are discarded before counting.

    Parameters
    ----------
    series : pandas.Series
        Text values to analyse; null entries are skipped.
    n : int, default 20
        Number of (word, count) pairs to return.
    stopwords : collection of str, optional
        Lower-case words to exclude. Defaults to the module-level
        ``STOPWORDS`` set when omitted.

    Returns
    -------
    list of (str, int)
        Words with their counts, most frequent first; ties keep the order in
        which the words were first seen.
    """
    if stopwords is None:
        stopwords = STOPWORDS

    counts = Counter()
    # Count title by title rather than joining everything into one large
    # string first; the token stream (and therefore the result) is the same.
    for text in series.dropna().astype(str).str.lower():
        counts.update(
            token
            for token in re.findall(r'\w+', text)
            if len(token) > 2 and token not in stopwords
        )
    return counts.most_common(n)

# Show the 20 most frequent title words (stop words and tokens of two
# characters or fewer are excluded by common_words).
common_words(df['title'], n=20)


In [None]:
# Concatenate every non-null title into one string for WordCloud to tokenize.
all_titles = " ".join(df['title'].dropna().astype(str).tolist())

# WordCloud lays words out using a random number generator; fixing
# random_state makes the image identical on every Restart & Run All.
wc = WordCloud(
    width=800,
    height=400,
    stopwords=STOPWORDS,
    random_state=42,
).generate(all_titles)

plt.figure(figsize=(12,6))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')  # the cloud is an image; axis ticks carry no meaning here
plt.title('Word Cloud: Paper Titles')
plt.show()
