# In [None]:
import pandas as pd
import datetime as dt
from collections import Counter
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Load the metadata CSV. Forward slashes are used in the path (recommended),
# which avoids backslash-escape problems in Windows paths.
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running on another machine.
df = pd.read_csv("C:/Users/user/Documents/PLP_CLASS/PYTHON_CLASS/metadata.csv")

# Show the first rows of the dataframe. A bare `df.head()` expression is only
# rendered when it is the last statement of a notebook cell, so print it
# explicitly to make the preview visible here.
print(df.head())

# 1. Check DataFrame dimensions (rows, columns)
print("Shape (rows, columns):", df.shape)

# 2. Identify data types of each column
print("\nData types:")
print(df.dtypes)

# 3. Count missing values in the columns that matter most for the analysis.
# The journal column is named "journal" (singular) - the same name used when
# the columns are selected and counted further down in this file.
important_cols = ["authors", "journal"]
print("\nMissing values in important columns:")
try:
    # Only the column lookup can raise KeyError, so keep the try body minimal.
    important_missing = df[important_cols].isnull().sum()
except KeyError as e:
    print(f"Error: {e}. Please check if the columns exist in the DataFrame.")
else:
    print(important_missing)

# 4. Generate basic statistics for numerical columns
print("\nDescriptive statistics (numerical columns):")
print(df.describe())

# 5. Check for missing values in the entire DataFrame
missing = df.isnull().sum()
missing = missing[missing > 0]
print("Columns with missing values:\n", missing)

# Handling missing values
# Informational only: list the columns that are at least 70% populated.
# The columns used for the analysis are chosen explicitly below, so this
# result is reported but not used to drop anything.
well_populated = df.dropna(axis=1, thresh=len(df) * 0.7)
print("Columns after dropping those with >30% missing values:", well_populated.columns)

# Keep only the columns the rest of the analysis reads.
analysis_cols = ["title", "publish_time", "authors", "journal", "abstract", "source_x"]
subset = df[analysis_cols]

# Fill missing values with ONE strategy per column type. Each fill is applied
# to the same frame; re-running fillna on the original `df` for every strategy
# would discard all but the last one.
#   * numeric columns -> 0
#   * text columns    -> "Unknown"
# Forward/backward fill is intentionally not used: it copies values from the
# neighbouring row, and the rows here are unrelated records.
# "publish_time" is left unfilled so it can still be parsed as a date later.
numeric_cols = subset.select_dtypes(include="number").columns
text_cols = subset.columns.difference(numeric_cols).difference(["publish_time"])
fill_values = {col: 0 for col in numeric_cols}
fill_values.update({col: "Unknown" for col in text_cols})
df_clean = subset.fillna(value=fill_values)

# Verify the result ("publish_time" may still report missing entries)
print("Missing values after cleaning:\n", df_clean.isnull().sum())

# Save the first 50,000 rows of the cleaned subset
sample_df = df_clean.head(50000)
sample_df.to_csv("metadata_subset.csv", index=False)


# Lift the row-display limit so printing a DataFrame shows every row
pd.options.display.max_rows = None

# Reload the saved subset from disk and print it in full
df = pd.read_csv("metadata_subset.csv")
print(df)


# Convert to datetime format; values that cannot be parsed become NaT
df["publish_time"] = pd.to_datetime(df["publish_time"], errors="coerce")

# Year component of the publication date (missing where the date is NaT)
df["PublicationYear"] = df["publish_time"].dt.year

# Count whitespace-separated words in each abstract. Missing abstracts are
# treated as empty text so they count as 0 words; converting NaN straight to
# a string would yield "nan" and be counted as one word.
abstract_text = df["abstract"].fillna("").astype(str)
df["abstractWordCount"] = abstract_text.str.split().str.len()

# Print the preview explicitly: a bare expression in the middle of a
# cell/script is not displayed.
print(df[["abstract", "abstractWordCount"]].head())

# Number of publications per year, ordered by year
year_counts = df["PublicationYear"].value_counts()
papers_per_year = year_counts.sort_index()
print(papers_per_year)

# The ten journals with the most publications
journal_counts = df["journal"].value_counts()
top_journals = journal_counts.iloc[:10]
print(top_journals)



# Build one space-separated string from every non-missing title
title_strings = df["title"].dropna().astype(str)
all_titles = " ".join(title_strings.tolist())

# Lower-case the text and extract word tokens (punctuation is left out)
word_pattern = re.compile(r"\b\w+\b")
words = word_pattern.findall(all_titles.lower())

# Tally how often each word occurs
word_counts = Counter()
word_counts.update(words)

# Report the 20 most frequent words
print(word_counts.most_common(20))



# Publications per year, ordered by year
papers_per_year = df["PublicationYear"].value_counts().sort_index()

# Line chart of the yearly counts, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 5))
papers_per_year.plot(kind="line", marker="o", ax=ax)
ax.set_title("Number of Publications Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Publications")
ax.grid(True, linestyle="--", alpha=0.6)

# Write the chart to disk before showing it
fig.savefig("publications_over_time.png", dpi=300, bbox_inches="tight")

plt.show()

# The ten journals with the most publications
top_journals = df["journal"].value_counts().head(10)

# Bar chart of those journals on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
top_journals.plot.bar(color="skyblue", ax=ax)
ax.set_title("Top 10 journals Publishing COVID-19 Research")
ax.set_xlabel("journal")
ax.set_ylabel("Number of Publications")
# Slant the category labels so long names stay readable
plt.xticks(rotation=45, ha="right")
fig.savefig("top_journals.png", dpi=300, bbox_inches="tight")
plt.show()



# Build one space-separated string from every non-missing title
title_strings = df["title"].dropna().astype(str)
all_titles = " ".join(title_strings.tolist())

# Generate the word cloud image from the combined title text
cloud_builder = WordCloud(width=800, height=400, background_color="white")
wc = cloud_builder.generate(all_titles)

# Render the cloud without axes, save it, then show it
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wc, interpolation="bilinear")
ax.axis("off")
ax.set_title("Word Cloud of Paper Titles")
fig.savefig("wordcloud_titles.png", dpi=300, bbox_inches="tight")
plt.show()

# Number of papers contributed by each source
source_counts = df["source_x"].value_counts()

# Bar chart of the per-source counts on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
source_counts.plot.bar(color="lightgreen", ax=ax)
ax.set_title("Distribution of Paper Counts by Source")
ax.set_xlabel("source_x")
ax.set_ylabel("Number of Publications")
# Slant the category labels so long names stay readable
plt.xticks(rotation=45, ha="right")
fig.savefig("source_distribution.png", dpi=300, bbox_inches="tight")
plt.show()
