In [12]:
pip install ydata-profiling


Collecting ydata-profiling
  Downloading ydata_profiling-4.12.2-py2.py3-none-any.whl.metadata (20 kB)
Collecting pydantic>=2 (from ydata-profiling)
  Downloading pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting visions<0.8.0,>=0.7.5 (from visions[type_image_path]<0.8.0,>=0.7.5->ydata-profiling)
  Downloading visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting typeguard<5,>=3 (from ydata-profiling)
  Downloading typeguard-4.4.1-py3-none-any.whl.metadata (3.7 kB)
Collecting dacite>=1.8 (from ydata-profiling)
  Downloading dacite-1.8.1-py3-none-any.whl.metadata (15 kB)
Collecting annotated-types>=0.6.0 (from pydantic>=2->ydata-profiling)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.27.2 (from pydantic>=2->ydata-profiling)
  Downloading pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading ydata_profiling-4.12.2-py2.py3-none-any.whl (390 kB)
Downloading dacite-1.8.1-py3-none-any.whl (14 k

In [13]:
from docx import Document
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import BytesIO
from ydata_profiling import ProfileReport

# Load the data
df = pd.read_csv('../data/processed/merged_news.csv')

# Create a Document
doc = Document()

# Add Title
doc.add_heading('Exploratory Data Analysis (EDA) Report - Fake News Detection', 0)

# Add Introduction
doc.add_paragraph(
    "This is an exploratory data analysis (EDA) report for the Fake News Detection dataset. "
    "The dataset contains news articles labeled as either fake (0) or real (1). "
    "We aim to understand the distribution, relationships, and characteristics of the data."
)

# Basic Information about the Data
doc.add_heading('1. Basic Information', level=1)

doc.add_paragraph(f"Number of Rows: {df.shape[0]}")
doc.add_paragraph(f"Number of Columns: {df.shape[1]}")

# Add column names and types
doc.add_paragraph(f"Column Names: {', '.join(df.columns)}")
doc.add_paragraph(f"Data Types:\n{df.dtypes}")

# Basic Statistical Information
doc.add_heading('2. Descriptive Statistics', level=1)
doc.add_paragraph(str(df.describe()))

# Visualizations
doc.add_heading('3. Visualizations', level=1)

# Distribution of Labels
plt.figure(figsize=(6, 4))
sns.countplot(x='label', data=df, palette='viridis')
plt.title('Distribution of Real vs Fake News')
plt.tight_layout()

# Save the plot to a BytesIO object and add it to the Word document
img_stream = BytesIO()
plt.savefig(img_stream, format='png')
img_stream.seek(0)
doc.add_paragraph("Figure 1: Distribution of Real vs Fake News")
doc.add_picture(img_stream)
plt.close()

# Add a section for the profiling report (optional)
doc.add_heading('4. Profiling Report', level=1)

# Generate Profiling Report (Optional)
profile = ProfileReport(df, title="EDA Report - Fake News Detection", explorative=True)
profile.to_file('../reports/eda_report.html')  # Save as HTML

# Add reference to the profiling report
doc.add_paragraph("For more detailed insights, please refer to the automated profiling report: 'eda_report.html'.")

# Save the Document
doc.save('../reports/eda_report.docx')

print("EDA Report saved as 'eda_report.docx'.")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='label', data=df, palette='viridis')


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

EDA Report saved as 'eda_report.docx'.
